package com.k_int.discover.util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.xml.transform.TransformerException;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xpath.XPathAPI;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.NodeIterator;

import com.k_int.discover.datamodel.ParsedDocumentElement;

/**
 * A utility class to hold the different methods for processing of data
 * in lists of ParsedDocumentElement objects for use throughout the
 * handlers.
 *
 * @author rpb rich@k-int.com
 * @version 1.0 22.10.09
 */
public final class ParsedDocumentElementUtils {

    private static final Log log = LogFactory.getLog(ParsedDocumentElementUtils.class);

    /** Matches anything that looks like a markup tag; compiled once instead of once per node. */
    private static final Pattern MARKUP_TAG = Pattern.compile("\\<.*?\\>");

    /** Attribute read by {@link #workoutPrimaryIdentifierFromAttribute} when the caller names none. */
    private static final String DEFAULT_IDENTIFIER_ATTRIBUTE = "identifier";

    /**
     * Selects the nodes matching an XPath expression and wraps each of them in a
     * {@link ParsedDocumentElement} carrying the node's attributes and its cleaned-up text.
     *
     * <p>The text is HTML-unescaped twice (some sources encode their content more than once)
     * and every tag-like sequence left afterwards is replaced by a single space.
     *
     * @param metadata_record the context node the XPath is evaluated against
     * @param xpath the XPath expression selecting the nodes of interest
     * @param namespace_node the node used to resolve namespace prefixes in the XPath
     * @return one element per selected node, in iteration order; never null
     * @throws javax.xml.transform.TransformerException if the XPath cannot be evaluated
     */
    public static List<ParsedDocumentElement> getValues(Node metadata_record, String xpath, Node namespace_node)
            throws javax.xml.transform.TransformerException {
        List<ParsedDocumentElement> returnValue = new ArrayList<ParsedDocumentElement>();

        // Go and get the node(s) we're interested in
        NodeIterator nodeList = XPathAPI.selectNodeIterator(metadata_record, xpath, namespace_node);

        Node actualNode;
        while ((actualNode = nodeList.nextNode()) != null) {
            ParsedDocumentElement thisElement = new ParsedDocumentElement(actualNode);

            // Copy across every attribute that has both a name and a value
            if (actualNode.hasAttributes()) {
                NamedNodeMap attributes = actualNode.getAttributes();
                for (int ctr = 0; ctr < attributes.getLength(); ctr++) {
                    Node thisAttr = attributes.item(ctr);
                    if (thisAttr == null) {
                        continue;
                    }
                    String attrName = thisAttr.getNodeName();
                    String attrVal = thisAttr.getNodeValue();
                    if (attrName != null && attrVal != null) {
                        thisElement.addAttribute(attrName, attrVal);
                    }
                }
            }

            // Get the contents of the element, if any
            String contents = extractText(actualNode);
            if (contents != null) {
                // Unescape twice since there are some places that encode multiple times,
                // and then remove any HTML that the unescaping has exposed
                contents = StringEscapeUtils.unescapeHtml4(StringEscapeUtils.unescapeHtml4(contents));
                contents = MARKUP_TAG.matcher(contents).replaceAll(" ");
            }
            thisElement.setContents(contents);

            returnValue.add(thisElement);
        }

        return returnValue;
    }

    /**
     * Returns the text of the first node selected by the XPath that yields a non-empty value.
     *
     * <p>A failure to evaluate the XPath is logged and not propagated.
     *
     * @param metadata_record the context node the XPath is evaluated against
     * @param xpath the XPath expression selecting the candidate nodes
     * @param namespace_node the node used to resolve namespace prefixes in the XPath
     * @return the first non-empty text found; otherwise whatever the last node examined
     *         produced, which may be null or the empty string
     */
    public static String getValueFromXPath(Node metadata_record, String xpath, Node namespace_node) {
        String value = null;
        try {
            NodeIterator nodeList = XPathAPI.selectNodeIterator(metadata_record, xpath, namespace_node);

            // Keep going until we find a node with a value or run out of nodes
            Node node;
            while ((value == null || value.isEmpty()) && (node = nodeList.nextNode()) != null) {
                value = extractText(node);
            }
        } catch (TransformerException e) {
            log.error("TransformerException thrown while getting value of xpath", e);
        }
        return value;
    }

    /**
     * Extracts the trimmed text value of a node.
     *
     * <p>For an attribute node the attribute's own value is used; for any other node the
     * value of the node selected by {@code ./text()} is used.
     *
     * @param n the node to read; may be null
     * @return the trimmed text, or null when the node is null, has no text node, has a null
     *         value, or the text lookup failed (the failure is logged)
     */
    public static String extractText(Node n) {
        if (n == null) {
            return null;
        }

        Node node = null;
        if (n.getNodeType() == Node.ATTRIBUTE_NODE) {
            node = n;
        } else {
            try {
                node = XPathAPI.selectSingleNode(n, "./text()");
                if (node != null) {
                    node.normalize();
                }
            } catch (TransformerException te) {
                // Report through the class logger like every other failure in this class
                log.error("TransformerException thrown while extracting the text of a node", te);
            }
        }

        if (node == null) {
            return null;
        }
        String text = node.getNodeValue();
        return (text == null) ? null : text.trim();
    }

    /**
     * Collects the distinct, non-blank contents of the elements, preserving first-seen order.
     *
     * @param parsedElements the elements to read; may be null
     * @return the trimmed contents; never null
     */
    public static Set<String> getElementContentsAsSet(List<ParsedDocumentElement> parsedElements) {
        return getElementContentsAsSet(parsedElements, null);
    }

    /**
     * Collects the distinct, non-blank contents of the elements, preserving first-seen order,
     * optionally prefixing each value with the value of one of the element's attributes.
     *
     * <p>When {@code attribute} is given and the element has a non-blank value for it, the
     * entry becomes {@code "<attribute value>: <content>"}. Null elements are skipped.
     *
     * @param parsedElements the elements to read; may be null
     * @param attribute name of the attribute whose value prefixes the content, or null for none
     * @return the trimmed (and possibly prefixed) contents; never null
     */
    public static Set<String> getElementContentsAsSet(List<ParsedDocumentElement> parsedElements, String attribute) {
        Set<String> contentsList = new LinkedHashSet<String>();
        if (parsedElements == null) {
            return contentsList;
        }

        for (ParsedDocumentElement nextElement : parsedElements) {
            if (nextElement == null) {
                continue;
            }
            String content = nextElement.getContents();
            if (content == null || content.trim().isEmpty()) {
                continue;
            }

            // Do we need to prefix the content with the value of an attribute
            if (attribute != null && nextElement.getAttributes() != null) {
                String attributeValue = nextElement.getAttributes().get(attribute);
                if (attributeValue != null && !attributeValue.trim().isEmpty()) {
                    content = attributeValue.trim() + ": " + content;
                }
            }

            contentsList.add(content.trim());
        }
        return contentsList;
    }

    /**
     * Collects the non-blank contents of the elements in list order, keeping duplicates.
     * Null elements are skipped.
     *
     * @param parsedElements the elements to read; may be null
     * @return the trimmed contents; never null
     */
    public static List<String> getElementContentsAsList(List<ParsedDocumentElement> parsedElements) {
        List<String> contentsList = new ArrayList<String>();
        if (parsedElements == null) {
            return contentsList;
        }

        for (ParsedDocumentElement nextElement : parsedElements) {
            if (nextElement == null) {
                continue;
            }
            String content = nextElement.getContents();
            if (content != null && !content.trim().isEmpty()) {
                contentsList.add(content.trim());
            }
        }
        return contentsList;
    }

    /**
     * Joins the distinct, non-blank contents of the elements with newlines.
     *
     * @param parsedElements the elements to read; may be null
     * @return the joined contents, or the empty string when there are none
     */
    public static String getAllElementContentsAsString(List<ParsedDocumentElement> parsedElements) {
        return getAllElementContentsAsString(parsedElements, null);
    }

    /**
     * Joins the distinct, non-blank contents of the elements with newlines, optionally
     * prefixing each value as described by {@link #getElementContentsAsSet(List, String)}.
     *
     * @param parsedElements the elements to read; may be null
     * @param attribute name of the attribute whose value prefixes the content, or null for none
     * @return the joined contents, or the empty string when there are none
     */
    public static String getAllElementContentsAsString(List<ParsedDocumentElement> parsedElements, String attribute) {
        StringBuilder returnBuilder = new StringBuilder();
        Iterator<String> contentIter = getElementContentsAsSet(parsedElements, attribute).iterator();
        while (contentIter.hasNext()) {
            returnBuilder.append(contentIter.next());
            if (contentIter.hasNext()) {
                returnBuilder.append("\n");
            }
        }
        return returnBuilder.toString();
    }

    /**
     * Collects the value of the named attribute from every element that has one.
     *
     * @param parsedElements the elements to read; may be null
     * @param attributeName the attribute to read; a null or blank name yields an empty list
     * @return the attribute values in list order, untrimmed; never null
     */
    public static List<String> getElementAttributeAsList(List<ParsedDocumentElement> parsedElements, String attributeName) {
        List<String> attrList = new ArrayList<String>();
        if (parsedElements == null || attributeName == null || attributeName.trim().isEmpty()) {
            return attrList;
        }

        for (ParsedDocumentElement nextElement : parsedElements) {
            if (nextElement == null) {
                continue;
            }
            String attributeValue = nextElement.getAttribute(attributeName);
            if (attributeValue != null) {
                attrList.add(attributeValue);
            }
        }
        return attrList;
    }

    /**
     * Returns the first non-blank value found while walking the elements in order.
     *
     * <p>For each element the content is tried first (when {@code wantElementContent} is
     * true) and then the named attribute (when {@code attributeName} is non-blank).
     *
     * @param parsedElements the elements to read; may be null
     * @param attributeName the attribute to fall back to, or null/blank to ignore attributes
     * @param wantElementContent whether element content is an acceptable source of the value
     * @return the trimmed value, or null when no element supplies one
     */
    public static String getElementContentOrAttributeAsString(List<ParsedDocumentElement> parsedElements,
            String attributeName, boolean wantElementContent) {
        if (parsedElements == null) {
            return null;
        }

        // Are we after an attribute value as well
        boolean wantAttribute = (attributeName != null) && !attributeName.trim().isEmpty();

        // Return as soon as any element gives us a value
        for (ParsedDocumentElement nextElement : parsedElements) {
            if (nextElement == null) {
                continue;
            }

            if (wantElementContent) {
                String content = nextElement.getContents();
                if (content != null && !content.trim().isEmpty()) {
                    return content.trim();
                }
            }

            // Nothing usable in the content, so try the element's attribute
            if (wantAttribute) {
                Map<String, String> attributes = nextElement.getAttributes();
                if (attributes != null) {
                    String attributeValue = attributes.get(attributeName);
                    if (attributeValue != null && !attributeValue.trim().isEmpty()) {
                        return attributeValue.trim();
                    }
                }
            }
        }
        return null;
    }

    /**
     * Returns the first non-blank value of the named attribute among the elements.
     *
     * @param parsedElements the elements to read; may be null
     * @param attributeName the attribute to read
     * @return the trimmed attribute value, or null when none is found
     */
    public static String getElementAttributeAsString(List<ParsedDocumentElement> parsedElements, String attributeName) {
        return getElementContentOrAttributeAsString(parsedElements, attributeName, false);
    }

    /**
     * Returns the first non-blank content among the elements.
     *
     * @param parsedElements the elements to read; may be null
     * @return the trimmed content, or null when none is found
     */
    public static String getElementContentsAsString(List<ParsedDocumentElement> parsedElements) {
        return getElementContentOrAttributeAsString(parsedElements, null, true);
    }

    /**
     * Parses the first non-blank content among the elements as a Double.
     *
     * @param parsedElements the elements to read; may be null
     * @return the parsed value, or null when there is no content or it is not a valid
     *         number (the parse failure is logged)
     */
    public static Double getElementContentsAsDouble(List<ParsedDocumentElement> parsedElements) {
        Double result = null;
        String value = getElementContentOrAttributeAsString(parsedElements, null, true);
        if (value != null && !value.isEmpty()) {
            try {
                result = Double.valueOf(value);
            } catch (NumberFormatException e) {
                log.error("Failed to parse \"" + value + "\" as a Double", e);
            }
        }
        return result;
    }

    /**
     * Workout the identifier in the list that should be used as the primary identifier
     * - this is the identifier that starts with 'http'. If there isn't one then just use the first.
     * The candidate identifiers are the non-blank contents of the elements.
     *
     * @param identifiers the list of possible identifiers; may be null
     * @return a map holding at most one entry: the primary identifier mapped to the set of
     *         all the other identifiers; empty when no identifier is available
     */
    public static Map<String, Set<String>> workoutPrimaryIdentifier(List<ParsedDocumentElement> identifiers) {
        if (identifiers == null || identifiers.isEmpty()) {
            return new HashMap<String, Set<String>>();
        }
        return workoutPrimaryIdentifierFromList(getElementContentsAsList(identifiers));
    }

    /**
     * Workout the primary identifier as {@link #workoutPrimaryIdentifier(List)} does, but
     * taking the candidate identifiers from an attribute of the elements rather than
     * from their contents.
     *
     * @param identifiers the elements carrying the possible identifiers; may be null
     * @param attribute the attribute holding the identifier; when null or blank the
     *        attribute named "identifier" is read
     * @return a map holding at most one entry: the primary identifier mapped to the set of
     *         all the other identifiers; empty when no identifier is available
     */
    public static Map<String, Set<String>> workoutPrimaryIdentifierFromAttribute(List<ParsedDocumentElement> identifiers,
            String attribute) {
        if (identifiers == null || identifiers.isEmpty()) {
            return new HashMap<String, Set<String>>();
        }

        // Honour the attribute the caller asked for; it used to be ignored in favour of a fixed name
        String attributeName = (attribute == null || attribute.trim().isEmpty())
                ? DEFAULT_IDENTIFIER_ATTRIBUTE
                : attribute;
        return workoutPrimaryIdentifierFromList(getElementAttributeAsList(identifiers, attributeName));
    }

    /**
     * Picks the primary identifier out of a list of identifier strings: the first one that
     * starts with 'http' (ignoring case), or failing that the first non-null one.
     *
     * @param identifiers the possible identifiers; may be null
     * @return a map holding at most one entry: the primary identifier mapped to every other
     *         entry of the list (distinct, in list order, including a null entry if the
     *         list had one); empty when the list is null, empty or holds only nulls
     */
    public static Map<String, Set<String>> workoutPrimaryIdentifierFromList(List<String> identifiers) {
        Map<String, Set<String>> returnValue = new HashMap<String, Set<String>>();
        if (identifiers == null || identifiers.isEmpty()) {
            return returnValue;
        }

        String primaryId = null;
        for (String thisId : identifiers) {
            if (thisId == null) {
                continue;
            }
            // Remember the first identifier in case we don't get one that starts with 'http'
            if (primaryId == null) {
                primaryId = thisId;
            }
            // An identifier starting with 'http' wins outright
            if (thisId.toLowerCase().startsWith("http")) {
                primaryId = thisId;
                break;
            }
        }

        if (primaryId == null) {
            log.error("Null primary identifier even though we had some identifiers to work with!");
            return returnValue;
        }

        // Everything except the primary identifier is returned as the other identifiers
        Set<String> otherIdentifiers = new LinkedHashSet<String>(identifiers);
        otherIdentifiers.remove(primaryId);
        returnValue.put(primaryId, otherIdentifiers);
        return returnValue;
    }
}