001/*
002 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.operation.xml;
022
023import javax.xml.parsers.DocumentBuilder;
024import javax.xml.xpath.XPathConstants;
025import javax.xml.xpath.XPathExpressionException;
026
027import cascading.flow.FlowProcess;
028import cascading.operation.Function;
029import cascading.operation.FunctionCall;
030import cascading.operation.OperationException;
031import cascading.tuple.Fields;
032import cascading.tuple.Tuple;
033import cascading.util.Pair;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036import org.w3c.dom.Document;
037import org.w3c.dom.NodeList;
038
039/**
040 * XPathParser will extract a value from the passed Tuple argument into a new Tuple field. One field
041 * for every given XPath expression will be created. This function effectively converts an XML document into
042 * a table.
043 * <p/>
044 * If the returned value of the expression is a NodeList, only the first Node is used. The Node is converted to a new
045 * XML document and converted to a String. If only the text values are required, search on the text() nodes, or consider
046 * using {@link XPathGenerator} to handle multiple NodeList values.
047 */
048public class XPathParser extends XPathOperation implements Function<Pair<DocumentBuilder, Tuple>>
049  {
050  /** Field LOG */
051  private static final Logger LOG = LoggerFactory.getLogger( XPathParser.class );
052
053  /**
054   * Constructor XPathParser creates a new XPathParser instance.
055   *
056   * @param fieldDeclaration of type Fields
057   * @param namespaces       of type String[][]
058   * @param paths            of type String...
059   */
060  public XPathParser( Fields fieldDeclaration, String[][] namespaces, String... paths )
061    {
062    super( 1, fieldDeclaration, namespaces, paths );
063
064    if( !fieldDeclaration.isSubstitution() && fieldDeclaration.size() != paths.length )
065      throw new IllegalArgumentException( "declared fields and given xpath expressions are not the same size: " + fieldDeclaration.print() + " paths: " + paths.length );
066    }
067
068  /**
069   * Constructor XPathParser creates a new XPathParser instance.
070   *
071   * @param fieldDeclaration of type Fields
072   * @param paths            of type String...
073   */
074  public XPathParser( Fields fieldDeclaration, String... paths )
075    {
076    super( 1, fieldDeclaration, null, paths );
077
078    if( !fieldDeclaration.isSubstitution() && fieldDeclaration.size() != paths.length )
079      throw new IllegalArgumentException( "declared fields and given xpath expressions are not the same size: " + fieldDeclaration.print() + " paths: " + paths.length );
080    }
081
082  @Override
083  public void operate( FlowProcess flowProcess, FunctionCall<Pair<DocumentBuilder, Tuple>> functionCall )
084    {
085    Tuple tuple = functionCall.getContext().getRhs();
086
087    tuple.clear();
088
089    String argument = functionCall.getArguments().getString( 0 );
090    Document document = parseDocument( functionCall.getContext().getLhs(), argument );
091
092    for( int i = 0; i < getExpressions().size(); i++ )
093      {
094      try
095        {
096        NodeList value = (NodeList) getExpressions().get( i ).evaluate( document, XPathConstants.NODESET );
097
098        if( LOG.isDebugEnabled() )
099          LOG.debug( "xpath: {} was: {}", paths[ i ], value != null && value.getLength() != 0 );
100
101        if( value != null && value.getLength() != 0 )
102          tuple.add( writeAsXML( value.item( 0 ) ) );
103        else
104          tuple.add( "" );
105        }
106      catch( XPathExpressionException exception )
107        {
108        throw new OperationException( "could not evaluate xpath expression: " + paths[ i ], exception );
109        }
110      }
111
112    functionCall.getOutputCollector().add( tuple );
113    }
114  }