001/*
002 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.operation.xml;
022
023import java.beans.ConstructorProperties;
024import java.io.IOException;
025import java.io.StringReader;
026import java.io.StringWriter;
027import java.util.HashMap;
028import java.util.Map;
029
030import cascading.flow.FlowProcess;
031import cascading.operation.BaseOperation;
032import cascading.operation.Function;
033import cascading.operation.FunctionCall;
034import cascading.tuple.Fields;
035import cascading.tuple.Tuple;
036import org.ccil.cowan.tagsoup.HTMLSchema;
037import org.ccil.cowan.tagsoup.Parser;
038import org.ccil.cowan.tagsoup.XMLWriter;
039import org.slf4j.Logger;
040import org.slf4j.LoggerFactory;
041import org.xml.sax.InputSource;
042import org.xml.sax.SAXException;
043import org.xml.sax.SAXNotRecognizedException;
044import org.xml.sax.SAXNotSupportedException;
045
046/**
047 * Class TagSoupParser uses the <a href="http://home.ccil.org/~cowan/XML/tagsoup/">Tag Soup</a> library to convert
048 * incoming HTML to clean XHTML.
049 */
050public class TagSoupParser extends BaseOperation implements Function
051  {
052  /** Field LOG */
053  private static final Logger LOG = LoggerFactory.getLogger( TagSoupParser.class );
054
055  /** Field features */
056  private Map<String, Boolean> features;
057  /** Field schema */
058  private transient HTMLSchema schema;
059  /** Field parser */
060  private transient Parser parser;
061
062  /**
063   * Constructor TagSoupParser creates a new TagSoupParser instance.
064   *
065   * @param fieldDeclaration of type Fields
066   */
067  @ConstructorProperties({"fieldDeclaration"})
068  public TagSoupParser( Fields fieldDeclaration )
069    {
070    super( 1, fieldDeclaration );
071
072    if( fieldDeclaration.size() != 1 )
073      throw new IllegalArgumentException( "fieldDeclaration may only declare one field name: " + fieldDeclaration.print() );
074    }
075
076  private HTMLSchema getSchema()
077    {
078    if( schema == null )
079      schema = new HTMLSchema();
080
081    return schema;
082    }
083
084  private Parser getParser() throws SAXNotSupportedException, SAXNotRecognizedException
085    {
086    if( parser != null )
087      return parser;
088
089    parser = new Parser();
090    parser.setProperty( Parser.schemaProperty, getSchema() );
091
092    if( features != null )
093      {
094      for( Map.Entry<String, Boolean> entry : features.entrySet() )
095        parser.setFeature( entry.getKey(), entry.getValue() );
096      }
097
098    return parser;
099    }
100
101  /**
102   * Method setFeature allows the user to set 'features' directly on the TagSoup parser, {@link Parser#setFeature}.
103   * <p/>
104   * Note, all features are lazily added when the Parser is instantiated.
105   *
106   * @param feature of type String
107   * @param value   of type boolean
108   */
109  public void setFeature( String feature, boolean value )
110    {
111    if( features == null )
112      features = new HashMap<String, Boolean>();
113
114    features.put( feature, value );
115    }
116
117  /** @see cascading.operation.Function#operate(cascading.flow.FlowProcess, cascading.operation.FunctionCall) */
118  public void operate( FlowProcess flowProcess, FunctionCall functionCall )
119    {
120    try
121      {
122      StringWriter writer = new StringWriter();
123      XMLWriter xmlWriter = new XMLWriter( writer );
124
125      xmlWriter.setPrefix( getSchema().getURI(), "" );
126      xmlWriter.setOutputProperty( XMLWriter.OMIT_XML_DECLARATION, "yes" );
127
128      InputSource source = new InputSource( new StringReader( (String) functionCall.getArguments().getObject( 0 ) ) );
129
130      getParser().setContentHandler( xmlWriter );
131
132      getParser().parse( source );
133
134      functionCall.getOutputCollector().add( new Tuple( writer.getBuffer().toString() ) );
135      }
136    catch( SAXNotRecognizedException exception )
137      {
138      LOG.warn( "ignoring TagSoup exception", exception );
139      }
140    catch( SAXNotSupportedException exception )
141      {
142      LOG.warn( "ignoring TagSoup exception", exception );
143      }
144    catch( IOException exception )
145      {
146      LOG.warn( "ignoring TagSoup exception", exception );
147      }
148    catch( SAXException exception )
149      {
150      LOG.warn( "ignoring TagSoup exception", exception );
151      }
152    }
153
154  @Override
155  public boolean equals( Object object )
156    {
157    if( this == object )
158      return true;
159    if( !( object instanceof TagSoupParser ) )
160      return false;
161    if( !super.equals( object ) )
162      return false;
163
164    TagSoupParser that = (TagSoupParser) object;
165
166    if( features != null ? !features.equals( that.features ) : that.features != null )
167      return false;
168
169    return true;
170    }
171
172  @Override
173  public int hashCode()
174    {
175    int result = super.hashCode();
176    result = 31 * result + ( features != null ? features.hashCode() : 0 );
177    return result;
178    }
179  }