diff --git a/ext/java/nokogiri/Html4Document.java b/ext/java/nokogiri/Html4Document.java index b0de1adf1e..f31262b505 100644 --- a/ext/java/nokogiri/Html4Document.java +++ b/ext/java/nokogiri/Html4Document.java @@ -141,7 +141,7 @@ public class Html4Document extends XmlDocument public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) { - HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]); + HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[3], args[2]); ctx.setIOInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } @@ -150,7 +150,7 @@ public class Html4Document extends XmlDocument public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) { - HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]); + HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[3], args[2]); ctx.setStringInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } diff --git a/ext/java/nokogiri/Html4SaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java index ed34e5b313..c409959200 100644 --- a/ext/java/nokogiri/Html4SaxParserContext.java +++ b/ext/java/nokogiri/Html4SaxParserContext.java @@ -1,5 +1,8 @@ package nokogiri; +import nokogiri.internals.*; +import static nokogiri.internals.NokogiriHelpers.rubyStringToString; + import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.charset.Charset; @@ -8,7 +11,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.xerces.parsers.AbstractSAXParser; import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; @@ -16,12 +18,11 @@ import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; +import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.xml.sax.SAXException; -import nokogiri.internals.NokogiriHandler; -import static nokogiri.internals.NokogiriHelpers.rubyStringToString; /** * Class for Nokogiri::HTML4::SAX::ParserContext. @@ -31,7 +32,7 @@ * @author Yoko Harada */ @JRubyClass(name = "Nokogiri::HTML4::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext") -public class Html4SaxParserContext extends XmlSaxParserContext +public class Html4SaxParserContext extends SaxParserContext { private static final long serialVersionUID = 1L; @@ -50,7 +51,7 @@ public class Html4SaxParserContext extends XmlSaxParserContext } @Override - protected AbstractSAXParser + protected SAXParser createParser() throws SAXException { SAXParser parser = new SAXParser(); @@ -279,11 +280,69 @@ static EncodingType get(final int ordinal) return ctx; } - @Override - protected void - preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) + protected Options + defaultParseOptions(ThreadContext context) + { + return new ParserContext.Options( + RubyFixnum.fix2long(Helpers.invoke(context, + ((RubyClass)context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions")) + .getConstant("DEFAULT_HTML"), + "to_i")) + ); + } + + @JRubyMethod + public IRubyObject + parse_with(ThreadContext context, IRubyObject rubyParser) + { + return super.parse_with(context, rubyParser); + } + + @JRubyMethod(name = "replace_entities=") + public IRubyObject + set_replace_entities(ThreadContext context, IRubyObject value) + { + replaceEntities = value.isTrue(); + return this; + } + + @JRubyMethod(name = "replace_entities") + public IRubyObject + get_replace_entities(ThreadContext context) + { + return context.runtime.newBoolean(replaceEntities); + } + + @JRubyMethod(name = "recovery=") + public IRubyObject + set_recovery(ThreadContext context, IRubyObject value) + { + recovery = value.isTrue(); + return this; + } + + @JRubyMethod(name = "recovery") + public IRubyObject + get_recovery(ThreadContext context) + { + return context.runtime.newBoolean(recovery); + } + + @JRubyMethod(name = "column") + public IRubyObject + column(ThreadContext context) { - // this function is meant to be empty. It overrides the one in XmlSaxParserContext + final Integer number = handler.getColumn(); + if (number == null) { return context.getRuntime().getNil(); } + return RubyFixnum.newFixnum(context.getRuntime(), number.longValue()); } + @JRubyMethod(name = "line") + public IRubyObject + line(ThreadContext context) + { + final Integer number = handler.getLine(); + if (number == null) { return context.getRuntime().getNil(); } + return RubyFixnum.newFixnum(context.getRuntime(), number.longValue()); + } } diff --git a/ext/java/nokogiri/Html4SaxPushParser.java b/ext/java/nokogiri/Html4SaxPushParser.java index c338fd3696..d77db393fa 100644 --- a/ext/java/nokogiri/Html4SaxPushParser.java +++ b/ext/java/nokogiri/Html4SaxPushParser.java @@ -4,6 +4,7 @@ import nokogiri.internals.NokogiriBlockingQueueInputStream; import nokogiri.internals.NokogiriHelpers; import nokogiri.internals.ParserContext; +import nokogiri.internals.SaxParserContext; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyObject; @@ -146,7 +147,7 @@ public class Html4SaxPushParser extends RubyObject stream = new NokogiriBlockingQueueInputStream(); assert saxParser != null : "saxParser null"; - parserTask = new ParserTask(context, saxParser, stream); + parserTask = new ParserTask(context, saxParser, parse(context.runtime, stream), stream); futureTask = new FutureTask((Callable) parserTask); executor = Executors.newSingleThreadExecutor(new ThreadFactory() { @Override @@ -192,22 +193,12 @@ public Thread newThread(Runnable r) { return Html4SaxParserContext.parse_stream(runtime, klazz, stream); } - static class ParserTask extends XmlSaxPushParser.ParserTask /* */ + static class ParserTask extends SaxParserContext.ParserTask { - private - ParserTask(ThreadContext context, IRubyObject handler, InputStream stream) - { - super(context, handler, parse(context.runtime, stream), stream); - } - - @Override - public Html4SaxParserContext - call() throws Exception + ParserTask(ThreadContext context, IRubyObject handler, Html4SaxParserContext parser, InputStream stream) { - return (Html4SaxParserContext) super.call(); + super(context, handler, parser, stream); } - } - } diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 3141ae28c4..63ab80cf58 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -352,7 +352,7 @@ private static class DocumentBuilderFactoryHolder public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) { - XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]); + XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[3], args[2]); ctx.setIOInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } @@ -361,7 +361,7 @@ private static class DocumentBuilderFactoryHolder public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) { - XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]); + XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[3], args[2]); ctx.setStringInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } diff --git a/ext/java/nokogiri/XmlSaxParserContext.java b/ext/java/nokogiri/XmlSaxParserContext.java index 53ea7383b0..aacd18b437 100644 --- a/ext/java/nokogiri/XmlSaxParserContext.java +++ b/ext/java/nokogiri/XmlSaxParserContext.java @@ -1,7 +1,6 @@ package nokogiri; import nokogiri.internals.*; -import org.apache.xerces.parsers.AbstractSAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyFixnum; @@ -12,7 +11,6 @@ import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.xml.sax.SAXException; -import org.xml.sax.SAXParseException; import java.io.IOException; import java.io.InputStream; @@ -26,7 +24,7 @@ * @author Yoko Harada */ @JRubyClass(name = "Nokogiri::XML::SAX::ParserContext") -public class XmlSaxParserContext extends ParserContext +public class XmlSaxParserContext extends SaxParserContext { private static final long serialVersionUID = 1L; @@ -36,15 +34,6 @@ public class XmlSaxParserContext extends ParserContext "http://xml.org/sax/features/namespace-prefixes"; protected static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd"; - protected static final String FEATURE_CONTINUE_AFTER_FATAL_ERROR = - "http://apache.org/xml/features/continue-after-fatal-error"; - - protected AbstractSAXParser parser; - - protected NokogiriHandler handler; - protected NokogiriErrorHandler errorHandler; - private boolean replaceEntities = true; - private boolean recovery = false; public XmlSaxParserContext(final Ruby ruby, RubyClass rubyClass) @@ -52,32 +41,13 @@ public class XmlSaxParserContext extends ParserContext super(ruby, rubyClass); } - protected void - initialize(Ruby runtime) - { - try { - parser = createParser(); - } catch (SAXException se) { - // Unexpected failure in XML subsystem - RaiseException ex = runtime.newRuntimeError(se.toString()); - ex.initCause(se); - throw ex; - } - } - - /** - * Create and return a copy of this object. - * - * @return a clone of this object - */ - @Override - public Object - clone() throws CloneNotSupportedException + private static XmlSaxParserContext + newInstance(final Ruby runtime, final RubyClass klazz) { - return super.clone(); + return (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz); } - protected AbstractSAXParser + protected XmlSaxParser createParser() throws SAXException { XmlSaxParser parser = new XmlSaxParser(); @@ -157,49 +127,7 @@ public class XmlSaxParserContext extends ParserContext return ctx; } - private static XmlSaxParserContext - newInstance(final Ruby runtime, final RubyClass klazz) - { - return (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz); - } - - public final NokogiriHandler - getNokogiriHandler() { return handler; } - - public final NokogiriErrorHandler - getNokogiriErrorHandler() { return errorHandler; } - - /** - * Perform any initialization prior to parsing with the handler - * handlerRuby. Convenience hook for subclasses. - */ - protected void - preParse(Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) - { - ((XmlSaxParser) parser).setXmlDeclHandler(handler); - if (recovery) { - try { - parser.setFeature(FEATURE_CONTINUE_AFTER_FATAL_ERROR, true); - } catch (Exception e) { - // Unexpected failure in XML subsystem - throw runtime.newRuntimeError(e.getMessage()); - } - } - } - - protected void - postParse(Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) - { - // noop - } - - protected void - do_parse() throws SAXException, IOException - { - parser.parse(getInputSource()); - } - - protected static Options + protected Options defaultParseOptions(ThreadContext context) { return new ParserContext.Options( @@ -210,68 +138,27 @@ public class XmlSaxParserContext extends ParserContext ); } - @JRubyMethod - public IRubyObject - parse_with(ThreadContext context, IRubyObject handlerRuby) + protected void + parseSetup(ThreadContext context, IRubyObject rubyParser) { - final Ruby runtime = context.getRuntime(); - - if (!invoke(context, handlerRuby, "respond_to?", runtime.newSymbol("document")).isTrue()) { - throw runtime.newArgumentError("argument must respond_to document"); - } - - /* TODO: how should we pass in parse options? */ - ParserContext.Options options = defaultParseOptions(context); - - errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning); - handler = new NokogiriHandler(runtime, handlerRuby, errorHandler); - - preParse(runtime, handlerRuby, handler); - parser.setContentHandler(handler); - parser.setErrorHandler(handler); - parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options)); + parser.setXmlDeclHandler(handler); - try { - parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler); - } catch (Exception ex) { - throw runtime.newRuntimeError("Problem while creating XML SAX Parser: " + ex.toString()); - } - - try { + if (recovery) { try { - do_parse(); - } catch (SAXParseException ex) { - // A bad document () should call the - // error handler instead of raising a SAX exception. - - // However, an EMPTY document should raise a RuntimeError. - // This is a bit kludgy, but AFAIK SAX doesn't distinguish - // between empty and bad whereas Nokogiri does. - String message = ex.getMessage(); - if (message != null && message.contains("Premature end of file.") && stringDataSize < 1) { - throw runtime.newRuntimeError("couldn't parse document: " + message); - } - handler.error(ex); + parser.setFeature(FEATURE_CONTINUE_AFTER_FATAL_ERROR, true); + } catch (SAXException e) { + throw context.runtime.newRuntimeError(e.getMessage()); } - } catch (SAXException ex) { - // Unexpected failure in XML subsystem - throw runtime.newRuntimeError(ex.getMessage()); - } catch (IOException ex) { - throw runtime.newIOErrorFromException(ex); } + } - postParse(runtime, handlerRuby, handler); - - return runtime.getNil(); + @JRubyMethod + public IRubyObject + parse_with(ThreadContext context, IRubyObject rubyParser) + { + return super.parse_with(context, rubyParser); } - /** - * Can take a boolean assignment. - * - * @param context - * @param value - * @return - */ @JRubyMethod(name = "replace_entities=") public IRubyObject set_replace_entities(ThreadContext context, IRubyObject value) @@ -287,13 +174,6 @@ public class XmlSaxParserContext extends ParserContext return context.runtime.newBoolean(replaceEntities); } - /** - * Can take a boolean assignment. - * - * @param context - * @param value - * @return - */ @JRubyMethod(name = "recovery=") public IRubyObject set_recovery(ThreadContext context, IRubyObject value) diff --git a/ext/java/nokogiri/XmlSaxPushParser.java b/ext/java/nokogiri/XmlSaxPushParser.java index 26261a33e3..26cb118f29 100644 --- a/ext/java/nokogiri/XmlSaxPushParser.java +++ b/ext/java/nokogiri/XmlSaxPushParser.java @@ -1,9 +1,12 @@ package nokogiri; -import nokogiri.internals.*; +import nokogiri.internals.ClosedStreamException; +import nokogiri.internals.NokogiriBlockingQueueInputStream; +import nokogiri.internals.NokogiriHelpers; +import nokogiri.internals.ParserContext; +import nokogiri.internals.SaxParserContext; import org.jruby.Ruby; import org.jruby.RubyClass; -import org.jruby.RubyException; import org.jruby.RubyObject; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; @@ -14,7 +17,6 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.List; import java.util.concurrent.*; import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; @@ -168,7 +170,7 @@ public class XmlSaxPushParser extends RubyObject stream = new NokogiriBlockingQueueInputStream(); assert saxParser != null : "saxParser null"; - parserTask = new ParserTask(context, saxParser, stream); + parserTask = new ParserTask(context, saxParser, parse(context.runtime, stream), stream); futureTask = new FutureTask(parserTask); executor = Executors.newSingleThreadExecutor(new ThreadFactory() { @Override @@ -233,56 +235,12 @@ public Thread newThread(Runnable r) { return XmlSaxParserContext.parse_stream(runtime, klazz, stream); } - static class ParserTask extends ParserContext.ParserTask + static class ParserTask extends SaxParserContext.ParserTask { - - final InputStream stream; - private - ParserTask(ThreadContext context, IRubyObject handler, InputStream stream) - { - this(context, handler, parse(context.runtime, stream), stream); - } - - // IMPL with Html4SaxPushParser - protected ParserTask(ThreadContext context, IRubyObject handler, XmlSaxParserContext parser, InputStream stream) { - super(context, handler, parser); - this.stream = stream; - } - - @Override - public XmlSaxParserContext - call() throws Exception - { - try { - parser.parse_with(context, handler); - } finally { stream.close(); } - // we have to close the stream before exiting, otherwise someone - // can add a chunk and block on task.get() forever. - return parser; - } - - final NokogiriHandler - getNokogiriHandler() - { - return parser.getNokogiriHandler(); - } - - synchronized final int - getErrorCount() - { - // check for null because thread may not have started yet - if (parser.getNokogiriErrorHandler() == null) { return 0; } - return parser.getNokogiriErrorHandler().getErrors().size(); - } - - synchronized final RubyException - getLastError() - { - List errors = parser.getNokogiriErrorHandler().getErrors(); - return errors.get(errors.size() - 1); + super(context, handler, parser, stream); } } } diff --git a/ext/java/nokogiri/internals/DomParserContext.java b/ext/java/nokogiri/internals/DomParserContext.java new file mode 100644 index 0000000000..1748eb41d1 --- /dev/null +++ b/ext/java/nokogiri/internals/DomParserContext.java @@ -0,0 +1,157 @@ +package nokogiri.internals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.jruby.Ruby; +import org.jruby.RubyArray; +import org.jruby.RubyClass; +import org.jruby.RubyException; +import org.jruby.RubyFixnum; +import org.jruby.runtime.Helpers; +import org.jruby.runtime.ThreadContext; +import org.jruby.runtime.builtin.IRubyObject; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import nokogiri.internals.ParserContext; +import nokogiri.internals.ParserContext.Options; + +import nokogiri.XmlDocument; +import nokogiri.XmlDtd; +import nokogiri.XmlSyntaxError; + +import static nokogiri.internals.NokogiriHelpers.isBlank; + +public abstract class DomParserContext extends ParserContext +{ + private static final long serialVersionUID = 1L; + + protected ParserContext.Options options; + protected TParser parser; + protected IRubyObject ruby_encoding; + protected NokogiriErrorHandler errorHandler; + + public + DomParserContext(Ruby ruby, IRubyObject parserOptions, IRubyObject encoding) + { + super(ruby, ruby.getObject()); // class 'Object' because this class hierarchy isn't exposed to Ruby + options = new ParserContext.Options(RubyFixnum.fix2long(parserOptions)); + java_encoding = NokogiriHelpers.getValidEncodingOrNull(encoding); + ruby_encoding = encoding; + + if (options.recover) { + errorHandler = new NokogiriNonStrictErrorHandler(ruby, options.noError, options.noWarning); + } else { + errorHandler = new NokogiriStrictErrorHandler(ruby, options.noError, options.noWarning); + } + } + + public XmlDocument + parse(ThreadContext context, RubyClass klass, IRubyObject url) + { + XmlDocument xmlDoc; + try { + parser.parse(getInputSource()); + } catch (NullPointerException ex) { + // FIXME: this is really a hack to fix #838. Xerces will throw a NullPointerException + // if we tried to parse ''. We should submit a patch to Xerces. + } catch (SAXException e) { + return getDocumentWithErrorsOrRaiseException(context, klass, e); + } catch (IOException e) { + return getDocumentWithErrorsOrRaiseException(context, klass, e); + } + + if (options.noBlanks) { + List emptyNodes = new ArrayList(); + findEmptyTexts(parser.getDocument(), emptyNodes); + if (emptyNodes.size() > 0) { + for (Node node : emptyNodes) { + node.getParentNode().removeChild(node); + } + } + } + xmlDoc = wrapDocument(context, klass, parser.getDocument()); + xmlDoc.setUrl(url); + addErrorsIfNecessary(context, xmlDoc); + return xmlDoc; + } + + public XmlDocument + getDocumentWithErrorsOrRaiseException(ThreadContext context, RubyClass klazz, Exception ex) + { + if (options.recover) { + XmlDocument xmlDocument = getInterruptedOrNewXmlDocument(context, klazz); + this.addErrorsIfNecessary(context, xmlDocument); + XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); + xmlSyntaxError.setException(ex); + ((RubyArray) xmlDocument.getInstanceVariable("@errors")).append(xmlSyntaxError); + return xmlDocument; + } else { + XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); + xmlSyntaxError.setException(ex); + throw xmlSyntaxError.toThrowable(); + } + } + + private XmlDocument + getInterruptedOrNewXmlDocument(ThreadContext context, RubyClass klass) + { + Document document = parser.getDocument(); + XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, document); + xmlDocument.setEncoding(ruby_encoding); + return xmlDocument; + } + + public void + addErrorsIfNecessary(ThreadContext context, XmlDocument doc) + { + doc.setInstanceVariable("@errors", mapErrors(context, errorHandler)); + } + + private static void + findEmptyTexts(Node node, List emptyNodes) + { + if (node.getNodeType() == Node.TEXT_NODE && isBlank(node.getTextContent())) { + emptyNodes.add(node); + } else { + NodeList children = node.getChildNodes(); + for (int i = 0; i < children.getLength(); i++) { + findEmptyTexts(children.item(i), emptyNodes); + } + } + } + + protected XmlDocument + wrapDocument(ThreadContext context, RubyClass klass, Document doc) + { + XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, doc); + Helpers.invoke(context, xmlDocument, "initialize"); + xmlDocument.setEncoding(ruby_encoding); + + if (options.dtdLoad) { + IRubyObject dtd = XmlDtd.newFromExternalSubset(context.runtime, doc); + if (!dtd.isNil()) { + doc.setUserData(XmlDocument.DTD_EXTERNAL_SUBSET, (XmlDtd) dtd, null); + } + } + return xmlDocument; + } + + public static RubyArray + mapErrors(ThreadContext context, NokogiriErrorHandler errorHandler) + { + final Ruby runtime = context.runtime; + final List errors = errorHandler.getErrors(); + final IRubyObject[] errorsAry = new IRubyObject[errors.size()]; + for (int i = 0; i < errors.size(); i++) { + errorsAry[i] = errors.get(i); + } + return runtime.newArrayNoCopy(errorsAry); + } +} diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java index e3968f8c84..5a861267b9 100644 --- a/ext/java/nokogiri/internals/HtmlDomParserContext.java +++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java @@ -41,13 +41,13 @@ public class HtmlDomParserContext extends XmlDomParserContext public HtmlDomParserContext(Ruby runtime, IRubyObject options) { - this(runtime, runtime.getNil(), options); + this(runtime, options, runtime.getNil()); } public - HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) + HtmlDomParserContext(Ruby runtime, IRubyObject options, IRubyObject encoding) { - super(runtime, encoding, options); + super(runtime, options, encoding); java_encoding = NokogiriHelpers.getValidEncoding(encoding); } diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index 27fa835dc8..5d5f810d88 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.io.InputStream; import java.net.URI; -import java.util.concurrent.Callable; import org.jruby.Ruby; import org.jruby.RubyClass; @@ -35,13 +34,6 @@ public abstract class ParserContext extends RubyObject protected int stringDataSize = -1; protected String java_encoding; - public - ParserContext(Ruby runtime) - { - // default to class 'Object' because this class isn't exposed to Ruby - super(runtime, runtime.getObject()); - } - public ParserContext(Ruby runtime, RubyClass klass) { @@ -225,38 +217,4 @@ public static class Options noXIncNode = test(options, NOXINCNODE); } } - - /* - public static class NokogiriXInlcudeEntityResolver implements org.xml.sax.EntityResolver { - InputSource source; - public NokogiriXInlcudeEntityResolver(InputSource source) { - this.source = source; - } - - @Override - public InputSource resolveEntity(String publicId, String systemId) - throws SAXException, IOException { - if (systemId != null) source.setSystemId(systemId); - if (publicId != null) source.setPublicId(publicId); - return source; - } - } */ - - public static abstract class ParserTask implements Callable - { - - protected final ThreadContext context; // TODO does not seem like a good idea!? - protected final IRubyObject handler; - protected final T parser; - - protected - ParserTask(ThreadContext context, IRubyObject handler, T parser) - { - this.context = context; - this.handler = handler; - this.parser = parser; - } - - } - } diff --git a/ext/java/nokogiri/internals/SaxParserContext.java b/ext/java/nokogiri/internals/SaxParserContext.java new file mode 100644 index 0000000000..efe24fccfb --- /dev/null +++ b/ext/java/nokogiri/internals/SaxParserContext.java @@ -0,0 +1,181 @@ +package nokogiri.internals; + +import nokogiri.internals.*; + +import org.jruby.Ruby; +import org.jruby.RubyClass; +import org.jruby.RubyException; +import org.jruby.RubyFixnum; +import org.jruby.anno.JRubyMethod; +import org.jruby.exceptions.RaiseException; +import org.jruby.runtime.ThreadContext; +import org.jruby.runtime.builtin.IRubyObject; +import static org.jruby.runtime.Helpers.invoke; + +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.concurrent.Callable; + +public abstract class SaxParserContext extends ParserContext +{ + private static final long serialVersionUID = 1L; + + protected TParser parser; + protected NokogiriHandler handler; + protected NokogiriErrorHandler errorHandler; + + protected boolean replaceEntities = true; + protected boolean recovery = false; + + protected static final String FEATURE_CONTINUE_AFTER_FATAL_ERROR = + "http://apache.org/xml/features/continue-after-fatal-error"; + protected static final String PROPERTY_LEXICAL_HANDLER = + "http://xml.org/sax/properties/lexical-handler"; + + public + SaxParserContext(final Ruby ruby, RubyClass rubyClass) + { + super(ruby, rubyClass); + } + + @Override + public Object + clone() throws CloneNotSupportedException + { + return super.clone(); + } + + protected void + initialize(Ruby runtime) + { + try { + parser = createParser(); + } catch (SAXException se) { + // Unexpected failure in XML subsystem + RaiseException ex = runtime.newRuntimeError(se.toString()); + ex.initCause(se); + throw ex; + } + } + + protected abstract TParser createParser() throws SAXException; + + public final NokogiriHandler + getNokogiriHandler() { return handler; } + + public final NokogiriErrorHandler + getNokogiriErrorHandler() { return errorHandler; } + + protected abstract Options defaultParseOptions(ThreadContext context); + + protected void + parseSetup(ThreadContext context, IRubyObject rubyParser) + { + } + + public IRubyObject + parse_with(ThreadContext context, IRubyObject rubyParser) + { + final Ruby runtime = context.runtime; + + if (!invoke(context, rubyParser, "respond_to?", runtime.newSymbol("document")).isTrue()) { + throw runtime.newArgumentError("argument must respond_to document"); + } + + /* TODO: how should we pass in parse options? */ + ParserContext.Options options = defaultParseOptions(context); + + errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning); + handler = new NokogiriHandler(runtime, rubyParser, errorHandler); + + parseSetup(context, rubyParser); + + parser.setContentHandler(handler); + parser.setErrorHandler(handler); + parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options)); + try { + parser.setProperty(PROPERTY_LEXICAL_HANDLER, handler); + } catch (SAXException e) { + throw runtime.newRuntimeError(e.getMessage()); + } + + try { + try { + parser.parse(getInputSource()); + } catch (SAXParseException ex) { + // A bad document () should call the + // error handler instead of raising a SAX exception. + + // However, an EMPTY document should raise a RuntimeError. + // This is a bit kludgy, but AFAIK SAX doesn't distinguish + // between empty and bad whereas Nokogiri does. + String message = ex.getMessage(); + if (message != null && message.contains("Premature end of file.") && stringDataSize < 1) { + throw runtime.newRuntimeError("couldn't parse document: " + message); + } + handler.error(ex); + } + } catch (SAXException ex) { + // Unexpected failure in XML subsystem + throw runtime.newRuntimeError(ex.getMessage()); + } catch (IOException ex) { + throw runtime.newIOErrorFromException(ex); + } + + return runtime.getNil(); + } + + public static abstract class ParserTask> implements Callable + { + protected final ThreadContext context; // TODO does not seem like a good idea!? + protected final IRubyObject handler; + protected final T parser; + final InputStream stream; + + public + ParserTask(ThreadContext context, IRubyObject handler, T parser, InputStream stream) + { + this.context = context; + this.handler = handler; + this.parser = parser; + this.stream = stream; + } + + public final NokogiriHandler + getNokogiriHandler() + { + return parser.getNokogiriHandler(); + } + + public synchronized final int + getErrorCount() + { + // check for null because thread may not have started yet + if (parser.getNokogiriErrorHandler() == null) { return 0; } + return parser.getNokogiriErrorHandler().getErrors().size(); + } + + public synchronized final RubyException + getLastError() + { + List errors = parser.getNokogiriErrorHandler().getErrors(); + return errors.get(errors.size() - 1); + } + + @Override + public T + call() throws Exception + { + try { + parser.parse_with(context, handler); + } finally { stream.close(); } + // we have to close the stream before exiting, otherwise someone + // can add a chunk and block on task.get() forever. + return parser; + } + } +} diff --git a/ext/java/nokogiri/internals/XmlDomParserContext.java b/ext/java/nokogiri/internals/XmlDomParserContext.java index 557c2f18e3..19b1e58364 100644 --- a/ext/java/nokogiri/internals/XmlDomParserContext.java +++ b/ext/java/nokogiri/internals/XmlDomParserContext.java @@ -1,24 +1,15 @@ package nokogiri.internals; -import nokogiri.XmlDocument; -import nokogiri.XmlDtd; -import nokogiri.XmlSyntaxError; import org.apache.xerces.parsers.DOMParser; -import org.jruby.*; -import org.jruby.exceptions.RaiseException; -import org.jruby.runtime.Helpers; + +import org.jruby.Ruby; +import org.jruby.RubyClass; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import org.xml.sax.SAXException; -import static nokogiri.internals.NokogiriHelpers.isBlank; +import nokogiri.XmlDocument; /** * Parser class for XML DOM processing. This class actually parses XML document @@ -28,7 +19,7 @@ * @author sergio * @author Yoko Harada */ -public class XmlDomParserContext extends ParserContext +public class XmlDomParserContext extends DomParserContext { private static final long serialVersionUID = 1L; @@ -45,36 +36,18 @@ public class XmlDomParserContext extends ParserContext protected static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation"; private static final String SECURITY_MANAGER = "http://apache.org/xml/properties/security-manager"; - protected ParserContext.Options options; - protected DOMParser parser; - protected NokogiriErrorHandler errorHandler; - protected IRubyObject ruby_encoding; - public XmlDomParserContext(Ruby runtime, IRubyObject options) { - this(runtime, runtime.getNil(), options); + this(runtime, options, runtime.getNil()); } public - XmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) + XmlDomParserContext(Ruby runtime, IRubyObject parserOptions, IRubyObject encoding) { - super(runtime); - this.options = new ParserContext.Options(RubyFixnum.fix2long(options)); - java_encoding = NokogiriHelpers.getValidEncodingOrNull(encoding); - ruby_encoding = encoding; - initErrorHandler(runtime); - initParser(runtime); - } + super(runtime, parserOptions, encoding); - protected void - initErrorHandler(Ruby runtime) - { - if (options.recover) { - errorHandler = new NokogiriNonStrictErrorHandler(runtime, options.noError, options.noWarning); - } else { - errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning); - } + initParser(runtime); } protected void @@ -144,122 +117,13 @@ public class XmlDomParserContext extends ParserContext } } - public void - addErrorsIfNecessary(ThreadContext context, XmlDocument doc) - { - doc.setInstanceVariable("@errors", mapErrors(context, errorHandler)); - } - - - public static RubyArray - mapErrors(ThreadContext context, NokogiriErrorHandler errorHandler) - { - final Ruby runtime = context.runtime; - final List errors = errorHandler.getErrors(); - final IRubyObject[] errorsAry = new IRubyObject[errors.size()]; - for (int i = 0; i < errors.size(); i++) { - errorsAry[i] = errors.get(i); - } - return runtime.newArrayNoCopy(errorsAry); - } - - public XmlDocument - getDocumentWithErrorsOrRaiseException(ThreadContext context, RubyClass klazz, Exception ex) - { - if (options.recover) { - XmlDocument xmlDocument = getInterruptedOrNewXmlDocument(context, klazz); - this.addErrorsIfNecessary(context, xmlDocument); - XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); - xmlSyntaxError.setException(ex); - ((RubyArray) xmlDocument.getInstanceVariable("@errors")).append(xmlSyntaxError); - return xmlDocument; - } else { - XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); - xmlSyntaxError.setException(ex); - throw xmlSyntaxError.toThrowable(); - } - } - - private XmlDocument - getInterruptedOrNewXmlDocument(ThreadContext context, RubyClass klass) - { - Document document = parser.getDocument(); - XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, document); - xmlDocument.setEncoding(ruby_encoding); - return xmlDocument; - } - - /** - * This method is broken out so that HtmlDomParserContext can - * override it. - */ - protected XmlDocument - wrapDocument(ThreadContext context, RubyClass klass, Document doc) - { - XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, doc); - Helpers.invoke(context, xmlDocument, "initialize"); - xmlDocument.setEncoding(ruby_encoding); - - if (options.dtdLoad) { - IRubyObject dtd = XmlDtd.newFromExternalSubset(context.runtime, doc); - if (!dtd.isNil()) { - doc.setUserData(XmlDocument.DTD_EXTERNAL_SUBSET, (XmlDtd) dtd, null); - } - } - return xmlDocument; - } - /** * Must call setInputSource() before this method. */ + @Override public XmlDocument parse(ThreadContext context, RubyClass klass, IRubyObject url) { - XmlDocument xmlDoc; - try { - Document doc = do_parse(); - xmlDoc = wrapDocument(context, klass, doc); - xmlDoc.setUrl(url); - addErrorsIfNecessary(context, xmlDoc); - return xmlDoc; - } catch (SAXException e) { - return getDocumentWithErrorsOrRaiseException(context, klass, e); - } catch (IOException e) { - return getDocumentWithErrorsOrRaiseException(context, klass, e); - } - } - - protected Document - do_parse() throws SAXException, IOException - { - try { - parser.parse(getInputSource()); - } catch (NullPointerException ex) { - // FIXME: this is really a hack to fix #838. Xerces will throw a NullPointerException - // if we tried to parse ''. We should submit a patch to Xerces. - } - if (options.noBlanks) { - List emptyNodes = new ArrayList(); - findEmptyTexts(parser.getDocument(), emptyNodes); - if (emptyNodes.size() > 0) { - for (Node node : emptyNodes) { - node.getParentNode().removeChild(node); - } - } - } - return parser.getDocument(); - } - - private static void - findEmptyTexts(Node node, List emptyNodes) - { - if (node.getNodeType() == Node.TEXT_NODE && isBlank(node.getTextContent())) { - emptyNodes.add(node); - } else { - NodeList children = node.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - findEmptyTexts(children.item(i), emptyNodes); - } - } + return super.parse(context, klass, url); } } diff --git a/ext/java/nokogiri/internals/XmlSaxParser.java b/ext/java/nokogiri/internals/XmlSaxParser.java index 478fe8fd1a..b55ae178a6 100644 --- a/ext/java/nokogiri/internals/XmlSaxParser.java +++ b/ext/java/nokogiri/internals/XmlSaxParser.java @@ -12,7 +12,6 @@ */ public class XmlSaxParser extends SAXParser { - protected XmlDeclHandler xmlDeclHandler = null; public diff --git a/nokogiri.gemspec b/nokogiri.gemspec index 9611ade2fc..86871eb94c 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -111,6 +111,7 @@ Gem::Specification.new do |spec| "ext/java/nokogiri/internals/ParserContext.java", "ext/java/nokogiri/internals/ReaderNode.java", "ext/java/nokogiri/internals/SaveContextVisitor.java", + "ext/java/nokogiri/internals/SaxParserContext.java", "ext/java/nokogiri/internals/SchemaErrorHandler.java", "ext/java/nokogiri/internals/XalanDTMManagerPatch.java", "ext/java/nokogiri/internals/XmlDeclHandler.java",