diff --git a/ext/java/nokogiri/Html4Document.java b/ext/java/nokogiri/Html4Document.java
index b0de1adf1e..f31262b505 100644
--- a/ext/java/nokogiri/Html4Document.java
+++ b/ext/java/nokogiri/Html4Document.java
@@ -141,7 +141,7 @@ public class Html4Document extends XmlDocument
public static IRubyObject
read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args)
{
- HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[3], args[2]);
ctx.setIOInputSource(context, args[0], args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}
@@ -150,7 +150,7 @@ public class Html4Document extends XmlDocument
public static IRubyObject
read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args)
{
- HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[3], args[2]);
ctx.setStringInputSource(context, args[0], args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}
diff --git a/ext/java/nokogiri/Html4SaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java
index ed34e5b313..c409959200 100644
--- a/ext/java/nokogiri/Html4SaxParserContext.java
+++ b/ext/java/nokogiri/Html4SaxParserContext.java
@@ -1,5 +1,8 @@
package nokogiri;
+import nokogiri.internals.*;
+import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
+
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
@@ -8,7 +11,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.xerces.parsers.AbstractSAXParser;
import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
@@ -16,12 +18,11 @@
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
+import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;
-import nokogiri.internals.NokogiriHandler;
-import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
/**
* Class for Nokogiri::HTML4::SAX::ParserContext.
@@ -31,7 +32,7 @@
* @author Yoko Harada
*/
@JRubyClass(name = "Nokogiri::HTML4::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext")
-public class Html4SaxParserContext extends XmlSaxParserContext
+public class Html4SaxParserContext extends SaxParserContext
{
private static final long serialVersionUID = 1L;
@@ -50,7 +51,7 @@ public class Html4SaxParserContext extends XmlSaxParserContext
}
@Override
- protected AbstractSAXParser
+ protected SAXParser
createParser() throws SAXException
{
SAXParser parser = new SAXParser();
@@ -279,11 +280,69 @@ static EncodingType get(final int ordinal)
return ctx;
}
- @Override
- protected void
- preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
+ protected Options
+ defaultParseOptions(ThreadContext context)
+ {
+ return new ParserContext.Options(
+ RubyFixnum.fix2long(Helpers.invoke(context,
+ ((RubyClass)context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions"))
+ .getConstant("DEFAULT_HTML"),
+ "to_i"))
+ );
+ }
+
+ @JRubyMethod
+ public IRubyObject
+ parse_with(ThreadContext context, IRubyObject rubyParser)
+ {
+ return super.parse_with(context, rubyParser);
+ }
+
+ @JRubyMethod(name = "replace_entities=")
+ public IRubyObject
+ set_replace_entities(ThreadContext context, IRubyObject value)
+ {
+ replaceEntities = value.isTrue();
+ return this;
+ }
+
+ @JRubyMethod(name = "replace_entities")
+ public IRubyObject
+ get_replace_entities(ThreadContext context)
+ {
+ return context.runtime.newBoolean(replaceEntities);
+ }
+
+ @JRubyMethod(name = "recovery=")
+ public IRubyObject
+ set_recovery(ThreadContext context, IRubyObject value)
+ {
+ recovery = value.isTrue();
+ return this;
+ }
+
+ @JRubyMethod(name = "recovery")
+ public IRubyObject
+ get_recovery(ThreadContext context)
+ {
+ return context.runtime.newBoolean(recovery);
+ }
+
+ @JRubyMethod(name = "column")
+ public IRubyObject
+ column(ThreadContext context)
{
- // this function is meant to be empty. It overrides the one in XmlSaxParserContext
+ final Integer number = handler.getColumn();
+ if (number == null) { return context.getRuntime().getNil(); }
+ return RubyFixnum.newFixnum(context.getRuntime(), number.longValue());
}
+ @JRubyMethod(name = "line")
+ public IRubyObject
+ line(ThreadContext context)
+ {
+ final Integer number = handler.getLine();
+ if (number == null) { return context.getRuntime().getNil(); }
+ return RubyFixnum.newFixnum(context.getRuntime(), number.longValue());
+ }
}
diff --git a/ext/java/nokogiri/Html4SaxPushParser.java b/ext/java/nokogiri/Html4SaxPushParser.java
index c338fd3696..d77db393fa 100644
--- a/ext/java/nokogiri/Html4SaxPushParser.java
+++ b/ext/java/nokogiri/Html4SaxPushParser.java
@@ -4,6 +4,7 @@
import nokogiri.internals.NokogiriBlockingQueueInputStream;
import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.ParserContext;
+import nokogiri.internals.SaxParserContext;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
@@ -146,7 +147,7 @@ public class Html4SaxPushParser extends RubyObject
stream = new NokogiriBlockingQueueInputStream();
assert saxParser != null : "saxParser null";
- parserTask = new ParserTask(context, saxParser, stream);
+ parserTask = new ParserTask(context, saxParser, parse(context.runtime, stream), stream);
futureTask = new FutureTask((Callable) parserTask);
executor = Executors.newSingleThreadExecutor(new ThreadFactory() {
@Override
@@ -192,22 +193,12 @@ public Thread newThread(Runnable r) {
return Html4SaxParserContext.parse_stream(runtime, klazz, stream);
}
- static class ParserTask extends XmlSaxPushParser.ParserTask /* */
+ static class ParserTask extends SaxParserContext.ParserTask
{
-
private
- ParserTask(ThreadContext context, IRubyObject handler, InputStream stream)
- {
- super(context, handler, parse(context.runtime, stream), stream);
- }
-
- @Override
- public Html4SaxParserContext
- call() throws Exception
+ ParserTask(ThreadContext context, IRubyObject handler, Html4SaxParserContext parser, InputStream stream)
{
- return (Html4SaxParserContext) super.call();
+ super(context, handler, parser, stream);
}
-
}
-
}
diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java
index 3141ae28c4..63ab80cf58 100644
--- a/ext/java/nokogiri/XmlDocument.java
+++ b/ext/java/nokogiri/XmlDocument.java
@@ -352,7 +352,7 @@ private static class DocumentBuilderFactoryHolder
public static IRubyObject
read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args)
{
- XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]);
+ XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[3], args[2]);
ctx.setIOInputSource(context, args[0], args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}
@@ -361,7 +361,7 @@ private static class DocumentBuilderFactoryHolder
public static IRubyObject
read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args)
{
- XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]);
+ XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[3], args[2]);
ctx.setStringInputSource(context, args[0], args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}
diff --git a/ext/java/nokogiri/XmlSaxParserContext.java b/ext/java/nokogiri/XmlSaxParserContext.java
index 53ea7383b0..aacd18b437 100644
--- a/ext/java/nokogiri/XmlSaxParserContext.java
+++ b/ext/java/nokogiri/XmlSaxParserContext.java
@@ -1,7 +1,6 @@
package nokogiri;
import nokogiri.internals.*;
-import org.apache.xerces.parsers.AbstractSAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
@@ -12,7 +11,6 @@
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
import java.io.IOException;
import java.io.InputStream;
@@ -26,7 +24,7 @@
* @author Yoko Harada
*/
@JRubyClass(name = "Nokogiri::XML::SAX::ParserContext")
-public class XmlSaxParserContext extends ParserContext
+public class XmlSaxParserContext extends SaxParserContext
{
private static final long serialVersionUID = 1L;
@@ -36,15 +34,6 @@ public class XmlSaxParserContext extends ParserContext
"http://xml.org/sax/features/namespace-prefixes";
protected static final String FEATURE_LOAD_EXTERNAL_DTD =
"http://apache.org/xml/features/nonvalidating/load-external-dtd";
- protected static final String FEATURE_CONTINUE_AFTER_FATAL_ERROR =
- "http://apache.org/xml/features/continue-after-fatal-error";
-
- protected AbstractSAXParser parser;
-
- protected NokogiriHandler handler;
- protected NokogiriErrorHandler errorHandler;
- private boolean replaceEntities = true;
- private boolean recovery = false;
public
XmlSaxParserContext(final Ruby ruby, RubyClass rubyClass)
@@ -52,32 +41,13 @@ public class XmlSaxParserContext extends ParserContext
super(ruby, rubyClass);
}
- protected void
- initialize(Ruby runtime)
- {
- try {
- parser = createParser();
- } catch (SAXException se) {
- // Unexpected failure in XML subsystem
- RaiseException ex = runtime.newRuntimeError(se.toString());
- ex.initCause(se);
- throw ex;
- }
- }
-
- /**
- * Create and return a copy of this object.
- *
- * @return a clone of this object
- */
- @Override
- public Object
- clone() throws CloneNotSupportedException
+ private static XmlSaxParserContext
+ newInstance(final Ruby runtime, final RubyClass klazz)
{
- return super.clone();
+ return (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz);
}
- protected AbstractSAXParser
+ protected XmlSaxParser
createParser() throws SAXException
{
XmlSaxParser parser = new XmlSaxParser();
@@ -157,49 +127,7 @@ public class XmlSaxParserContext extends ParserContext
return ctx;
}
- private static XmlSaxParserContext
- newInstance(final Ruby runtime, final RubyClass klazz)
- {
- return (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz);
- }
-
- public final NokogiriHandler
- getNokogiriHandler() { return handler; }
-
- public final NokogiriErrorHandler
- getNokogiriErrorHandler() { return errorHandler; }
-
- /**
- * Perform any initialization prior to parsing with the handler
- * handlerRuby
. Convenience hook for subclasses.
- */
- protected void
- preParse(Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
- {
- ((XmlSaxParser) parser).setXmlDeclHandler(handler);
- if (recovery) {
- try {
- parser.setFeature(FEATURE_CONTINUE_AFTER_FATAL_ERROR, true);
- } catch (Exception e) {
- // Unexpected failure in XML subsystem
- throw runtime.newRuntimeError(e.getMessage());
- }
- }
- }
-
- protected void
- postParse(Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
- {
- // noop
- }
-
- protected void
- do_parse() throws SAXException, IOException
- {
- parser.parse(getInputSource());
- }
-
- protected static Options
+ protected Options
defaultParseOptions(ThreadContext context)
{
return new ParserContext.Options(
@@ -210,68 +138,27 @@ public class XmlSaxParserContext extends ParserContext
);
}
- @JRubyMethod
- public IRubyObject
- parse_with(ThreadContext context, IRubyObject handlerRuby)
+ protected void
+ parseSetup(ThreadContext context, IRubyObject rubyParser)
{
- final Ruby runtime = context.getRuntime();
-
- if (!invoke(context, handlerRuby, "respond_to?", runtime.newSymbol("document")).isTrue()) {
- throw runtime.newArgumentError("argument must respond_to document");
- }
-
- /* TODO: how should we pass in parse options? */
- ParserContext.Options options = defaultParseOptions(context);
-
- errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning);
- handler = new NokogiriHandler(runtime, handlerRuby, errorHandler);
-
- preParse(runtime, handlerRuby, handler);
- parser.setContentHandler(handler);
- parser.setErrorHandler(handler);
- parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options));
+ parser.setXmlDeclHandler(handler);
- try {
- parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
- } catch (Exception ex) {
- throw runtime.newRuntimeError("Problem while creating XML SAX Parser: " + ex.toString());
- }
-
- try {
+ if (recovery) {
try {
- do_parse();
- } catch (SAXParseException ex) {
- // A bad document () should call the
- // error handler instead of raising a SAX exception.
-
- // However, an EMPTY document should raise a RuntimeError.
- // This is a bit kludgy, but AFAIK SAX doesn't distinguish
- // between empty and bad whereas Nokogiri does.
- String message = ex.getMessage();
- if (message != null && message.contains("Premature end of file.") && stringDataSize < 1) {
- throw runtime.newRuntimeError("couldn't parse document: " + message);
- }
- handler.error(ex);
+ parser.setFeature(FEATURE_CONTINUE_AFTER_FATAL_ERROR, true);
+ } catch (SAXException e) {
+ throw context.runtime.newRuntimeError(e.getMessage());
}
- } catch (SAXException ex) {
- // Unexpected failure in XML subsystem
- throw runtime.newRuntimeError(ex.getMessage());
- } catch (IOException ex) {
- throw runtime.newIOErrorFromException(ex);
}
+ }
- postParse(runtime, handlerRuby, handler);
-
- return runtime.getNil();
+ @JRubyMethod
+ public IRubyObject
+ parse_with(ThreadContext context, IRubyObject rubyParser)
+ {
+ return super.parse_with(context, rubyParser);
}
- /**
- * Can take a boolean assignment.
- *
- * @param context
- * @param value
- * @return
- */
@JRubyMethod(name = "replace_entities=")
public IRubyObject
set_replace_entities(ThreadContext context, IRubyObject value)
@@ -287,13 +174,6 @@ public class XmlSaxParserContext extends ParserContext
return context.runtime.newBoolean(replaceEntities);
}
- /**
- * Can take a boolean assignment.
- *
- * @param context
- * @param value
- * @return
- */
@JRubyMethod(name = "recovery=")
public IRubyObject
set_recovery(ThreadContext context, IRubyObject value)
diff --git a/ext/java/nokogiri/XmlSaxPushParser.java b/ext/java/nokogiri/XmlSaxPushParser.java
index 26261a33e3..26cb118f29 100644
--- a/ext/java/nokogiri/XmlSaxPushParser.java
+++ b/ext/java/nokogiri/XmlSaxPushParser.java
@@ -1,9 +1,12 @@
package nokogiri;
-import nokogiri.internals.*;
+import nokogiri.internals.ClosedStreamException;
+import nokogiri.internals.NokogiriBlockingQueueInputStream;
+import nokogiri.internals.NokogiriHelpers;
+import nokogiri.internals.ParserContext;
+import nokogiri.internals.SaxParserContext;
import org.jruby.Ruby;
import org.jruby.RubyClass;
-import org.jruby.RubyException;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
@@ -14,7 +17,6 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.List;
import java.util.concurrent.*;
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
@@ -168,7 +170,7 @@ public class XmlSaxPushParser extends RubyObject
stream = new NokogiriBlockingQueueInputStream();
assert saxParser != null : "saxParser null";
- parserTask = new ParserTask(context, saxParser, stream);
+ parserTask = new ParserTask(context, saxParser, parse(context.runtime, stream), stream);
futureTask = new FutureTask(parserTask);
executor = Executors.newSingleThreadExecutor(new ThreadFactory() {
@Override
@@ -233,56 +235,12 @@ public Thread newThread(Runnable r) {
return XmlSaxParserContext.parse_stream(runtime, klazz, stream);
}
- static class ParserTask extends ParserContext.ParserTask
+ static class ParserTask extends SaxParserContext.ParserTask
{
-
- final InputStream stream;
-
private
- ParserTask(ThreadContext context, IRubyObject handler, InputStream stream)
- {
- this(context, handler, parse(context.runtime, stream), stream);
- }
-
- // IMPL with Html4SaxPushParser
- protected
ParserTask(ThreadContext context, IRubyObject handler, XmlSaxParserContext parser, InputStream stream)
{
- super(context, handler, parser);
- this.stream = stream;
- }
-
- @Override
- public XmlSaxParserContext
- call() throws Exception
- {
- try {
- parser.parse_with(context, handler);
- } finally { stream.close(); }
- // we have to close the stream before exiting, otherwise someone
- // can add a chunk and block on task.get() forever.
- return parser;
- }
-
- final NokogiriHandler
- getNokogiriHandler()
- {
- return parser.getNokogiriHandler();
- }
-
- synchronized final int
- getErrorCount()
- {
- // check for null because thread may not have started yet
- if (parser.getNokogiriErrorHandler() == null) { return 0; }
- return parser.getNokogiriErrorHandler().getErrors().size();
- }
-
- synchronized final RubyException
- getLastError()
- {
- List errors = parser.getNokogiriErrorHandler().getErrors();
- return errors.get(errors.size() - 1);
+ super(context, handler, parser, stream);
}
}
}
diff --git a/ext/java/nokogiri/internals/DomParserContext.java b/ext/java/nokogiri/internals/DomParserContext.java
new file mode 100644
index 0000000000..1748eb41d1
--- /dev/null
+++ b/ext/java/nokogiri/internals/DomParserContext.java
@@ -0,0 +1,157 @@
+package nokogiri.internals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.jruby.Ruby;
+import org.jruby.RubyArray;
+import org.jruby.RubyClass;
+import org.jruby.RubyException;
+import org.jruby.RubyFixnum;
+import org.jruby.runtime.Helpers;
+import org.jruby.runtime.ThreadContext;
+import org.jruby.runtime.builtin.IRubyObject;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import nokogiri.internals.ParserContext;
+import nokogiri.internals.ParserContext.Options;
+
+import nokogiri.XmlDocument;
+import nokogiri.XmlDtd;
+import nokogiri.XmlSyntaxError;
+
+import static nokogiri.internals.NokogiriHelpers.isBlank;
+
+public abstract class DomParserContext extends ParserContext
+{
+ private static final long serialVersionUID = 1L;
+
+ protected ParserContext.Options options;
+ protected TParser parser;
+ protected IRubyObject ruby_encoding;
+ protected NokogiriErrorHandler errorHandler;
+
+ public
+ DomParserContext(Ruby ruby, IRubyObject parserOptions, IRubyObject encoding)
+ {
+ super(ruby, ruby.getObject()); // class 'Object' because this class hierarchy isn't exposed to Ruby
+ options = new ParserContext.Options(RubyFixnum.fix2long(parserOptions));
+ java_encoding = NokogiriHelpers.getValidEncodingOrNull(encoding);
+ ruby_encoding = encoding;
+
+ if (options.recover) {
+ errorHandler = new NokogiriNonStrictErrorHandler(ruby, options.noError, options.noWarning);
+ } else {
+ errorHandler = new NokogiriStrictErrorHandler(ruby, options.noError, options.noWarning);
+ }
+ }
+
+ public XmlDocument
+ parse(ThreadContext context, RubyClass klass, IRubyObject url)
+ {
+ XmlDocument xmlDoc;
+ try {
+ parser.parse(getInputSource());
+ } catch (NullPointerException ex) {
+ // FIXME: this is really a hack to fix #838. Xerces will throw a NullPointerException
+ // if we tried to parse ' ?>'. We should submit a patch to Xerces.
+ } catch (SAXException e) {
+ return getDocumentWithErrorsOrRaiseException(context, klass, e);
+ } catch (IOException e) {
+ return getDocumentWithErrorsOrRaiseException(context, klass, e);
+ }
+
+ if (options.noBlanks) {
+ List emptyNodes = new ArrayList();
+ findEmptyTexts(parser.getDocument(), emptyNodes);
+ if (emptyNodes.size() > 0) {
+ for (Node node : emptyNodes) {
+ node.getParentNode().removeChild(node);
+ }
+ }
+ }
+ xmlDoc = wrapDocument(context, klass, parser.getDocument());
+ xmlDoc.setUrl(url);
+ addErrorsIfNecessary(context, xmlDoc);
+ return xmlDoc;
+ }
+
+ public XmlDocument
+ getDocumentWithErrorsOrRaiseException(ThreadContext context, RubyClass klazz, Exception ex)
+ {
+ if (options.recover) {
+ XmlDocument xmlDocument = getInterruptedOrNewXmlDocument(context, klazz);
+ this.addErrorsIfNecessary(context, xmlDocument);
+ XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
+ xmlSyntaxError.setException(ex);
+ ((RubyArray) xmlDocument.getInstanceVariable("@errors")).append(xmlSyntaxError);
+ return xmlDocument;
+ } else {
+ XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
+ xmlSyntaxError.setException(ex);
+ throw xmlSyntaxError.toThrowable();
+ }
+ }
+
+ private XmlDocument
+ getInterruptedOrNewXmlDocument(ThreadContext context, RubyClass klass)
+ {
+ Document document = parser.getDocument();
+ XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, document);
+ xmlDocument.setEncoding(ruby_encoding);
+ return xmlDocument;
+ }
+
+ public void
+ addErrorsIfNecessary(ThreadContext context, XmlDocument doc)
+ {
+ doc.setInstanceVariable("@errors", mapErrors(context, errorHandler));
+ }
+
+ private static void
+ findEmptyTexts(Node node, List emptyNodes)
+ {
+ if (node.getNodeType() == Node.TEXT_NODE && isBlank(node.getTextContent())) {
+ emptyNodes.add(node);
+ } else {
+ NodeList children = node.getChildNodes();
+ for (int i = 0; i < children.getLength(); i++) {
+ findEmptyTexts(children.item(i), emptyNodes);
+ }
+ }
+ }
+
+ protected XmlDocument
+ wrapDocument(ThreadContext context, RubyClass klass, Document doc)
+ {
+ XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, doc);
+ Helpers.invoke(context, xmlDocument, "initialize");
+ xmlDocument.setEncoding(ruby_encoding);
+
+ if (options.dtdLoad) {
+ IRubyObject dtd = XmlDtd.newFromExternalSubset(context.runtime, doc);
+ if (!dtd.isNil()) {
+ doc.setUserData(XmlDocument.DTD_EXTERNAL_SUBSET, (XmlDtd) dtd, null);
+ }
+ }
+ return xmlDocument;
+ }
+
+ public static RubyArray>
+ mapErrors(ThreadContext context, NokogiriErrorHandler errorHandler)
+ {
+ final Ruby runtime = context.runtime;
+ final List errors = errorHandler.getErrors();
+ final IRubyObject[] errorsAry = new IRubyObject[errors.size()];
+ for (int i = 0; i < errors.size(); i++) {
+ errorsAry[i] = errors.get(i);
+ }
+ return runtime.newArrayNoCopy(errorsAry);
+ }
+}
diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java
index e3968f8c84..5a861267b9 100644
--- a/ext/java/nokogiri/internals/HtmlDomParserContext.java
+++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java
@@ -41,13 +41,13 @@ public class HtmlDomParserContext extends XmlDomParserContext
public
HtmlDomParserContext(Ruby runtime, IRubyObject options)
{
- this(runtime, runtime.getNil(), options);
+ this(runtime, options, runtime.getNil());
}
public
- HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options)
+ HtmlDomParserContext(Ruby runtime, IRubyObject options, IRubyObject encoding)
{
- super(runtime, encoding, options);
+ super(runtime, options, encoding);
java_encoding = NokogiriHelpers.getValidEncoding(encoding);
}
diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java
index 27fa835dc8..5d5f810d88 100644
--- a/ext/java/nokogiri/internals/ParserContext.java
+++ b/ext/java/nokogiri/internals/ParserContext.java
@@ -7,7 +7,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
-import java.util.concurrent.Callable;
import org.jruby.Ruby;
import org.jruby.RubyClass;
@@ -35,13 +34,6 @@ public abstract class ParserContext extends RubyObject
protected int stringDataSize = -1;
protected String java_encoding;
- public
- ParserContext(Ruby runtime)
- {
- // default to class 'Object' because this class isn't exposed to Ruby
- super(runtime, runtime.getObject());
- }
-
public
ParserContext(Ruby runtime, RubyClass klass)
{
@@ -225,38 +217,4 @@ public static class Options
noXIncNode = test(options, NOXINCNODE);
}
}
-
- /*
- public static class NokogiriXInlcudeEntityResolver implements org.xml.sax.EntityResolver {
- InputSource source;
- public NokogiriXInlcudeEntityResolver(InputSource source) {
- this.source = source;
- }
-
- @Override
- public InputSource resolveEntity(String publicId, String systemId)
- throws SAXException, IOException {
- if (systemId != null) source.setSystemId(systemId);
- if (publicId != null) source.setPublicId(publicId);
- return source;
- }
- } */
-
- public static abstract class ParserTask implements Callable
- {
-
- protected final ThreadContext context; // TODO does not seem like a good idea!?
- protected final IRubyObject handler;
- protected final T parser;
-
- protected
- ParserTask(ThreadContext context, IRubyObject handler, T parser)
- {
- this.context = context;
- this.handler = handler;
- this.parser = parser;
- }
-
- }
-
}
diff --git a/ext/java/nokogiri/internals/SaxParserContext.java b/ext/java/nokogiri/internals/SaxParserContext.java
new file mode 100644
index 0000000000..efe24fccfb
--- /dev/null
+++ b/ext/java/nokogiri/internals/SaxParserContext.java
@@ -0,0 +1,181 @@
+package nokogiri.internals;
+
+import nokogiri.internals.*;
+
+import org.jruby.Ruby;
+import org.jruby.RubyClass;
+import org.jruby.RubyException;
+import org.jruby.RubyFixnum;
+import org.jruby.anno.JRubyMethod;
+import org.jruby.exceptions.RaiseException;
+import org.jruby.runtime.ThreadContext;
+import org.jruby.runtime.builtin.IRubyObject;
+import static org.jruby.runtime.Helpers.invoke;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+public abstract class SaxParserContext extends ParserContext
+{
+ private static final long serialVersionUID = 1L;
+
+ protected TParser parser;
+ protected NokogiriHandler handler;
+ protected NokogiriErrorHandler errorHandler;
+
+ protected boolean replaceEntities = true;
+ protected boolean recovery = false;
+
+ protected static final String FEATURE_CONTINUE_AFTER_FATAL_ERROR =
+ "http://apache.org/xml/features/continue-after-fatal-error";
+ protected static final String PROPERTY_LEXICAL_HANDLER =
+ "http://xml.org/sax/properties/lexical-handler";
+
+ public
+ SaxParserContext(final Ruby ruby, RubyClass rubyClass)
+ {
+ super(ruby, rubyClass);
+ }
+
+ @Override
+ public Object
+ clone() throws CloneNotSupportedException
+ {
+ return super.clone();
+ }
+
+ protected void
+ initialize(Ruby runtime)
+ {
+ try {
+ parser = createParser();
+ } catch (SAXException se) {
+ // Unexpected failure in XML subsystem
+ RaiseException ex = runtime.newRuntimeError(se.toString());
+ ex.initCause(se);
+ throw ex;
+ }
+ }
+
+ protected abstract TParser createParser() throws SAXException;
+
+ public final NokogiriHandler
+ getNokogiriHandler() { return handler; }
+
+ public final NokogiriErrorHandler
+ getNokogiriErrorHandler() { return errorHandler; }
+
+ protected abstract Options defaultParseOptions(ThreadContext context);
+
+ protected void
+ parseSetup(ThreadContext context, IRubyObject rubyParser)
+ {
+ }
+
+ public IRubyObject
+ parse_with(ThreadContext context, IRubyObject rubyParser)
+ {
+ final Ruby runtime = context.runtime;
+
+ if (!invoke(context, rubyParser, "respond_to?", runtime.newSymbol("document")).isTrue()) {
+ throw runtime.newArgumentError("argument must respond_to document");
+ }
+
+ /* TODO: how should we pass in parse options? */
+ ParserContext.Options options = defaultParseOptions(context);
+
+ errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning);
+ handler = new NokogiriHandler(runtime, rubyParser, errorHandler);
+
+ parseSetup(context, rubyParser);
+
+ parser.setContentHandler(handler);
+ parser.setErrorHandler(handler);
+ parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options));
+ try {
+ parser.setProperty(PROPERTY_LEXICAL_HANDLER, handler);
+ } catch (SAXException e) {
+ throw runtime.newRuntimeError(e.getMessage());
+ }
+
+ try {
+ try {
+ parser.parse(getInputSource());
+ } catch (SAXParseException ex) {
+ // A bad document () should call the
+ // error handler instead of raising a SAX exception.
+
+ // However, an EMPTY document should raise a RuntimeError.
+ // This is a bit kludgy, but AFAIK SAX doesn't distinguish
+ // between empty and bad whereas Nokogiri does.
+ String message = ex.getMessage();
+ if (message != null && message.contains("Premature end of file.") && stringDataSize < 1) {
+ throw runtime.newRuntimeError("couldn't parse document: " + message);
+ }
+ handler.error(ex);
+ }
+ } catch (SAXException ex) {
+ // Unexpected failure in XML subsystem
+ throw runtime.newRuntimeError(ex.getMessage());
+ } catch (IOException ex) {
+ throw runtime.newIOErrorFromException(ex);
+ }
+
+ return runtime.getNil();
+ }
+
+ public static abstract class ParserTask> implements Callable
+ {
+ protected final ThreadContext context; // TODO does not seem like a good idea!?
+ protected final IRubyObject handler;
+ protected final T parser;
+ final InputStream stream;
+
+ public
+ ParserTask(ThreadContext context, IRubyObject handler, T parser, InputStream stream)
+ {
+ this.context = context;
+ this.handler = handler;
+ this.parser = parser;
+ this.stream = stream;
+ }
+
+ public final NokogiriHandler
+ getNokogiriHandler()
+ {
+ return parser.getNokogiriHandler();
+ }
+
+ public synchronized final int
+ getErrorCount()
+ {
+ // check for null because thread may not have started yet
+ if (parser.getNokogiriErrorHandler() == null) { return 0; }
+ return parser.getNokogiriErrorHandler().getErrors().size();
+ }
+
+ public synchronized final RubyException
+ getLastError()
+ {
+ List errors = parser.getNokogiriErrorHandler().getErrors();
+ return errors.get(errors.size() - 1);
+ }
+
+ @Override
+ public T
+ call() throws Exception
+ {
+ try {
+ parser.parse_with(context, handler);
+ } finally { stream.close(); }
+ // we have to close the stream before exiting, otherwise someone
+ // can add a chunk and block on task.get() forever.
+ return parser;
+ }
+ }
+}
diff --git a/ext/java/nokogiri/internals/XmlDomParserContext.java b/ext/java/nokogiri/internals/XmlDomParserContext.java
index 557c2f18e3..19b1e58364 100644
--- a/ext/java/nokogiri/internals/XmlDomParserContext.java
+++ b/ext/java/nokogiri/internals/XmlDomParserContext.java
@@ -1,24 +1,15 @@
package nokogiri.internals;
-import nokogiri.XmlDocument;
-import nokogiri.XmlDtd;
-import nokogiri.XmlSyntaxError;
import org.apache.xerces.parsers.DOMParser;
-import org.jruby.*;
-import org.jruby.exceptions.RaiseException;
-import org.jruby.runtime.Helpers;
+
+import org.jruby.Ruby;
+import org.jruby.RubyClass;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import org.xml.sax.SAXException;
-import static nokogiri.internals.NokogiriHelpers.isBlank;
+import nokogiri.XmlDocument;
/**
* Parser class for XML DOM processing. This class actually parses XML document
@@ -28,7 +19,7 @@
* @author sergio
* @author Yoko Harada
*/
-public class XmlDomParserContext extends ParserContext
+public class XmlDomParserContext extends DomParserContext
{
private static final long serialVersionUID = 1L;
@@ -45,36 +36,18 @@ public class XmlDomParserContext extends ParserContext
protected static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";
private static final String SECURITY_MANAGER = "http://apache.org/xml/properties/security-manager";
- protected ParserContext.Options options;
- protected DOMParser parser;
- protected NokogiriErrorHandler errorHandler;
- protected IRubyObject ruby_encoding;
-
public
XmlDomParserContext(Ruby runtime, IRubyObject options)
{
- this(runtime, runtime.getNil(), options);
+ this(runtime, options, runtime.getNil());
}
public
- XmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options)
+ XmlDomParserContext(Ruby runtime, IRubyObject parserOptions, IRubyObject encoding)
{
- super(runtime);
- this.options = new ParserContext.Options(RubyFixnum.fix2long(options));
- java_encoding = NokogiriHelpers.getValidEncodingOrNull(encoding);
- ruby_encoding = encoding;
- initErrorHandler(runtime);
- initParser(runtime);
- }
+ super(runtime, parserOptions, encoding);
- protected void
- initErrorHandler(Ruby runtime)
- {
- if (options.recover) {
- errorHandler = new NokogiriNonStrictErrorHandler(runtime, options.noError, options.noWarning);
- } else {
- errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning);
- }
+ initParser(runtime);
}
protected void
@@ -144,122 +117,13 @@ public class XmlDomParserContext extends ParserContext
}
}
- public void
- addErrorsIfNecessary(ThreadContext context, XmlDocument doc)
- {
- doc.setInstanceVariable("@errors", mapErrors(context, errorHandler));
- }
-
-
- public static RubyArray>
- mapErrors(ThreadContext context, NokogiriErrorHandler errorHandler)
- {
- final Ruby runtime = context.runtime;
- final List errors = errorHandler.getErrors();
- final IRubyObject[] errorsAry = new IRubyObject[errors.size()];
- for (int i = 0; i < errors.size(); i++) {
- errorsAry[i] = errors.get(i);
- }
- return runtime.newArrayNoCopy(errorsAry);
- }
-
- public XmlDocument
- getDocumentWithErrorsOrRaiseException(ThreadContext context, RubyClass klazz, Exception ex)
- {
- if (options.recover) {
- XmlDocument xmlDocument = getInterruptedOrNewXmlDocument(context, klazz);
- this.addErrorsIfNecessary(context, xmlDocument);
- XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
- xmlSyntaxError.setException(ex);
- ((RubyArray) xmlDocument.getInstanceVariable("@errors")).append(xmlSyntaxError);
- return xmlDocument;
- } else {
- XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
- xmlSyntaxError.setException(ex);
- throw xmlSyntaxError.toThrowable();
- }
- }
-
- private XmlDocument
- getInterruptedOrNewXmlDocument(ThreadContext context, RubyClass klass)
- {
- Document document = parser.getDocument();
- XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, document);
- xmlDocument.setEncoding(ruby_encoding);
- return xmlDocument;
- }
-
- /**
- * This method is broken out so that HtmlDomParserContext can
- * override it.
- */
- protected XmlDocument
- wrapDocument(ThreadContext context, RubyClass klass, Document doc)
- {
- XmlDocument xmlDocument = new XmlDocument(context.runtime, klass, doc);
- Helpers.invoke(context, xmlDocument, "initialize");
- xmlDocument.setEncoding(ruby_encoding);
-
- if (options.dtdLoad) {
- IRubyObject dtd = XmlDtd.newFromExternalSubset(context.runtime, doc);
- if (!dtd.isNil()) {
- doc.setUserData(XmlDocument.DTD_EXTERNAL_SUBSET, (XmlDtd) dtd, null);
- }
- }
- return xmlDocument;
- }
-
/**
* Must call setInputSource() before this method.
*/
+ @Override
public XmlDocument
parse(ThreadContext context, RubyClass klass, IRubyObject url)
{
- XmlDocument xmlDoc;
- try {
- Document doc = do_parse();
- xmlDoc = wrapDocument(context, klass, doc);
- xmlDoc.setUrl(url);
- addErrorsIfNecessary(context, xmlDoc);
- return xmlDoc;
- } catch (SAXException e) {
- return getDocumentWithErrorsOrRaiseException(context, klass, e);
- } catch (IOException e) {
- return getDocumentWithErrorsOrRaiseException(context, klass, e);
- }
- }
-
- protected Document
- do_parse() throws SAXException, IOException
- {
- try {
- parser.parse(getInputSource());
- } catch (NullPointerException ex) {
- // FIXME: this is really a hack to fix #838. Xerces will throw a NullPointerException
- // if we tried to parse ' ?>'. We should submit a patch to Xerces.
- }
- if (options.noBlanks) {
- List emptyNodes = new ArrayList();
- findEmptyTexts(parser.getDocument(), emptyNodes);
- if (emptyNodes.size() > 0) {
- for (Node node : emptyNodes) {
- node.getParentNode().removeChild(node);
- }
- }
- }
- return parser.getDocument();
- }
-
- private static void
- findEmptyTexts(Node node, List emptyNodes)
- {
- if (node.getNodeType() == Node.TEXT_NODE && isBlank(node.getTextContent())) {
- emptyNodes.add(node);
- } else {
- NodeList children = node.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- findEmptyTexts(children.item(i), emptyNodes);
- }
- }
+ return super.parse(context, klass, url);
}
}
diff --git a/ext/java/nokogiri/internals/XmlSaxParser.java b/ext/java/nokogiri/internals/XmlSaxParser.java
index 478fe8fd1a..b55ae178a6 100644
--- a/ext/java/nokogiri/internals/XmlSaxParser.java
+++ b/ext/java/nokogiri/internals/XmlSaxParser.java
@@ -12,7 +12,6 @@
*/
public class XmlSaxParser extends SAXParser
{
-
protected XmlDeclHandler xmlDeclHandler = null;
public
diff --git a/nokogiri.gemspec b/nokogiri.gemspec
index 9611ade2fc..86871eb94c 100644
--- a/nokogiri.gemspec
+++ b/nokogiri.gemspec
@@ -111,6 +111,7 @@ Gem::Specification.new do |spec|
"ext/java/nokogiri/internals/ParserContext.java",
"ext/java/nokogiri/internals/ReaderNode.java",
"ext/java/nokogiri/internals/SaveContextVisitor.java",
+ "ext/java/nokogiri/internals/SaxParserContext.java",
"ext/java/nokogiri/internals/SchemaErrorHandler.java",
"ext/java/nokogiri/internals/XalanDTMManagerPatch.java",
"ext/java/nokogiri/internals/XmlDeclHandler.java",