|
|
|
@ -3,7 +3,6 @@ package us.codecraft.webmagic.selector;
|
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
|
import org.jsoup.nodes.Element;
|
|
|
|
|
import org.jsoup.nodes.Entities;
|
|
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
|
|
|
@ -20,25 +19,12 @@ public class Html extends HtmlNode {
|
|
|
|
|
|
|
|
|
|
// Per-instance logger, named after the runtime class returned by getClass().
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
|
|
|
|
|
|
|
|
// Set to true after disableJsoupHtmlEntityEscape() has cleared jsoup's entity maps,
// so that later calls skip the clearing.
private static volatile boolean INITED = false;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Disables jsoup HTML entity escaping. Set it before constructing the Html instances it should affect; once applied, the change is not reverted.
|
|
|
|
|
* @deprecated
|
|
|
|
|
*/
|
|
|
|
|
public static boolean DISABLE_HTML_ENTITY_ESCAPE = false;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2.
|
|
|
|
|
*/
|
|
|
|
|
private void disableJsoupHtmlEntityEscape() {
|
|
|
|
|
if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) {
|
|
|
|
|
Entities.EscapeMode.base.getMap().clear();
|
|
|
|
|
Entities.EscapeMode.extended.getMap().clear();
|
|
|
|
|
Entities.EscapeMode.xhtml.getMap().clear();
|
|
|
|
|
INITED = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Stores the parsed document for better performance when only one text exists.
|
|
|
|
|
*/
|
|
|
|
@ -46,7 +32,6 @@ public class Html extends HtmlNode {
|
|
|
|
|
|
|
|
|
|
public Html(String text, String url) {
|
|
|
|
|
try {
|
|
|
|
|
disableJsoupHtmlEntityEscape();
|
|
|
|
|
this.document = Jsoup.parse(text, url);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
this.document = null;
|
|
|
|
@ -56,7 +41,6 @@ public class Html extends HtmlNode {
|
|
|
|
|
|
|
|
|
|
public Html(String text) {
|
|
|
|
|
try {
|
|
|
|
|
disableJsoupHtmlEntityEscape();
|
|
|
|
|
this.document = Jsoup.parse(text);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
this.document = null;
|
|
|
|
|