From 194518fd82f31e1a08f8966f26324c2e9381ddc3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 08:21:34 +0800 Subject: [PATCH] add switch --- .../java/us/codecraft/webmagic/Spider.java | 9 + .../webmagic/selector/CacheElement.java | 36 - .../us/codecraft/webmagic/selector/Html.java | 26 +- .../webmagic/utils/EnvironmentUtil.java | 28 + .../webmagic/utils/EnvironmentUtilTest.java | 18 + .../webmagic/model/PageModelExtractor.java | 31 +- .../webmagic/utils/ExtractorUtils.java | 17 +- .../codecraft/model/ProcessorBenchmark.java | 890 ++++++++++++++++++ 8 files changed, 992 insertions(+), 63 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java create mode 100644 webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c5c239fb..723e8058 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; import java.io.Closeable; @@ -368,6 +369,14 @@ public class Spider implements Runnable, Task { return this; } + /** + * switch off xsoup + * @return + */ + public static void xsoupOff(){ + EnvironmentUtil.setUseXsoup(false); + } + @Override public String getUUID() { if (uuid != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java deleted file mode 100644 index a58eba2a..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Element; - -import java.util.List; - -/** - * Cache parsed element for extract. - * - * @author code4crafter@gmail.com - * @since 0.2.2 - */ -public class CacheElement { - - public String text; - - public Element element; - - public String select(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.select(getElement()); - } else { - return selector.select(getText()); - } - } - - public List selectList(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.selectList(getElement()); - } else { - return selector.selectList(getText()); - } - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 74aa976b..17988249 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import us.codecraft.webmagic.utils.EnvironmentUtil; import java.util.ArrayList; import java.util.List; @@ -72,17 +73,22 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XsoupSelector xsoupSelector = new XsoupSelector(xpath); - if (document!=null){ - return new Html(xsoupSelector.selectList(document)); + if (EnvironmentUtil.useXsoup()) { + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document != null) { + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); + } else { + XpathSelector xpathSelector = new XpathSelector(xpath); + return selectList(xpathSelector, strings); } - return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -91,7 +97,7 @@ public class Html extends PlainText { @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -102,15 +108,17 @@ public class Html extends PlainText { } public String getText() { + if (strings!=null&&strings.size()>0){ + return strings.get(0); + } return document.html(); } /** - * * @param selector * @return */ - public String select(Selector selector) { + public String selectDocument(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); @@ -119,7 +127,7 @@ public class Html extends PlainText { } } - public List selectList(Selector selector) { + public List selectDocumentForList(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java new file mode 100644 index 00000000..1d63aecd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.commons.lang3.BooleanUtils; + +import java.util.Properties; + +/** + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public abstract class EnvironmentUtil { + + private static final String USE_XSOUP = "xsoup"; + + public static boolean useXsoup() { + Properties properties = System.getProperties(); + Object o = properties.get(USE_XSOUP); + if (o == null) { + return true; + } + return BooleanUtils.toBoolean(((String) o).toLowerCase()); + } + + public static void setUseXsoup(boolean useXsoup) { + Properties properties = System.getProperties(); + properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false")); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java new file mode 100644 index 00000000..cb620e7a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; + +import static junit.framework.Assert.*; + +/** + * @author code4crafter@gmail.com + */ +public class EnvironmentUtilTest { + + @Test + public void test() { + assertTrue(EnvironmentUtil.useXsoup()); + EnvironmentUtil.setUseXsoup(false); + assertFalse(EnvironmentUtil.useXsoup()); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 88490524..03cd3a3a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; -import org.jsoup.nodes.Element; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; @@ -185,13 +184,13 @@ class PageModelExtractor { return null; } if (objectExtractor == null) { - return processSingle(page, page.getHtml().toString()); + return processSingle(page, null, false); } else { if (objectExtractor.multi) { List os = new ArrayList(); List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { - Object o = processSingle(page, s); + Object o = processSingle(page, s, false); if (o != null) { os.add(o); } @@ -199,19 +198,13 @@ class PageModelExtractor { return os; } else { String select = objectExtractor.getSelector().select(page.getHtml().toString()); - Object o = processSingle(page, select); + Object o = processSingle(page, select, false); return o; } } } - private List select(Selector selector,Element element,String html){ - if (selector instanceof ElementSelector){ - - } - } - - private Object processSingle(Page page, String html) { + private Object processSingle(Page page, String html, boolean isRaw) { Object o = null; try { o = clazz.newInstance(); @@ -220,10 +213,14 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().selectList(html); + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); @@ -239,10 +236,14 @@ class PageModelExtractor { String value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().select(html); + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 10996362..2d9fd51f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -8,6 +8,7 @@ import java.util.List; /** * Tools for annotation converting.
+ * * @author code4crafter@gmail.com
* @since 0.2.1 */ @@ -24,17 +25,27 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); break; default: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); + } + return selector; + } + + private static Selector getXpathSelector(String value) { + Selector selector; + if (EnvironmentUtil.useXsoup()) { + selector = new XsoupSelector(value); + } else { + selector = new XpathSelector(value); } return selector; } public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); - if (extractBies==null){ + if (extractBies == null) { return selectors; } for (ExtractBy extractBy : extractBies) { diff --git a/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java new file mode 100644 index 00000000..c3f2829b --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java @@ -0,0 +1,890 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + @Test + public void test() { + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +}