From 9bb2417f58cc44e8cc220db7143215c0f8b64ebd Mon Sep 17 00:00:00 2001 From: zyw61483 <zyw61483@163.com> Date: Wed, 11 Dec 2024 16:36:20 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9SmartContentSelector=20thresh?= =?UTF-8?q?old=E5=8F=AF=E5=AE=9A=E5=88=B6=E5=8C=96=20(#1183)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 修改SmartContentSelector threshold可定制化 * 修改SmartContentSelector threshold可定制化 --------- Co-authored-by: zhaoyiwei <zhaoyiwei@zhongan.com> --- .../main/java/us/codecraft/webmagic/selector/HtmlNode.java | 5 +++++ .../java/us/codecraft/webmagic/selector/Selectors.java | 4 ++++ .../codecraft/webmagic/selector/SmartContentSelector.java | 7 ++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa6..74ea718e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -31,6 +31,11 @@ public class HtmlNode extends AbstractSelectable { return select(smartContentSelector, getSourceTexts()); } + public Selectable smartContent(int threshold) { + SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); + return select(smartContentSelector, getSourceTexts()); + } + @Override public Selectable links() { return selectElements(new LinksSelector()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 7cd68c1d..3600896e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -20,6 +20,10 @@ public abstract class Selectors { return new SmartContentSelector(); } + public static SmartContentSelector smartContent(int threshold) { + return new SmartContentSelector(threshold); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e2699..c8816510 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -16,9 +16,15 @@ import java.util.List; @Experimental public class SmartContentSelector implements Selector { + private int threshold = 86; + public SmartContentSelector() { } + public SmartContentSelector(int threshold) { + this.threshold = threshold; + } + @Override public String select(String html) { html = html.replaceAll("(?is)<!DOCTYPE.*?>", ""); @@ -29,7 +35,6 @@ public class SmartContentSelector implements Selector { html = html.replaceAll("(?is)<.*?>", ""); List<String> lines; int blocksWidth =3; - int threshold =86; int start; int end; StringBuilder text = new StringBuilder();