From 9bb2417f58cc44e8cc220db7143215c0f8b64ebd Mon Sep 17 00:00:00 2001
From: zyw61483 <zyw61483@163.com>
Date: Wed, 11 Dec 2024 16:36:20 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9SmartContentSelector=20thresh?=
 =?UTF-8?q?old=E5=8F=AF=E5=AE=9A=E5=88=B6=E5=8C=96=20(#1183)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 修改SmartContentSelector threshold可定制化

* 修改SmartContentSelector threshold可定制化

---------

Co-authored-by: zhaoyiwei <zhaoyiwei@zhongan.com>
---
 .../main/java/us/codecraft/webmagic/selector/HtmlNode.java | 5 +++++
 .../java/us/codecraft/webmagic/selector/Selectors.java     | 4 ++++
 .../codecraft/webmagic/selector/SmartContentSelector.java  | 7 ++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 85ff5fa6..74ea718e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -31,6 +31,11 @@ public class HtmlNode extends AbstractSelectable {
         return select(smartContentSelector, getSourceTexts());
     }
 
+    /** Applies a {@link SmartContentSelector} created with {@code threshold} to {@code getSourceTexts()}. */
+    public Selectable smartContent(int threshold) {
+        return select(Selectors.smartContent(threshold), getSourceTexts());
+    }
+
     @Override
     public Selectable links() {
         return selectElements(new LinksSelector());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
index 7cd68c1d..3600896e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
@@ -20,6 +20,10 @@ public abstract class Selectors {
         return new SmartContentSelector();
     }
 
+    public static SmartContentSelector smartContent(int threshold) { // overload taking a caller-supplied threshold
+        return new SmartContentSelector(threshold);
+    }
+
     public static CssSelector $(String expr) {
         return new CssSelector(expr);
     }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
index ff8e2699..c8816510 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
@@ -16,9 +16,15 @@ import java.util.List;
 @Experimental
 public class SmartContentSelector implements Selector {
 
+    private int threshold = 86; // value kept when the no-arg constructor is used
+
     public SmartContentSelector() {
     }
 
+    public SmartContentSelector(int threshold) { // replaces the field's initial value of 86
+        this.threshold = threshold;
+    }
+
     @Override
     public String select(String html) {
         html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
@@ -29,7 +35,6 @@ public class SmartContentSelector implements Selector {
         html = html.replaceAll("(?is)<.*?>", "");
         List<String> lines;
         int blocksWidth =3;
-        int threshold =86;
         int start;
         int end;
         StringBuilder text = new StringBuilder();