diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 7a294e18..aa5a4798 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -53,6 +53,12 @@ webmagic-extension ${project.version} + + org.projectlombok + lombok + 1.18.32 + provided + diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java new file mode 100644 index 00000000..873176e6 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java @@ -0,0 +1,55 @@ +package us.codecraft.webmagic.scripts; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import lombok.Getter; +import lombok.Setter; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Language; +import us.codecraft.webmagic.utils.WMCollections; + +/** + * Holds the settings of one script console run: language, script file, URLs, thread count and sleep time. + */ +public class Params { + @Getter + Language language = new Javascript(); + + @Getter @Setter + String scriptFileName; + + @Getter @Setter + List<String> urls; + + @Getter @Setter + int thread = 1; + + @Getter @Setter + int sleepTime = 1000; + + // Accepted spellings of each language name; filled once and shared by all instances. + private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>(); + + static { + alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); + alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); + } + + /** + * Selects the language whose alias set contains {@code arg}; the current language is kept when none matches. + * + * @param arg the language name to look up + */ + public void setLanguagefromArg(String arg) { + for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) { + if (languageSetEntry.getValue().contains(arg)) { + this.language = languageSetEntry.getKey(); + return; + } + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 2ccfe7f4..c60b3ec3 100755 --- 
a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,90 +1,21 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.Logger; -import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Set; /** - * @author code4crafter@gmail.com + * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { - - private static class Params { - Language language = Language.JavaScript; - String scriptFileName; - List urls; - int thread = 1; - int sleepTime = 1000; - private static Map> alias = new HashMap>(); - - static { - alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); - } - - public void setLanguagefromArg(String arg) { - for (Map.Entry> languageSetEntry : alias.entrySet()) { - if (languageSetEntry.getValue().contains(arg)) { - this.language = languageSetEntry.getKey(); - return; - } - } - } - - private Language getLanguage() { - return language; - } - - private void setLanguage(Language language) { - this.language = language; - } - - private String getScriptFileName() { - return scriptFileName; - } - - private void setScriptFileName(String scriptFileName) { - this.scriptFileName = scriptFileName; - } - - private List getUrls() { - return urls; - } - - private void setUrls(List urls) { - this.urls = urls; - } - - private int getThread() { - return thread; - } - - 
private void setThread(int thread) { - this.thread = thread; - } - - private int getSleepTime() { - return sleepTime; - } - - private void setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - } - } - public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); @@ -142,45 +73,15 @@ public class ScriptConsole { private static Params readOptions(CommandLine commandLine) { Params params = new Params(); - if (commandLine.hasOption("l")) { - String language = commandLine.getOptionValue("l"); - params.setLanguagefromArg(language); - } - if (commandLine.hasOption("f")) { - String scriptFilename = commandLine.getOptionValue("f"); - params.setScriptFileName(scriptFilename); - } else { - exit(); - } - if (commandLine.hasOption("s")) { - Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); - params.setSleepTime(sleepTime); - } - if (commandLine.hasOption("t")) { - Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); - params.setThread(thread); - } - if (commandLine.hasOption("g")) { - configLogger(commandLine.getOptionValue("g")); - } + // Let every known option copy its value into params when it is present. + List<CommandLineOption> options = CommandLineOption.getAllOptions(); + for (CommandLineOption option : options) + option.addParamOptionIfInCommandLine(params, commandLine); + // The script file is mandatory, exactly as before the refactoring. + if (params.getScriptFileName() == null) + exit(); + // The remaining positional arguments are the start URLs. params.setUrls(commandLine.getArgList()); return params; } - - private static void configLogger(String value) { - Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); - if ("debug".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.DEBUG); - } else if ("info".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.INFO); - } else if ("warn".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.WARN); - } else if ("trace".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.TRACE); - } else if ("off".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.OFF); - } else if ("error".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.ERROR); - } - } -} +} \ No newline at 
end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index d1e5d7fe..bdfbbaed 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; + +import us.codecraft.webmagic.scripts.languages.Language; + import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class ScriptEnginePool { - private final int size; - private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { - this.size = size; this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { + return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); + } +} + +class OptionL extends CommandLineOption { + public OptionL() { + super('l'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String language = commandLine.getOptionValue("l"); + params.setLanguagefromArg(language); + } +} + +class OptionF extends CommandLineOption { + public OptionF() { + super('f'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String scriptFilename = commandLine.getOptionValue("f"); + params.setScriptFileName(scriptFilename); + } +} + +class OptionS extends CommandLineOption { + public OptionS() { + super('s'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); + params.setSleepTime(sleepTime); + 
} +} + +class OptionT extends CommandLineOption { + public OptionT() { + super('t'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); + params.setThread(thread); + } +} + +class OptionG extends CommandLineOption { + public OptionG() { + super('g'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + ConfigLogger.configLogger(commandLine.getOptionValue("g")); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java new file mode 100644 index 00000000..9e81ea6c --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scripts.config; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + +public class ConfigLogger { + /** + * Log the config parameter. 
The root logger level is changed only when the value equals one of the + * names listed below, ignoring case; any other value is ignored. + * + * @param value the level name: debug, info, warn, trace, off or error + */ + public static void configLogger(String value) { + List<Pair<String, Level>> options = List.of( + Pair.of("debug", Level.DEBUG), + Pair.of("info", Level.INFO), + Pair.of("warn", Level.WARN), + Pair.of("trace", Level.TRACE), + Pair.of("off", Level.OFF), + Pair.of("error", Level.ERROR)); + int i = 0; + // Stop on the first matching name; i == options.size() means no match. + while (i < options.size() && !options.get(i).getLeft().equalsIgnoreCase(value)) + i++; + if (i < options.size()) { + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(options.get(i).getRight()); + } + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java new file mode 100644 index 00000000..b3a3209a --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.jruby.RubyHash; + +import us.codecraft.webmagic.Page; + +public class JRuby extends Language { + public JRuby() { + super("jruby","ruby/defines.rb",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext()); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java 
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java new file mode 100644 index 00000000..b0f7b647 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import us.codecraft.webmagic.Page; + +public class Javascript extends Language { + public Javascript() { + super("javascript","js/defines.js",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java new file mode 100644 index 00000000..9124d2db --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.python.core.PyDictionary; + +import us.codecraft.webmagic.Page; + +public class Jython extends Language { + public Jython() { + super("jython","python/defines.py",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git 
a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java old mode 100755 new mode 100644 similarity index 51% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java index 2f9d22d5..44e6ba0a --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java @@ -1,15 +1,18 @@ -package us.codecraft.webmagic.scripts; +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; +import us.codecraft.webmagic.Page; /** - * @author code4crafter@gmail.com + * @author FrancoisGib */ -public enum Language { - - JavaScript("javascript","js/defines.js",""), - - JRuby("jruby","ruby/defines.rb",""), - - Jython("jython","python/defines.py",""); +public abstract class Language { + public Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } private String engineName; @@ -17,12 +20,6 @@ public enum Language { private String gatherFile; - Language(String engineName, String defineFile, String gatherFile) { - this.engineName = engineName; - this.defineFile = defineFile; - this.gatherFile = gatherFile; - } - public String getEngineName() { return engineName; } @@ -34,4 +31,6 @@ public enum Language { public String getGatherFile() { return gatherFile; } + + public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException; } diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index ffeb9c99..b4c28521 
100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts; import org.junit.Ignore; import org.junit.Test; + import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com @@ -13,14 +17,14 @@ public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testRubyProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @@ -28,7 +32,7 @@ public class ScriptProcessorTest { @Test public void testPythonProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); 
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); }