From bc5c30de17646c16c0507b1a007abb1b9f80d9f2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 12 Nov 2013 08:20:59 +0800 Subject: [PATCH] update scripts --- pom.xml | 1 + webmagic-core/src/main/resources/log4j.xml | 10 --- .../javascript/JsScriptProcessor.java | 82 +++++++++++++++++++ .../RubyScriptProcessor.java | 5 +- .../codecraft/webmagic/scripts/Language.java | 35 ++++++++ .../ScriptProcessor.java} | 41 +++++----- .../scripts/ScriptProcessorBuilder.java | 64 +++++++++++++++ .../webmagic/scripts/ScriptProcessorTest.java | 25 ++++++ webmagic-scripts/src/test/resouces/log4j.xml | 21 +++++ 9 files changed, 253 insertions(+), 31 deletions(-) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/javascript/JsScriptProcessor.java rename webmagic-scripts/src/main/java/us/codecraft/webmagic/{processor => jruby}/RubyScriptProcessor.java (91%) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename webmagic-scripts/src/main/java/us/codecraft/webmagic/{processor/JsScriptProcessor.java => scripts/ScriptProcessor.java} (50%) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java create mode 100644 webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java create mode 100644 webmagic-scripts/src/test/resouces/log4j.xml diff --git a/pom.xml b/pom.xml index 2aa3df7a..bc3d03a3 100644 --- a/pom.xml +++ b/pom.xml @@ -48,6 +48,7 @@ webmagic-core webmagic-extension/ + webmagic-scripts/ diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml index 9084694e..c2b5a2f5 100644 --- a/webmagic-core/src/main/resources/log4j.xml +++ b/webmagic-core/src/main/resources/log4j.xml @@ -8,21 +8,11 @@ - - - - - - - - - - diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/javascript/JsScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/javascript/JsScriptProcessor.java new file mode 100644 index 00000000..3c5982ac --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/javascript/JsScriptProcessor.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.javascript; + +import org.apache.commons.io.IOUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scripts.ScriptProcessor; +import us.codecraft.webmagic.scripts.ScriptProcessorBuilder; + +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * @author code4crafter@gmail.com + * @since 0.4.1 + */ +public class JsScriptProcessor implements PageProcessor { + + private ScriptEngine engine; + + private String defines; + + private String script; + + JsScriptProcessor(String script) throws IOException { + ScriptEngineManager manager = new ScriptEngineManager(); + engine = manager.getEngineByName("javascript"); + InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js"); + defines = IOUtils.toString(resourceAsStream); + this.script = script; + } + + public static JsScriptProcessor fromFile(String fileName) { + try { + InputStream resourceAsStream = new FileInputStream(fileName); + String script = IOUtils.toString(resourceAsStream); + return new JsScriptProcessor(script); + } catch (IOException e) { + //wrap IOException because I prefer a runtime exception... + throw new IllegalArgumentException(e); + } + } + + public static JsScriptProcessor fromClassPathFile(String fileName) { + try { + InputStream resourceAsStream = JsScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); + String script = IOUtils.toString(resourceAsStream); + return new JsScriptProcessor(script); + } catch (IOException e) { + //wrap IOException because I prefer a runtime exception... + throw new IllegalArgumentException(e); + } + } + + @Override + public void process(Page page) { + ScriptContext context = engine.getContext(); + context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); + try { + engine.eval(defines + script, context); + } catch (ScriptException e) { + e.printStackTrace(); + } + + } + + @Override + public Site getSite() { + return Site.me(); + } + + public static void main(String[] args) { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().scriptFromClassPathFile("js/oschina.js").build(); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/jruby/RubyScriptProcessor.java similarity index 91% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/jruby/RubyScriptProcessor.java index cf6801c3..409374a9 100644 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/jruby/RubyScriptProcessor.java @@ -1,9 +1,10 @@ -package us.codecraft.webmagic.processor; +package us.codecraft.webmagic.jruby; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import javax.script.ScriptContext; import javax.script.ScriptEngine; @@ -15,7 +16,7 @@ import java.io.InputStream; /** * @author code4crafter@gmail.com */ -public class RubyScriptProcessor implements PageProcessor{ +public class RubyScriptProcessor implements PageProcessor { private ScriptEngine rubyEngine; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java new file mode 100644 index 00000000..c7ddcda9 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.scripts; + +/** + * @author code4crafter@gmail.com + */ +public enum Language { + + JavaScript("javascript","js/defines.js",""), + + JRuby("jruby","ruby/defines.rb",""); + + private String engineName; + + private String defineFile; + + private String gatherFile; + + Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } + + public String getEngineName() { + return engineName; + } + + public String getDefineFile() { + return defineFile; + } + + public String getGatherFile() { + return gatherFile; + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java similarity index 50% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 51ec04e2..c1ec74a4 100644 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,9 +1,9 @@ -package us.codecraft.webmagic.processor; +package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import javax.script.ScriptContext; import javax.script.ScriptEngine; @@ -14,48 +14,51 @@ import java.io.InputStream; /** * @author code4crafter@gmail.com + * @since 0.4.1 */ -public class JsScriptProcessor implements PageProcessor{ +public class ScriptProcessor implements PageProcessor { - private ScriptEngine rubyEngine; + private ScriptEngine engine; private String defines; private String script; - public JsScriptProcessor(String filename){ + private final Language language; + + private Site site = Site.me(); + + public ScriptProcessor(Language language, String script) { + if (language == null || script == null) { + throw new IllegalArgumentException("language and script must not be null!"); + } + this.language = language; ScriptEngineManager manager = new ScriptEngineManager(); - rubyEngine = manager.getEngineByName("javascript"); - InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js"); + engine = manager.getEngineByName(language.getEngineName()); + InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); try { defines = IOUtils.toString(resourceAsStream); - resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename); - script = IOUtils.toString(resourceAsStream); } catch (IOException e) { - e.printStackTrace(); + throw new IllegalArgumentException(e); } - - + this.script = script; } @Override public void process(Page page) { - ScriptContext context = rubyEngine.getContext(); + ScriptContext context = engine.getContext(); context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); + context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - rubyEngine.eval(defines+script, context); + engine.eval(defines + script, context); } catch (ScriptException e) { e.printStackTrace(); } - } @Override public Site getSite() { - return Site.me(); + return site; } - public static void main(String[] args) { - Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run(); - } } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java new file mode 100644 index 00000000..29587299 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic.scripts; + +import org.apache.commons.io.IOUtils; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * @author code4crafter@gmail.com + * @since 0.4.1 + */ +public class ScriptProcessorBuilder { + + private static final Language DefaultLanguage = Language.JavaScript; + + private Language language = DefaultLanguage; + + private String script; + + private ScriptProcessorBuilder() { + } + + public static ScriptProcessorBuilder custom() { + return new ScriptProcessorBuilder(); + } + + public ScriptProcessorBuilder language(Language language) { + this.language = language; + return this; + } + + public ScriptProcessorBuilder scriptFromFile(String fileName) { + try { + InputStream resourceAsStream = new FileInputStream(fileName); + this.script = IOUtils.toString(resourceAsStream); + } catch (IOException e) { + //wrap IOException because I prefer a runtime exception... + throw new IllegalArgumentException(e); + } + return this; + } + + public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { + try { + InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); + this.script = IOUtils.toString(resourceAsStream); + } catch (IOException e) { + //wrap IOException because I prefer a runtime exception... + throw new IllegalArgumentException(e); + } + return this; + } + + public ScriptProcessorBuilder script(String script) { + this.script = script; + return this; + } + + public ScriptProcessor build(){ + return new ScriptProcessor(language,script); + } + +} diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java new file mode 100644 index 00000000..ec3f6742 --- /dev/null +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.scripts; + +import org.junit.Test; +import us.codecraft.webmagic.Spider; + +/** + * @author code4crafter@gmail.com + * @since 0.4.1 + */ +public class ScriptProcessorTest { + + @Test + public void testJavaScriptProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } + + @Test + public void testRubyProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } +} diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml new file mode 100644 index 00000000..1f64d8da --- /dev/null +++ b/webmagic-scripts/src/test/resouces/log4j.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + +