diff --git a/webmagic-scripts/README.md b/webmagic-scripts/README.md old mode 100644 new mode 100755 diff --git a/webmagic-scripts/deploy.sh b/webmagic-scripts/deploy.sh old mode 100644 new mode 100755 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100644 new mode 100755 index 5c211609..41c79ea0 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,6 +16,10 @@ jruby 1.7.6 + org.python + jython + 2.5.3 + commons-cli commons-cli diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java old mode 100644 new mode 100755 index c7ddcda9..2f9d22d5 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java @@ -7,7 +7,9 @@ public enum Language { JavaScript("javascript","js/defines.js",""), - JRuby("jruby","ruby/defines.rb",""); + JRuby("jruby","ruby/defines.rb",""), + + Jython("jython","python/defines.py",""); private String engineName; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java old mode 100644 new mode 100755 index 58018511..0214e8a9 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.io.IOUtils; +import org.jruby.RubyHash; +import org.python.core.PyDictionary; +import sun.org.mozilla.javascript.internal.NativeObject; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @@ -10,6 +13,8 @@ import javax.script.ScriptEngine; import javax.script.ScriptException; import java.io.IOException; import java.io.InputStream; +import java.util.Iterator; +import java.util.Map; /** * @author code4crafter@gmail.com @@ -50,20 +55,34 @@ public class ScriptProcessor implements PageProcessor { context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE); try { - engine.eval(defines + "\n" + script, context); -// switch (language) { -// case JavaScript: -// NativeObject o = (NativeObject) engine.get("result"); -// if (o != null) { -// for (Map.Entry objectObjectEntry : o.entrySet()) { -// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); -// } -// } -// break; -// case JRuby: -// Object o1 = engine.get("result"); -// break; -// } + switch (language) { + case JavaScript: + engine.eval(defines + "\n" + script, context); + NativeObject o = (NativeObject) engine.get("result"); + if (o != null) { + for (Map.Entry objectObjectEntry : o.entrySet()) { + page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue()); + } + } + break; + case JRuby: + RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry)itruby.next(); + page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); + } + break; + case Jython: + engine.eval(defines + "\n" + script, context); + PyDictionary oJython=(PyDictionary)engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry)it.next(); + page.getResultItems().put(pairs.getKey().toString(),pairs.getValue()); + } + break; + } } catch (ScriptException e) { e.printStackTrace(); } @@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor { } } + @Override public Site getSite() { return site; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/defines.js b/webmagic-scripts/src/main/resources/js/defines.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/github.js b/webmagic-scripts/src/main/resources/js/github.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/python/defines.py b/webmagic-scripts/src/main/resources/python/defines.py new file mode 100755 index 00000000..913a4b4d --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/defines.py @@ -0,0 +1,13 @@ +def xpath(str): + return page.getHtml().xpath(str).toString() + +def css(str): + return page.getHtml().css(str).toString() + +def urls(str): + links=page.getHtml().links().regex(str).all() + page.addTargetRequests(links); + +def tomap(key,value): + return "hello world" + diff --git a/webmagic-scripts/src/main/resources/python/oschina.py b/webmagic-scripts/src/main/resources/python/oschina.py new file mode 100755 index 00000000..51a188b5 --- /dev/null +++ b/webmagic-scripts/src/main/resources/python/oschina.py @@ -0,0 +1,4 @@ +title=xpath("div[@class=BlogTitle]") +urls="http://my\\.oschina\\.net/flashsword/blog/\\d+" + +result={"title":title,"urls":urls} diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/github.rb b/webmagic-scripts/src/main/resources/ruby/github.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb old mode 100644 new mode 100755 diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java old mode 100644 new mode 100755 index ec3f6742..23fe0935 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -22,4 +22,12 @@ public class ScriptProcessorTest { pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } + + + @Test + public void testPythonProcessor() { + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + pageProcessor.getSite().setSleepTime(0); + Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); + } } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml old mode 100644 new mode 100755