update the script

pull/93/head
friddle 11 years ago
parent c1e7207869
commit 37666a7151

@ -16,6 +16,10 @@
<artifactId>jruby</artifactId>
<version>1.7.6</version>
</dependency>
<dependency><groupId>org.python</groupId>
<artifactId>jython</artifactId>
<version>2.5.3</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>

@ -7,7 +7,9 @@ public enum Language {
JavaScript("javascript","js/defines.js",""),
JRuby("jruby","ruby/defines.rb","");
JRuby("jruby","ruby/defines.rb",""),
Jython("jython","python/defines.py","");
private String engineName;

@ -1,6 +1,9 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash;
import org.python.core.PyDictionary;
import sun.org.mozilla.javascript.internal.NativeObject;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
@ -10,6 +13,8 @@ import javax.script.ScriptEngine;
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;
/**
* @author code4crafter@gmail.com
@ -50,20 +55,34 @@ public class ScriptProcessor implements PageProcessor {
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try {
engine.eval(defines + "\n" + script, context);
// switch (language) {
// case JavaScript:
// NativeObject o = (NativeObject) engine.get("result");
// if (o != null) {
// for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
// }
// }
// break;
// case JRuby:
// Object o1 = engine.get("result");
// break;
// }
switch (language) {
case JavaScript:
engine.eval(defines + "\n" + script, context);
NativeObject o = (NativeObject) engine.get("result");
if (o != null) {
for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
}
}
break;
case JRuby:
RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context);
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry)itruby.next();
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
}
break;
case Jython:
engine.eval(defines + "\n" + script, context);
PyDictionary oJython=(PyDictionary)engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry)it.next();
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
}
break;
}
} catch (ScriptException e) {
e.printStackTrace();
}
@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor {
}
}
@Override
public Site getSite() {
return site;

@ -0,0 +1,13 @@
def xpath(str):
    """Apply the expression `str` via xpath() on the current page's HTML.

    `page` is the global that the hosting script engine binds before
    evaluating this file. Returns the toString() of the selection.
    """
    selection = page.getHtml().xpath(str)
    return selection.toString()
def css(str):
    """Apply the selector `str` via css() on the current page's HTML.

    `page` is the global that the hosting script engine binds before
    evaluating this file. Returns the toString() of the selection.
    """
    selection = page.getHtml().css(str)
    return selection.toString()
def urls(str):
    """Hand every page link matching the regex `str` to page.addTargetRequests.

    Reads the links of the current page's HTML, filters them with the
    regex, and passes the full list on. Returns nothing.
    """
    html = page.getHtml()
    matching = html.links().regex(str).all()
    page.addTargetRequests(matching)
def tomap(key, value):
    """Return the fixed string "hello world"; both arguments are ignored.

    NOTE(review): this looks like a placeholder -- confirm the intended
    behaviour before relying on it.
    """
    placeholder = "hello world"
    return placeholder

@ -0,0 +1,4 @@
# Extract the blog title from the current page using the xpath() helper.
# NOTE(review): the attribute value BlogTitle is unquoted in the expression --
# confirm the selector engine accepts this form.
title=xpath("div[@class=BlogTitle]")
# NOTE(review): this assigns a plain regex string to the name `urls` instead of
# calling a urls(...) helper, so no links are queued here -- confirm intent.
urls="http://my\\.oschina\\.net/flashsword/blog/\\d+"
# `result` is the dictionary the host reads back after evaluation; each
# key/value pair is copied into the page's result items.
result={"title":title,"urls":urls}

@ -22,4 +22,12 @@ public class ScriptProcessorTest {
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
/**
 * Builds a Jython-backed ScriptProcessor from the classpath script
 * "python/oschina.py" and runs a spider against the oschina blog URL
 * with the sleep time set to zero and URL spawning switched off.
 */
@Test
public void testPythonProcessor() {
    ScriptProcessor processor = ScriptProcessorBuilder.custom()
            .language(Language.Jython)
            .scriptFromClassPathFile("python/oschina.py")
            .build();
    processor.getSite().setSleepTime(0);
    Spider.create(processor)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .setSpawnUrl(false)
            .run();
}
}

Loading…
Cancel
Save