add scripts

pull/88/head^2
yihua.huang 11 years ago
parent c2e04ea5a0
commit df8ca8ad09

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.5 KiB

@ -48,6 +48,7 @@
<modules>
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-scripts</module>
</modules>
<dependencyManagement>

@ -45,6 +45,16 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException();
}
/**
 * css-named entry point for selector extraction on this object.
 * The call is handed unchanged to {@code $(selector)} and its result is returned as-is.
 *
 * @param selector css selector expression
 * @return whatever {@code $(selector)} returns
 */
@Override
public Selectable css(String selector) {
    return this.$(selector);
}
/**
 * css-named entry point for selector extraction with an attribute name.
 * Both arguments are handed unchanged to {@code $(selector, attrName)}.
 *
 * @param selector css selector expression
 * @param attrName attribute name passed along with the selector
 * @return whatever {@code $(selector, attrName)} returns
 */
@Override
public Selectable css(String selector, String attrName) {
    return this.$(selector, attrName);
}
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException();

@ -35,6 +35,23 @@ public interface Selectable {
*/
public Selectable $(String selector, String attrName);
/**
 * Select a list with a css selector.
 * <p>
 * NOTE(review): this looks like a readable-name counterpart of the {@code $} selector
 * method; confirm that implementations keep the two consistent.
 *
 * @param selector css selector expression
 * @return new Selectable after extract
 */
public Selectable css(String selector);
/**
 * Select a list with a css selector, additionally passing an attribute name.
 * <p>
 * NOTE(review): presumably the result holds the value of {@code attrName} for each
 * matched element rather than the element itself; confirm against the implementations.
 *
 * @param selector css selector expression
 * @param attrName attribute name of css selector
 * @return new Selectable after extract
 */
public Selectable css(String selector, String attrName);
/**
* select smart content with ReadAbility algorithm
*

@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the webmagic-scripts module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Inherits build settings from the webmagic-parent POM. -->
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<!-- groupId and version repeat the values already declared in the parent section above. -->
<groupId>us.codecraft</groupId>
<artifactId>webmagic-scripts</artifactId>
<version>0.4.1-SNAPSHOT</version>
<dependencies>
<!-- JRuby, with its version pinned directly in this module. -->
<dependency>
<groupId>org.jruby</groupId>
<artifactId>jruby</artifactId>
<version>1.7.6</version>
</dependency>
<!-- Test-scoped JUnit. No version is declared here; presumably it is managed by the parent POM (TODO confirm). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<!-- webmagic-core at the same version as this module. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</project>

@ -0,0 +1,61 @@
package us.codecraft.webmagic.processor;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link PageProcessor} that handles every page by evaluating Ruby source with the
 * {@code javax.script} engine registered under the name {@code "jruby"}.
 * <p>
 * Two classpath resources are read once, at construction time: {@code ruby/defines.rb}
 * (helper definitions) and {@code ruby/oschina.rb} (the page script). For each page the
 * {@link Page} is bound to the engine-scope attribute {@code page} and the helpers
 * followed by the page script are evaluated.
 *
 * @author code4crafter@gmail.com
 */
public class ScriptProcessor implements PageProcessor{

    /** Classpath location of the Ruby helper definitions. */
    private static final String DEFINES_RESOURCE = "ruby/defines.rb";

    /** Classpath location of the Ruby script run for every page. */
    private static final String SCRIPT_RESOURCE = "ruby/oschina.rb";

    /** Encoding used to decode the Ruby sources, instead of the platform default. */
    private static final String SCRIPT_ENCODING = "UTF-8";

    private final ScriptEngine rubyEngine;

    private final String defines;

    private final String script;

    /**
     * Looks up the JRuby engine and loads both Ruby sources.
     *
     * @throws IllegalStateException if no engine named "jruby" is available, or if
     *                               either Ruby resource is missing or unreadable
     */
    ScriptProcessor(){
        ScriptEngineManager manager = new ScriptEngineManager();
        rubyEngine = manager.getEngineByName("jruby");
        if (rubyEngine == null) {
            // getEngineByName signals "not found" with null; fail here with a clear
            // message rather than with a NullPointerException on the first page.
            throw new IllegalStateException("No script engine named \"jruby\" is available");
        }
        defines = readResource(DEFINES_RESOURCE);
        script = readResource(SCRIPT_RESOURCE);
    }

    /**
     * Binds {@code page} to the engine-scope attribute {@code page} and evaluates the
     * helper definitions followed by the page script.
     * <p>
     * Script errors are reported on stderr and do not propagate to the caller.
     *
     * @param page the page handed to the script
     */
    @Override
    public void process(Page page) {
        // NOTE(review): the engine's single default context is reused by every call; if
        // process() can be invoked concurrently the "page" attribute may be overwritten
        // while another evaluation is running - confirm how the Spider calls processors.
        ScriptContext context = rubyEngine.getContext();
        context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
        try {
            // The explicit newline keeps the last line of the helpers from fusing with
            // the first line of the script when the helper file has no trailing newline.
            rubyEngine.eval(defines + "\n" + script, context);
        } catch (ScriptException e) {
            // Kept best-effort as before: report the script error and return normally.
            e.printStackTrace();
        }
    }

    @Override
    public Site getSite() {
        return Site.me();
    }

    /**
     * Reads a classpath resource fully into a string and closes the stream.
     *
     * @param name resource path, resolved through this class's class loader
     * @return the decoded resource content
     * @throws IllegalStateException if the resource does not exist or cannot be read
     */
    private String readResource(String name) {
        InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(name);
        if (resourceAsStream == null) {
            throw new IllegalStateException("Classpath resource not found: " + name);
        }
        try {
            return IOUtils.toString(resourceAsStream, SCRIPT_ENCODING);
        } catch (IOException e) {
            throw new IllegalStateException("Failed to read classpath resource: " + name, e);
        } finally {
            IOUtils.closeQuietly(resourceAsStream);
        }
    }

    public static void main(String[] args) {
        Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run();
    }
}

@ -0,0 +1,11 @@
# Runs the XPath expression +str+ against the HTML of the current $page
# and returns the selection's toString value.
def xpath(str)
  selection = $page.getHtml.xpath(str)
  selection.toString
end
# Runs the CSS selector +str+ against the HTML of the current $page
# and returns the selection's toString value.
def css(str)
  selection = $page.getHtml.css(str)
  selection.toString
end
# Collects every link of the current $page that matches the regex +str+
# and passes the resulting list to $page.addTargetRequests.
def urls(str)
  matching_links = $page.getHtml.links.regex(str).all
  $page.addTargetRequests(matching_links)
end

@ -0,0 +1,5 @@
# Page script for an oschina blog page: select the title and the body with
# css selectors, hand the blog-post URL pattern to `urls`, then print both.
title = css("div.BlogTitle h1")
content = css("div.BlogContent")
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")
puts(title)
puts(content)

@ -0,0 +1,25 @@
package us.codecraft.webmagic.jruby;
import org.junit.Test;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
/**
 * Smoke test: obtains the script engine registered as "jruby" through
 * {@code javax.script} and evaluates a small Ruby snippet with it.
 *
 * @author code4crafter@gmail.com
 */
public class TestJRubyCall {

    @Test
    public void test() throws ScriptException {
        ScriptEngine jruby = new ScriptEngineManager().getEngineByName("jruby");
        ScriptContext engineContext = jruby.getContext();
        // Bind a sample attribute in the engine scope before evaluating.
        engineContext.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE);
        String snippet = "b=1; puts b";
        jruby.eval(snippet, engineContext);
    }
}
Loading…
Cancel
Save