update pipeline
parent
755b9aa84e
commit
ecb61d1385
@ -0,0 +1,57 @@
|
||||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import freemarker.template.Configuration;
|
||||
import freemarker.template.Template;
|
||||
import freemarker.template.TemplateException;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* User: cairne
|
||||
* Date: 13-6-8
|
||||
* Time: 下午9:00
|
||||
*/
|
||||
public class FreemarkerPipeline implements Pipeline {
|
||||
|
||||
private Configuration configuration;
|
||||
|
||||
private Template template;
|
||||
|
||||
private String path = "/data/temp/webmagic/ftl/";
|
||||
|
||||
public FreemarkerPipeline(String template, String path) throws IOException {
|
||||
configuration = new Configuration();
|
||||
configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
|
||||
this.template = configuration.getTemplate(template);
|
||||
this.path = path;
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdir();
|
||||
}
|
||||
}
|
||||
|
||||
public FreemarkerPipeline(String template) throws IOException {
|
||||
this(template, "/data/temp/webmagic/ftl/");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void process(Page page, Site site) {
|
||||
String domain = site.getDomain();
|
||||
domain = UrlUtils.getDomain(domain);
|
||||
String path = this.path + "" + domain + "/";
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
||||
template.process(page.getFields(), printWriter);
|
||||
printWriter.close();
|
||||
} catch (TemplateException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
<#--
  WordPress import item rendered by FreeMarker.
  Data model variables read here: title, id, date, text.
  Placeholders must use FreeMarker interpolation syntax; Velocity-style references such as
  $it.Title are not interpolated by FreeMarker and would be written to the output verbatim.
-->
<item>
    <#-- ?xml escapes markup characters so a scraped title cannot break the XML. -->
    <title>${title?xml}</title>
    <link>http://127.0.0.1/wordpress/?p=${id}</link>
    <pubDate>${date}</pubDate>
    <dc:creator>admin</dc:creator>
    <guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
    <description></description>
    <#-- NOTE(review): confirm the data model exposes the body under the name 'text'. -->
    <content:encoded><![CDATA[${text}]]></content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <#--<wp:post_id>$it.Id</wp:post_id>-->
    <wp:post_date>${date}</wp:post_date>
    <wp:post_date_gmt>${date}</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>${title?xml}</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>
    <#-- $tags: disabled; it was a Velocity-style placeholder that FreeMarker emitted as literal text. -->
</item>
|
@ -0,0 +1,19 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* User: cairne
|
||||
* Date: 13-6-9
|
||||
* Time: 上午7:14
|
||||
*/
|
||||
public class FreemarkerPipelineTest {
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
}
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
<#--
  WordPress import item rendered by FreeMarker.
  Data model variables read here: title, id, date, text.
  Placeholders must use FreeMarker interpolation syntax; Velocity-style references such as
  $it.Title are not interpolated by FreeMarker and would be written to the output verbatim.
-->
<item>
    <#-- ?xml escapes markup characters so a scraped title cannot break the XML. -->
    <title>${title?xml}</title>
    <link>http://127.0.0.1/wordpress/?p=${id}</link>
    <pubDate>${date}</pubDate>
    <dc:creator>admin</dc:creator>
    <guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
    <description></description>
    <#-- NOTE(review): confirm the data model exposes the body under the name 'text'. -->
    <content:encoded><![CDATA[${text}]]></content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <#--<wp:post_id>$it.Id</wp:post_id>-->
    <wp:post_date>${date}</wp:post_date>
    <wp:post_date_gmt>${date}</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>${title?xml}</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>
    <#-- $tags: disabled; it was a Velocity-style placeholder that FreeMarker emitted as literal text. -->
</item>
|
@ -0,0 +1,26 @@
|
||||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* User: cairne
|
||||
* Date: 13-6-9
|
||||
* Time: 上午8:02
|
||||
*/
|
||||
public class DiaoyuwengProcessorTest {
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
Spider.me().pipeline(new FilePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")).
|
||||
processor(diaoyuwengProcessor).run();
|
||||
}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
<#--
  WordPress import item rendered by FreeMarker.
  Data model variables read here: title, id, date, content.
-->
<item>
    <#-- ?xml escapes markup characters so a scraped title cannot break the XML. -->
    <title>${title?xml}</title>
    <link>http://127.0.0.1/wordpress/?p=${id}</link>
    <pubDate>${date}</pubDate>
    <dc:creator>admin</dc:creator>
    <guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
    <description></description>
    <content:encoded><![CDATA[${content}]]></content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <#--<wp:post_id>$it.Id</wp:post_id>-->
    <wp:post_date>${date}</wp:post_date>
    <wp:post_date_gmt>${date}</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>${title?xml}</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>
</item>
|
Loading…
Reference in New Issue