Add Baidu image search

Add Baidu image search
pull/944/head
apaqi 5 years ago
parent c4c48d3522
commit e4ff496615

@@ -193,6 +193,11 @@
<artifactId>jedis</artifactId>
<version>2.9.3</version>
</dependency>
<dependency>
<groupId>net.jcip</groupId>
<artifactId>jcip-annotations</artifactId>
<version>1.0</version>
</dependency>
</dependencies>
</dependencyManagement>

@@ -80,7 +80,10 @@
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
<dependency>
<groupId>net.jcip</groupId>
<artifactId>jcip-annotations</artifactId>
</dependency>
</dependencies>
</project>

@@ -1,5 +1,6 @@
package us.codecraft.webmagic.pipeline;
import net.jcip.annotations.ThreadSafe;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -0,0 +1,193 @@
package us.codecraft.webmagic.samples;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.Exchanger;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
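/**
 * Sample PageProcessor that queries Baidu image search (the acjson endpoint)
 * for a fixed set of food keywords and downloads the returned thumbnails into
 * local directories named after each keyword group.
 */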
public class BaiduPictureDownloadProcesser implements PageProcessor {
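// Thread pool intended for parallel downloads; not used by the current implementation.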
ExecutorService executorService = Executors.newFixedThreadPool(10);
private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
.addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
.setCharset("UTF-8");
private final static Map<String, String> KEY_WORDS = new HashMap<>();
static {
// dairy products
KEY_WORDS.put("牛奶", "牛奶");
KEY_WORDS.put("奶酪", "奶酪");
KEY_WORDS.put("酸奶", "酸奶");
// mammal meat
KEY_WORDS.put("羊肉", "肉");
KEY_WORDS.put("牛肉", "肉");
KEY_WORDS.put("狗肉", "肉");
KEY_WORDS.put("驴肉", "肉");
KEY_WORDS.put("猪肉", "肉");
// poultry
KEY_WORDS.put("鸡", "家禽肉");
KEY_WORDS.put("鸭", "家禽肉");
KEY_WORDS.put("鹅", "家禽肉");
// eggs
KEY_WORDS.put("鸡蛋", "蛋");
KEY_WORDS.put("鸭蛋", "蛋");
KEY_WORDS.put("鸽子蛋", "蛋");
// vegetables
KEY_WORDS.put("冬瓜", "冬瓜");
KEY_WORDS.put("西红柿", "西红柿");
KEY_WORDS.put("苦瓜", "苦瓜");
KEY_WORDS.put("青椒", "青椒");
KEY_WORDS.put("胡萝卜", "胡萝卜");
KEY_WORDS.put("南瓜", "南瓜");
KEY_WORDS.put("玉米", "玉米");
KEY_WORDS.put("秋葵", "秋葵");
KEY_WORDS.put("西兰花", "西兰花");
KEY_WORDS.put("生姜", "生姜");
// fruits
KEY_WORDS.put("苹果", "苹果");
KEY_WORDS.put("梨", "梨");
KEY_WORDS.put("香蕉", "香蕉");
KEY_WORDS.put("葡萄", "葡萄");
KEY_WORDS.put("榴莲", "榴莲");
KEY_WORDS.put("猕猴桃", "猕猴桃");
KEY_WORDS.put("哈密瓜", "哈密瓜");
KEY_WORDS.put("草莓", "草莓");
KEY_WORDS.put("橘子", "橘子");
KEY_WORDS.put("菠萝", "菠萝");
KEY_WORDS.put("山楂", "山楂");
KEY_WORDS.put("桂圆", "桂圆");
// seafood
KEY_WORDS.put("虾", "虾");
KEY_WORDS.put("蟹", "蟹");
KEY_WORDS.put("鱼", "鱼");
KEY_WORDS.put("贝类", "贝类");
KEY_WORDS.put("螺类", "螺类");
KEY_WORDS.put("海参类", "海参类");
// soy products
KEY_WORDS.put("豆皮", "豆皮");
KEY_WORDS.put("豆腐脑", "豆腐脑");
KEY_WORDS.put("豆干", "豆干");
KEY_WORDS.put("豆腐", "豆腐");
// nuts
KEY_WORDS.put("腰果", "腰果");
KEY_WORDS.put("开心果", "开心果");
KEY_WORDS.put("核桃", "核桃");
KEY_WORDS.put("葡萄干", "葡萄干");
KEY_WORDS.put("夏威夷果", "夏威夷果");
}
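// Parse the JSON response from the Baidu acjson endpoint: each element of the
// "data" array carries a "thumbURL" (thumbnail link) and a "fromPageTitleEnc"
// (source page title). The collected URLs and names are handed back to main()
// through the static fields declared at the bottom of this class.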
@Override
public void process(Page page) {
List<String> url_list = new ArrayList<>();
List<String> name_list = new ArrayList<>();
JSONObject jsonObject = (JSONObject) JSONObject.parse(page.getRawText());
JSONArray data = (JSONArray) jsonObject.get("data");
for (int i = 0; i < data.size(); i++) {
String url = (String) data.getJSONObject(i).get("thumbURL");
String name = (String) data.getJSONObject(i).get("fromPageTitleEnc");
if (url != null) {
url_list.add(url);
name_list.add(name);
}
}
setUrls(url_list);
setNames(name_list);
}
@Override
public Site getSite() {
return this.site;
}
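// Download every URL in urlList into d:\pic\<keyName>\<index>.jpg.
// The "key" parameter is accepted but not used by the current implementation.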
private void downloadPicture(List<String> urlList, String key, String keyName) {
URL url = null;
for (int i = 0; i < urlList.size(); i++) {
try {
url = new URL(urlList.get(i));
DataInputStream dataInputStream = new DataInputStream(url.openStream());
String imageName = i + ".jpg";
createDir("d:\\pic\\" + keyName);
File file = new File("d:\\pic\\" + keyName); // set the download path
if (!file.isDirectory()) {
file.mkdirs();
}
FileOutputStream fileOutputStream = new FileOutputStream(new File("d:\\pic\\" + keyName + "\\" + imageName.trim()));
byte[] buffer = new byte[1024];
int length;
while ((length = dataInputStream.read(buffer)) > 0) {
fileOutputStream.write(buffer, 0, length);
}
dataInputStream.close();
fileOutputStream.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
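// For each keyword, crawl two result pages synchronously with a fresh Spider,
// collect the thumbnail URLs that process() stored in the static "urls" field,
// then download them into the keyword group's directory.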
public static void main(String[] args) {
BaiduPictureDownloadProcesser downloadPicture = new BaiduPictureDownloadProcesser();
for (Map.Entry<String, String> entry : KEY_WORDS.entrySet()) {
List<String> urlList = new CopyOnWriteArrayList<String>();
for (int i = 0; i < 2; i++) { // controls how many result pages are crawled; 10 images per page
String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&queryWord=" + entry.getKey() + "&word=" + entry.getKey() + "&pn=" + i * 10 + "0";
Spider.create(new BaiduPictureDownloadProcesser())
.addUrl(url)
.run();
urlList.addAll(urls);
}
downloadPicture.downloadPicture(urlList, entry.getKey(), entry.getValue());
}
}
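// Static fields used as a crude handoff between process() (run inside the Spider)
// and main(); not safe if several spiders run concurrently.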
static List<String> urls;
static List<String> names;
public void setUrls(List<String> urls) {
this.urls = urls;
}
public void setNames(List<String> names) {
this.names = names;
}
/**
 * Create the given directory (and any missing parent directories) if it does not already exist.
 *
 * @param dir path of the directory to create
 */
private void createDir(String dir) throws IOException {
Path path = Paths.get(dir);
if (!Files.exists(path)) {
Files.createDirectories(path);
}
}
}