|
|
|
@ -214,7 +214,7 @@ public class Spider implements Runnable, Task {
|
|
|
|
|
return this;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected void checkComponent() {
|
|
|
|
|
protected void initComponent() {
|
|
|
|
|
if (downloader == null) {
|
|
|
|
|
this.downloader = new HttpClientDownloader();
|
|
|
|
|
}
|
|
|
|
@ -222,36 +222,27 @@ public class Spider implements Runnable, Task {
|
|
|
|
|
pipelines.add(new ConsolePipeline());
|
|
|
|
|
}
|
|
|
|
|
downloader.setThread(threadNum);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void run() {
|
|
|
|
|
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
|
|
|
|
|
throw new IllegalStateException("Spider is already running!");
|
|
|
|
|
}
|
|
|
|
|
checkComponent();
|
|
|
|
|
executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
|
|
|
|
if (startUrls != null) {
|
|
|
|
|
for (String startUrl : startUrls) {
|
|
|
|
|
scheduler.push(new Request(startUrl), this);
|
|
|
|
|
}
|
|
|
|
|
startUrls.clear();
|
|
|
|
|
}
|
|
|
|
|
Request request = scheduler.poll(this);
|
|
|
|
|
logger.info("Spider " + getUUID() + " started!");
|
|
|
|
|
// single thread
|
|
|
|
|
if (threadNum <= 1) {
|
|
|
|
|
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
|
|
|
|
processRequest(request);
|
|
|
|
|
request = scheduler.poll(this);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
|
|
|
|
|
}
|
|
|
|
|
// multi thread
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void run() {
|
|
|
|
|
checkRunningStat();
|
|
|
|
|
initComponent();
|
|
|
|
|
logger.info("Spider " + getUUID() + " started!");
|
|
|
|
|
final AtomicInteger threadAlive = new AtomicInteger(0);
|
|
|
|
|
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
|
|
|
|
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
|
|
|
|
|
Request request = scheduler.poll(this);
|
|
|
|
|
if (request == null) {
|
|
|
|
|
if (threadAlive.get() == 0) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
// when no request found but some thread is alive, sleep a
|
|
|
|
|
// while.
|
|
|
|
|
try {
|
|
|
|
@ -264,26 +255,35 @@ public class Spider implements Runnable, Task {
|
|
|
|
|
executorService.execute(new Runnable() {
|
|
|
|
|
@Override
|
|
|
|
|
public void run() {
|
|
|
|
|
try {
|
|
|
|
|
processRequest(requestFinal);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
logger.error("download "+requestFinal+" error",e);
|
|
|
|
|
} finally {
|
|
|
|
|
threadAlive.decrementAndGet();
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
request = scheduler.poll(this);
|
|
|
|
|
if (threadAlive.get() == 0) {
|
|
|
|
|
request = scheduler.poll(this);
|
|
|
|
|
if (request == null) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
executorService.shutdown();
|
|
|
|
|
}
|
|
|
|
|
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
|
|
|
|
|
stat.set(STAT_STOPPED);
|
|
|
|
|
// release some resources
|
|
|
|
|
destroy();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void checkRunningStat() {
|
|
|
|
|
while (true) {
|
|
|
|
|
int statNow = stat.get();
|
|
|
|
|
if (statNow == STAT_RUNNING) {
|
|
|
|
|
throw new IllegalStateException("Spider is already running!");
|
|
|
|
|
}
|
|
|
|
|
if (stat.compareAndSet(statNow, STAT_RUNNING)) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected void destroy() {
|
|
|
|
|
destroyEach(downloader);
|
|
|
|
|
destroyEach(pageProcessor);
|
|
|
|
@ -305,11 +305,10 @@ public class Spider implements Runnable, Task {
|
|
|
|
|
/**
|
|
|
|
|
* Process specific urls without url discovering.
|
|
|
|
|
*
|
|
|
|
|
* @param urls
|
|
|
|
|
* urls to process
|
|
|
|
|
* @param urls urls to process
|
|
|
|
|
*/
|
|
|
|
|
public void test(String... urls) {
|
|
|
|
|
checkComponent();
|
|
|
|
|
initComponent();
|
|
|
|
|
if (urls.length > 0) {
|
|
|
|
|
for (String url : urls) {
|
|
|
|
|
processRequest(new Request(url));
|
|
|
|
@ -356,7 +355,7 @@ public class Spider implements Runnable, Task {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected void checkIfRunning() {
|
|
|
|
|
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) {
|
|
|
|
|
if (stat.get() == STAT_RUNNING) {
|
|
|
|
|
throw new IllegalStateException("Spider is already running!");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|