Add maven dependency
<dependency>
<groupId>io.loli.nekocat</groupId>
<artifactId>nekocat-core</artifactId>
<version>0.0.5</version>
</dependency>
NekoCatSpider.builder()
.name("spiderName")
.startUrl("http://www.example.com/")
.url(NekoCatProperties.builder()
// deal with the start-url
.regex("http://www.example.com/")
.pipline((resp)->{
resp.asDocument()
.select("css-select")
.forEach(a ->
// url that should be downloaded
resp.getContext().next(a.attr("href"))
);
})
.build())
.url(NekoCatProperties.builder().regex("http://www.example.com/.+")
.pipline(resp -> {
// select all images
resp.asDocument().select("img")
.forEach(img->{
resp.getContext().next(img.attr("src"));
});
})
.build())
.build()
.start();
Nekocat provides two simple logging interceptors: LoggingInterceptor
and ErrorLoggingInterceptor.
ErrorLoggingInterceptor
logs only exceptions, whereas LoggingInterceptor
logs everything.
NekoCatProperties.builder()
...
.log()
NekoCatProperties.builder()
...
.logError()
NekoCatProperties.builder()
.regex(".*\\.jpg")
...
.downloadPoolSize(1)
.downloadMaxQueueSize(1024)
.piplinePoolSize(1)
.piplineMaxQueueSize(1024)
NekoCatSpider.builder()
.name("spiderName")
...
.stopAfterNoRequestEmmitMillis(3600 * 1000L)
NekoCatSpider.builder()
.name("spiderName")
.startUrl("http://www.example.com/")
.url(NekoCatProperties.builder().regex("http://www.example.com/")
.pipline(resp -> {
// select all images
resp.asDocument().select("img")
.forEach(img->{
CompletableFuture<Object> result = resp.getContext().next(img.attr("src")).getPiplineResult();
// get the file returned by the next pipline
File imgFile = (File)result.get();
});
})
.build())
.url(NekoCatProperties.builder().regex(".*\\.jpg")
.pipline(resp -> {
// read the downloaded image as raw bytes
byte[] bytes = resp.asBytes();
// write img to filesystem and return this file
writeBytesToFile(bytes);
return yourFile;
})
.build())
.build()
NekoCatSpider.builder()
.name("spiderName")
.startUrl("http://www.example.com/")
.url(NekoCatProperties.builder().regex("http://www.example.com/")
.pipline(resp -> {
// select all images
resp.asDocument().select("img")
.forEach(img->{
resp.getContext().addNextAttribute("storeFolder", "/tmp");
resp.getContext().next(img.attr("src"));
});
})
.build())
.url(NekoCatProperties.builder().regex(".*\\.jpg")
.pipline(resp -> {
String storeFolder = resp.getContext().getAttribute("storeFolder");
// read the downloaded image as raw bytes
byte[] bytes = resp.asBytes();
// write img to filesystem and return this file
writeBytesToFile(storeFolder, bytes);
return null;
})
.build())
.build()
// form
// parameter values must be URL-encoded
request.setMethod("POST");
request.setRequestBody("param1=value1&param2=value2");
...
// json
request.setMethod("POST");
request.addHeader("content-type", "application/json");
request.setRequestBody(your_json_str);
request.addHeader(yourAdditionalHeader);
// spider will download the startUrl every 10 mins
NekoCatSpider.builder()
.name("spiderName")
.startUrl("http://www.example.com")
...
.loopInterval(1000 * 60 * 10)
...
// interval of each download
NekoCatProperties.builder()
.regex(".*\\.jpg")
.interval(1000)
...
NekoCatProperties.builder()
...
.interceptor(new FilterDownloadedUrlInterceptor(1024))
...
NekoCatProperties.builder()
...
.downloadRetry(1)
...
.piplineRetry(1)
...
- json export
- redis queue/db queue
- Thread Pool Factory