本文主要用 Java + Selenium 打开漫画页面 <https://manhua.dmzj.com/>,解析出每一页的图片地址并下载到本地。

如果遇到 Selenium 配置问题,请先回顾《从头学习爬虫(十)进阶篇----selenium》:
<https://blog.csdn.net/qq_36783371/article/details/79817923>

未使用框架
import java.io.BufferedInputStream; import java.io.BufferedOutputStream;
import java.io.File; import java.io.FileOutputStream; import
java.io.IOException; import java.io.InputStream; import
java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList;
import java.util.List; import java.util.concurrent.Callable; import
java.util.concurrent.CompletionService; import
java.util.concurrent.ExecutorCompletionService; import
java.util.concurrent.ExecutorService; import java.util.concurrent.Executors;
import org.apache.http.HttpEntity; import
org.apache.http.client.config.RequestConfig; import
org.apache.http.client.methods.CloseableHttpResponse; import
org.apache.http.client.methods.HttpGet; import
org.apache.http.impl.client.CloseableHttpClient; import
org.apache.http.impl.client.HttpClients; import
org.apache.http.util.EntityUtils; import org.openqa.selenium.By; import
org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import
org.openqa.selenium.chrome.ChromeDriver; import
org.openqa.selenium.chrome.ChromeOptions; import
us.codecraft.webmagic.selector.Html; import
us.codecraft.webmagic.selector.Selectable; public class GaReiZeroSpiderX{
public static void main(String[] args) { //主页 String
url="https://manhua.dmzj.com/shiling"; //线程数 int threadsize=10; //延迟 long
sleeptime=5000; //获取列表页 List<String> itemList=getListPage(url); //获取图片地址
List<String> imgList=getListImg(itemList); //多线程下载
DownLoadImg(imgList,threadsize,sleeptime); } private static List<String>
getListImg(List<String> itemList) { List<String> listImg=new ArrayList<>();
if(itemList==null) { return null; } //配置驱动
System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe");
ChromeOptions options = new ChromeOptions(); //配置浏览器位置
options.setBinary("C:\\Program Files
(x86)\\Google\\Chrome\\Application\\chrome.exe"); //无头模式 59版本以上才可以
options.addArguments("test-type"); //ignore certificate errors
options.addArguments("headless");// headless mode
options.addArguments("disable-gpu"); //没啥用 本来可以用于页面显示模式设置
options.addArguments("Cookie:display_mode=1"); WebDriver driver = new
ChromeDriver(options); for (String url : itemList) {
url="https://manhua.dmzj.com"+url; driver.get(url); WebElement webElement =
driver.findElement(By.xpath("/html")); String content =
webElement.getAttribute("outerHTML"); Html html=new Html(content); String
title=html.xpath("//title/text()").toString().split("-")[0]; List<Selectable>
s=html.xpath("//div[@class='btmBtnBox']/select/option").nodes(); for
(Selectable selectable : s) {                             //每一话的标题 每一页 图片地址
                             listImg.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value"));
} } //关闭窗口 driver.close(); //关闭进程 driver.quit(); return listImg; } private
static List<String> getListPage(String url) { CloseableHttpResponse response =
null; try{ CloseableHttpClient httpClient = HttpClients.createDefault();
RequestConfig requestConfig =
RequestConfig.custom().setConnectTimeout(1000).setConnectionRequestTimeout(1000).setSocketTimeout(1000).setRedirectsEnabled(true).build();
HttpGet httpGet = new HttpGet(url); httpGet.setConfig(requestConfig);
httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
response =httpClient.execute(httpGet); if
(response.getStatusLine().getStatusCode() != 200) { System.out.println("request
url failed, http code=" + response.getStatusLine().getStatusCode()); return
null; }else{ HttpEntity entity1 = response.getEntity(); String resultStr =
EntityUtils.toString(entity1, "utf-8"); Html html=new Html(resultStr); /*
List<String> list=new ArrayList<>();
list.add(html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").toString());*/
return html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
} } catch (Exception e) { return null; } finally { if (response != null){ try {
response.close(); } catch (IOException e) { e.printStackTrace(); } } } }
private static void DownLoadImg(List<String> imgList, int threadsize, long
sleeptime) { int count=0; int size=imgList.size(); ExecutorService
fixedThreadPool = Executors.newFixedThreadPool(threadsize);
CompletionService<String> cs = new
ExecutorCompletionService<String>(fixedThreadPool); for (String url : imgList)
{ final String url1 = url; cs.submit(new Callable<String>() { public String
call() throws Exception { try { Thread.sleep(sleeptime); return down(url1); }
catch (InterruptedException e) { System.out.println("线程异常"); return
"error_"+"url1"; } } }); } for (String url : imgList) { try { String a =
cs.take().get(); if(a!=null) { count++; } } catch (Exception e) {
e.printStackTrace(); }finally { if(count==size) { System.out.println("over");
}else { System.out.println(count+"/"+size); } } } fixedThreadPool.shutdown(); }
protected static String down(String url) { try { url=url.replace(" ", ""); File
dest1 = new File("D:/manhua"); if (!dest1.exists() && !dest1.isDirectory()) {
dest1.mkdir(); } File dest2 = new File("D:/manhua/" + url.split("___")[0]); if
(!dest2.exists() && !dest2.isDirectory()) { dest2.mkdir(); } File dest = new
File("D:/manhua/" + url.split("___")[0] + "/" + url.split("___")[1] + "." +
url.split("___")[2].split("\\.")[url.split("___")[2].split("\\.").length- 1]);
if (!dest.exists()) { dest.createNewFile(); } //接收字节输入流 InputStream is; //字节输出流
FileOutputStream fos = new FileOutputStream(dest); URL temp; String
imgurl=url.split("___")[2]; temp = new URL(imgurl.trim()); HttpURLConnection
uc=(HttpURLConnection) temp.openConnection();
uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64;
rv:59.0) Gecko/20100101 Firefox/59.0"); //必须加refer 防封 这个比较烂 写成百度地址也可以
uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
is=uc.getInputStream(); //为字节输入流加缓冲 BufferedInputStream bis = new
BufferedInputStream(is); //为字节输出流加缓冲 BufferedOutputStream bos = new
BufferedOutputStream(fos); int length; byte[] bytes = new byte[1024 * 20];
while ((length = bis.read(bytes, 0, bytes.length)) != -1) { fos.write(bytes, 0,
length); } bos.close(); fos.close(); bis.close(); is.close(); return
"success_"+"url1"; } catch (Exception e) { e.printStackTrace(); return
"error_"+"url1"; } } }
webmagic框架

spider
import java.util.ArrayList; import java.util.List; import
us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import
us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import
us.codecraft.webmagic.processor.PageProcessor; import
us.codecraft.webmagic.selector.Selectable; public class GaReiZeroSpider
implements PageProcessor{     static List<String> imgurl=new ArrayList<>();   
 private Site site =Site.me();     @Override     public Site getSite() {       
 return site ;     }     @Override     public void process(Page page) {       
 if(page.getUrl().toString().equals("https://manhua.dmzj.com/shiling")) {   
         List<String>
pageUrl=page.getHtml().xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
            for (String string : pageUrl) {                 Request request=new
Request("https://manhua.dmzj.com"+string);               
 request.addHeader("Cookie", "display_mode=1");               
 page.addTargetRequest(request);             }         }else {           
 String title=page.getHtml().xpath("//title/text()").toString().split("-")[0];
            List<Selectable>
s=page.getHtml().xpath("//div[@class='btmBtnBox']/select/option").nodes();   
         for (Selectable selectable : s) {               
 imgurl.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value"));
            }             page.putField("imgurl", imgurl);         }         
    }     public static void main(String[] args) {         Spider.create(new
GaReiZeroSpider()).downloader(new GaReiZeroDownloader()).addPipeline(new
GaReiZeroPipline()).addUrl("https://manhua.dmzj.com/shiling").start();        
 }           }
downloader
import java.io.Closeable; import java.io.IOException; import
org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import
org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions; import
us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import
us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html; import
us.codecraft.webmagic.selector.PlainText; public class GaReiZeroDownloader
implements Downloader, Closeable{ @Override public void close() throws
IOException { } @Override public Page download(Request request, Task task) {
System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe");
ChromeOptions options = new ChromeOptions(); options.setBinary("C:\\Program
Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
options.addArguments("test-type"); //ignore certificate errors
options.addArguments("headless");// headless mode
options.addArguments("disable-gpu");
options.addArguments("Cookie:display_mode=1"); WebDriver driver = new
ChromeDriver(options); driver.get(request.getUrl()); WebElement webElement =
driver.findElement(By.xpath("/html")); String content =
webElement.getAttribute("outerHTML"); Page page = new Page();
page.setRawText(content); page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl())); page.setRequest(request);
driver.close(); driver.quit(); return page; } @Override public void
setThread(int threadNum) { } }
pipeline
public class GaReiZeroPipline implements Pipeline{ @Override public void
process(ResultItems resultItems, Task task) { try {
if(null!=resultItems.get("imgurl")) { List<String>
imgurl=resultItems.get("imgurl"); if(!imgurl.isEmpty()) {
DownLoadImg(imgurl,5,500); } } } catch (Exception e) { } } private void
DownLoadImg(List<String> imgList, int threadsize, long sleeptime) { int
count=0; int size=imgList.size(); ExecutorService fixedThreadPool =
Executors.newFixedThreadPool(threadsize); CompletionService<String> cs = new
ExecutorCompletionService<String>(fixedThreadPool); for (String url : imgList)
{ final String url1 = url; cs.submit(new Callable<String>() { public String
call() throws Exception { try { Thread.sleep(sleeptime); return down(url1); }
catch (InterruptedException e) { System.out.println("线程异常"); return
"error_"+"url1"; } } }); } for (String url : imgList) { try { String a =
cs.take().get(); if(a!=null) { count++; } } catch (Exception e) {
e.printStackTrace(); }finally { if(count==size) { System.out.println("over");
}else { System.out.println(count+"/"+size); } } } fixedThreadPool.shutdown(); }
protected String down(String url) { try { url=url.replace(" ", ""); File dest1
= new File("D:/manhua"); if (!dest1.exists() && !dest1.isDirectory()) {
dest1.mkdir(); } File dest2 = new File("D:/manhua/" + url.split("___")[0]); if
(!dest2.exists() && !dest2.isDirectory()) { dest2.mkdir(); } File dest = new
File("D:/manhua/" + url.split("___")[0] + "/" + url.split("___")[1] + "." +
url.split("___")[2].split("\\.")[url.split("___")[2].split("\\.").length- 1]);
if (!dest.exists()) { dest.createNewFile(); } //接收字节输入流 InputStream is; //字节输出流
FileOutputStream fos = new FileOutputStream(dest); URL temp; String
imgurl=url.split("___")[2]; temp = new URL(imgurl.trim()); HttpURLConnection
uc=(HttpURLConnection) temp.openConnection();
uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64;
rv:59.0) Gecko/20100101 Firefox/59.0"); //必须加refer 防封 这个比较烂 写成百度地址也可以
uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
is=uc.getInputStream(); //为字节输入流加缓冲 BufferedInputStream bis = new
BufferedInputStream(is); //为字节输出流加缓冲 BufferedOutputStream bos = new
BufferedOutputStream(fos); int length; byte[] bytes = new byte[1024 * 20];
while ((length = bis.read(bytes, 0, bytes.length)) != -1) { fos.write(bytes, 0,
length); } bos.close(); fos.close(); bis.close(); is.close(); return
"success_"+"url1"; } catch (Exception e) { e.printStackTrace(); return
"error_"+"url1"; } } }
注意:上面的 downloader 每次下载都会新建一个 WebDriver,没有复用,建议自行改造。

欢迎加群313557283(刚创建),小白互相学习~

 

友情链接
KaDraw流程图
API参考文档
OK工具箱
云服务器优惠
阿里云优惠券
腾讯云优惠券
华为云优惠券
站点信息
问题反馈
邮箱:[email protected]
QQ群:637538335
关注微信