工具:python3
本文主要用python实现动漫之家的爬取
如果对模块的安装有疑问,请自行百度;如果代码格式有问题,也可以在下方回复。
与Java那篇一样,本文分为两部分:无框架实现和Scrapy框架实现。步骤和实现原理都相同,可以参考前篇。
代码不再详细注释
无框架
# Crawl one comic on manhua.dmzj.com with headless Chrome and download every
# page image into <DOWNLOAD_ROOT>/<chapter title>/<page label>.<ext>.
import os
import time
from urllib.request import build_opener
from urllib.request import install_opener
from urllib.request import urlretrieve

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

COMIC_URL = "https://manhua.dmzj.com/shiling"
CHROMEDRIVER_PATH = r"D:\chromedriver_win32\chromedriver.exe"
DOWNLOAD_ROOT = "D:/manhua"
# Headers sent with every image request (browser-like User-Agent + Referer).
REQUEST_HEADERS = [
    ('User-Agent',
     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
     '(KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36'),
    ("Referer", "https://manhua.dmzj.com/"),
]


def _ensure_dir(path):
    """Create *path* (with parents); report it if it already exists."""
    try:
        os.makedirs(path)
    except FileExistsError:
        print(path+"已创建")


def _collect_pages(dr, comic_url):
    """Return a list of (chapter_title, image_url, page_label) tuples.

    Opens the comic index, follows every chapter link, and reads the page
    selector (<option> elements) of each chapter.
    """
    dr.get(comic_url)
    links = dr.find_elements_by_xpath(
        "//div[@class='cartoon_online_border']/ul/li/a")
    chapter_urls = [link.get_attribute("href") for link in links]
    print(dr.title)

    pages = []
    for chapter_url in chapter_urls:
        print(chapter_url)
        dr.get(chapter_url)
        options = dr.find_elements_by_xpath(
            "//div[@class='btmBtnBox']/select/option")
        if not options:
            # Nothing to download here; also avoids the title lookup below.
            continue
        # The title is the same for every page of a chapter, so read it once
        # instead of running two XPath queries per page.
        title = (
            dr.find_elements_by_xpath(
                "//div[@class='display_middle']/h1/a")[0].text
            + dr.find_elements_by_xpath(
                "//div[@class='display_middle']/span")[0].text
        )
        for option in options:
            pages.append(
                (title, 'https:'+option.get_attribute("value"), option.text))
    return pages


def _download_pages(pages, dest_root):
    """Download every page into dest_root/<title>/, skipping existing files."""
    _ensure_dir(dest_root)

    # The opener never changes, so install it once rather than per image.
    opener = build_opener()
    opener.addheaders = REQUEST_HEADERS
    install_opener(opener)

    total = len(pages)
    for done, (title, img_url, label) in enumerate(pages, start=1):
        # Only the part of the title before the first '-' names the folder.
        # os.path.join supplies the separator that plain concatenation lost.
        chapter_dir = os.path.join(dest_root, title.split('-')[0].strip())
        _ensure_dir(chapter_dir)

        extension = img_url.rsplit('.', 1)[-1]
        filename = os.path.join(chapter_dir, label+"."+extension)

        if os.path.exists(filename):
            print(filename+"已下载")
        else:
            # Fetch to a temporary name first so an interrupted transfer is
            # not mistaken for a finished file on the next run.
            partial = filename+".part"
            urlretrieve(url=img_url, filename=partial)
            os.replace(partial, filename)
            # Pause between real requests only; skipped files cost nothing.
            time.sleep(1)
        print("{}/{}".format(done, total))
    print("下载完成")


if __name__ == "__main__":
    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')
    abspath = os.path.abspath(CHROMEDRIVER_PATH)
    # NOTE(review): executable_path=, chrome_options= and
    # find_elements_by_xpath are the Selenium 3 API; confirm the installed
    # Selenium version still provides them.
    dr = webdriver.Chrome(executable_path=abspath,
                          chrome_options=chrome_options)
    try:
        imglist = _collect_pages(dr, COMIC_URL)
    finally:
        # Always release the browser, even if a page load fails.
        dr.quit()

    _download_pages(imglist, DOWNLOAD_ROOT)
Scrapy框架
下期见
热门工具 换一换