Python爬虫爬取鬼刀的画作

都是比较简单的内容,我就不建项目了,直接上代码。有大佬问为什么不用多线程:注释里有多线程的代码,取消注释即可。但我每次用多线程下载到一半就会出错,所以还是用 urlretrieve 逐张稳定下载,速度也挺快的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import urllib.request
from lxml import etree
import re
import threading
def url(url):
    """Fetch a page and return its body decoded as GBK text.

    Note that inside this function the parameter ``url`` shadows the
    function's own name.

    Args:
        url: Address of the page to request.

    Returns:
        The response body as a ``str``, decoded with the GBK codec.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    request = urllib.request.Request(url, headers=header)
    # The context manager closes the response even if read()/decode() raises,
    # so long crawls do not accumulate open connections.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("GBK")
def getlink(html):
    """Extract the ``href`` of every list entry on a listing page.

    Args:
        html: Markup of a listing page as text.

    Returns:
        The result of the XPath query: the ``href`` attribute values of the
        anchors under ``div#main > div.list > ul > li``.
    """
    href_query = u"/html/body/div[@id='main']/div[@class='list']/ul/li/a/@href"
    return etree.HTML(html).xpath(href_query)
def nextpageurl(html):
    """Count the anchors inside the pagination bar of a listing page.

    The script's entry point passes this count to ``allpageurl`` as the
    number of listing pages to visit.

    Args:
        html: Markup of a listing page as text.

    Returns:
        Number of ``<a>`` elements directly under ``div#main > div.page``.
    """
    pager_query = u"/html/body/div[@id='main']/div[@class='page']/a"
    pager_anchors = etree.HTML(html).xpath(pager_query)
    return len(pager_anchors)
def allpageurl(num):
    """Build the URLs of listing pages 1..num.

    Page 1 is ``index.htm``; page ``n`` (n >= 2) is ``index_<n>.htm``.

    Args:
        num: Number of listing pages. Values below 2 yield only page 1.

    Returns:
        A list of page URLs in ascending page order.
    """
    first_page = 'http://www.netbian.com/s/wlop/index.htm'
    later_pages = [
        "http://www.netbian.com/s/wlop/index_" + str(page_no) + ".htm"
        for page_no in range(2, num + 1)
    ]
    return [first_page] + later_pages
def lastimagelink(allimagelinks):
    """Resolve each detail page to the URL of the image it displays.

    Every page is fetched with ``url`` and scanned for ``<img src="...">``
    tags; the ``src`` of the second tag is taken as the download link.

    Args:
        allimagelinks: Iterable of detail-page URLs.

    Returns:
        A list of image URLs in input order. Pages with fewer than two
        ``<img src="...">`` tags are reported on stdout and skipped instead
        of aborting the whole run with an ``IndexError``.
    """
    # Compile once rather than once per page.
    img_src = re.compile(r'<img src="([^\"]*)"', re.S)
    alldownlink = []
    for page_url in allimagelinks:
        sources = img_src.findall(url(page_url))
        # The script relies on the second <img> being the wallpaper
        # (presumably the first one is site chrome such as a logo - confirm
        # against the live page).
        if len(sources) < 2:
            print("未找到图片地址,已跳过: " + page_url)
            continue
        alldownlink.append(sources[1])
    return alldownlink
def tddown(alldownlinks):
    """Download every image URL sequentially, numbering them from 1.

    Args:
        alldownlinks: Iterable of direct image URLs.
    """
    for position, link in enumerate(alldownlinks, start=1):
        downfile(link, position)
    # Threaded alternative, left disabled on purpose: the author reports that
    # it fails partway through the download. To try it, replace the loop
    # above with:
    #     workers = []
    #     for position, link in enumerate(alldownlinks, start=1):
    #         t = threading.Thread(target=downfile, args=(link, position))
    #         t.setDaemon(True)
    #         t.start()
    #         workers.append(t)
    #     for t in workers:
    #         t.join()
def downfile(alldownlink, i):
    """Download one image into the current working directory.

    The file name is the ``<name>.jpg`` part that follows a four-digit path
    segment in the URL. When the URL does not have that shape (for example a
    ``.png`` image), the last path segment is used instead, so the download
    no longer fails with an ``IndexError``.

    Args:
        alldownlink: Direct URL of the image.
        i: Ordinal shown in the progress message; also used to build a
            fallback file name when the URL has no usable last segment.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    match = re.search(r'[0-9]{4}/([A-Za-z0-9]+\.jpg)', alldownlink, re.S)
    if match:
        filename = match.group(1)
    else:
        # Drop fragment and query, then keep only the last path segment so
        # the name can never contain a directory separator.
        path_part = alldownlink.split("#", 1)[0].split("?", 1)[0]
        filename = path_part.rstrip("/").rsplit("/", 1)[-1]
        if filename in ("", ".", ".."):
            filename = "image_" + str(i) + ".jpg"
    urllib.request.urlretrieve(alldownlink, filename)
    print("第"+str(i)+"张图片下载成功!")

if __name__ == "__main__":
    # Crawl the "wlop" listing on netbian.com and download every wallpaper
    # in its 1920x1080 variant into the current working directory.
    myurl = "http://www.netbian.com/s/wlop/index.htm"
    # The first listing page is used to work out how many pages exist.
    html = url(myurl)
    allpagenum = nextpageurl(html)
    pageurllist = allpageurl(allpagenum)
    allimagelink = []
    for i in pageurllist:
        html = url(i)
        imageurl = getlink(html)
        for j in imageurl:
            # Turn "<page>.htm" into "<page>-1920x1080.htm". The dot is
            # escaped and the match anchored to the end so that only the
            # real extension is rewritten; the previous pattern r'.htm'
            # matched any character followed by "htm", anywhere in the href.
            j = re.sub(r'\.htm$', "-1920x1080.htm", j)
            # hrefs are expected to be site-relative, hence the host prefix.
            j = "http://www.netbian.com" + j
            allimagelink.append(j)
    alldownlink = lastimagelink(allimagelink)
    tddown(alldownlink)
感谢您的阅读,本文由 ZhangAo`s Blog 版权所有。如若转载,请注明出处:ZhangAo`s Blog(https://www.imzhangao.com/2018/09/02/Python爬虫爬取鬼刀的画作/)
Python爬虫爬取微软官方Windows_10_ISO镜像文件
Python破解百度云限速