1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
import urllib.request
import re
import threading

from lxml import etree


def url(url):
    """Fetch *url* and return its HTML body decoded as GBK.

    NOTE(review): the function name shadows its own parameter (and the
    builtin-free name is kept because the rest of the script calls it as
    ``url(...)``).  Sends a desktop IE user-agent so the site serves the
    normal page.
    """
    header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("GBK")  # site pages are GBK-encoded
    return html


def getlink(html):
    """Return the detail-page hrefs of every image on a listing page."""
    page = etree.HTML(html)
    imageurl = page.xpath(u"/html/body/div[@id='main']/div[@class='list']/ul/li/a/@href")
    return imageurl


def nextpageurl(html):
    """Return the number of pagination anchors on the listing page.

    NOTE(review): this counts every <a> inside the pager div, which may
    include 'next'/'last' arrows as well as numbered pages — confirm
    against the site's actual markup before trusting it as a page count.
    """
    page = etree.HTML(html)
    # renamed from `next`, which shadowed the builtin
    page_links = page.xpath(u"/html/body/div[@id='main']/div[@class='page']/a")
    return len(page_links)


def allpageurl(num):
    """Build the listing-page URLs for pages 1..num.

    Page 1 is ``index.htm``; subsequent pages are ``index_<n>.htm``.
    """
    pageurllist = ['http://www.netbian.com/s/wlop/index.htm', ]
    for i in range(2, num + 1):
        pageurllist.append("http://www.netbian.com/s/wlop/index_" + str(i) + ".htm")
    return pageurllist


def lastimagelink(allimagelinks):
    """Fetch each detail page and collect the second <img src> found on it.

    The second match is taken because the first <img> on the page is the
    site banner/logo, not the wallpaper itself.
    """
    alldownlink = []
    # Hoisted out of the loop: the pattern is loop-invariant.
    img_src = re.compile(r'<img src="([^\"]*)"', re.S)
    for allimagelink in allimagelinks:
        html = url(allimagelink)
        imagedownlink = img_src.findall(html)
        # Guard: a page with fewer than two <img> tags previously raised
        # IndexError and killed the whole run; skip it instead.
        if len(imagedownlink) > 1:
            alldownlink.append(imagedownlink[1])
    return alldownlink


def tddown(alldownlinks):
    """Download every link sequentially, numbering the files from 1.

    (A threaded variant existed but was disabled; the dead code and the
    unused accumulator list have been removed.)
    """
    for i, alldownlink in enumerate(alldownlinks, start=1):
        downfile(alldownlink, i)


def downfile(alldownlink, i):
    """Download one image, naming the file after the trailing ``xxx.jpg``
    component of its URL, and print a progress message (Chinese:
    "image #i downloaded successfully")."""
    g = re.compile(r'[0-9]{4}/([A-Za-z0-9]+\.jpg)', re.S)
    filename = g.findall(alldownlink)
    # urlretrieve's (path, headers) return value was never used.
    urllib.request.urlretrieve(alldownlink, filename[0])
    print("第" + str(i) + "张图片下载成功!")
if __name__ == "__main__":
    # Entry point: determine how many listing pages exist, collect every
    # image's detail-page URL (rewritten to its 1920x1080 variant), resolve
    # each to a direct image link, then download them all.
    myurl = "http://www.netbian.com/s/wlop/index.htm"
    html = url(myurl)
    allpagenum = nextpageurl(html)
    pageurllist = allpageurl(allpagenum)

    allimagelink = []
    for i in pageurllist:
        html = url(i)
        imageurl = getlink(html)
        for j in imageurl:
            # BUG FIX: the original pattern r'.htm' left the dot unescaped,
            # so it matched ANY character before "htm" (e.g. it would mangle
            # an href containing "shtml").  Escape the dot and anchor at the
            # end so only the trailing ".htm" suffix is rewritten.
            j = re.sub(r'\.htm$', "-1920x1080.htm", j)
            j = "http://www.netbian.com" + j
            allimagelink.append(j)

    alldownlink = lastimagelink(allimagelink)
    tddown(alldownlink)
|