<!--more-->

```python
import re
import time

import requests

failed_urls = []   # URLs that failed even after retries
image_urls = []    # full-size image URLs collected from detail pages


def record_data():
    # Persist the (de-duplicated) collected and failed URL lists.
    with open('H:/temp/list.txt', 'w') as f:
        ls = set(image_urls)
        print('DONE:', len(ls))
        f.write('\n'.join(ls))
    with open('H:/temp/failed_urls.txt', 'w') as f:
        ls = set(failed_urls)
        print('FAILED:', len(ls))
        f.write('\n'.join(ls))


def hr(url, retry=1, sleeptime=1):
    # GET with up to 3 retries, sleeping a little longer before each attempt.
    try:
        time.sleep(sleeptime)
        print('url:', url)
        return requests.get(url, timeout=3)
    except KeyboardInterrupt:
        raise  # let Ctrl-C propagate so main() can still save progress
    except requests.RequestException:
        print('url:', url, 'failed. retry =', retry)
        if retry > 3:
            print('url:', url, 'failed more than 3 times, giving up')
            failed_urls.append(url)
        else:
            return hr(url, retry=retry + 1, sleeptime=sleeptime + 5)
    return None


num = 0


def download_image(url):
    global num
    num += 1
    suffix = url[url.rindex('.'):]  # keep the original file extension
    print('downloading: [', url, '] -->', num)
    r = hr(url)
    if r is not None:
        with open('H:/temp/' + str(num) + suffix, 'wb') as f:
            f.write(r.content)


def download():
    # Re-read the saved URL list and fetch every image in it.
    with open('H:/temp/list.txt', 'r', encoding='utf-8') as f:
        urls = f.read().split('\n')
    for x in urls:
        if x.strip():
            download_image(x.strip())


def main():
    baseurl = 'http://wallls.com/tag/anime%2Bgirls'
    r = requests.get(baseurl)
    page_num = int(re.findall(r'of (\d+)', r.text)[0])  # total page count from the pager
    print(page_num)
    try:
        for page in range(1, page_num + 1):
            lr = hr(baseurl + '/' + str(page))
            if lr is None:
                continue
            detail_urls = re.findall(r'href="(/wallpaper/\d+)/', lr.text)
            for durl in ['http://wallls.com' + x for x in detail_urls]:
                dr = hr(durl)
                if dr is None:
                    continue
                # Dots are escaped here; the original pattern's bare '.'
                # would have matched any character.
                matches = re.findall(
                    r'http://\w+\.wallls\.com/uploads/original/\w+/\w+/\w+\.\w+\.(?:png|jpg)',
                    dr.text)
                if matches:
                    image_urls.append(matches[0])
    except KeyboardInterrupt:
        pass  # interrupted: fall through, save what we have, and download it
    record_data()
    download()


if __name__ == '__main__':
    main()
```
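
The recursive `hr()` wrapper rolls its own retry loop. If you would rather push that down into the HTTP stack, `requests` can mount a transport adapter configured with `urllib3`'s `Retry` policy. A minimal sketch (the retry count, backoff factor, and status list here are arbitrary choices, not values from the script above):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 3 times with exponential backoff, both on connection errors
# and on common transient HTTP status codes.
retries = Retry(total=3, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])

session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

r = session.get('http://wallls.com/tag/anime%2Bgirls', timeout=3)
```

One trade-off: `Retry` handles the re-requests transparently, but it will not collect a `failed_urls` list for you; to keep that feature you would still wrap `session.get()` in a `try/except requests.RequestException`.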
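
Separately, `download_image()` takes everything after the last dot as the file extension, which misbehaves if a URL ever carries a query string or no dot at all. A standard-library alternative (a sketch; `extension_of` and its `.jpg` fallback are hypothetical, not part of the script):

```python
import os
from urllib.parse import urlsplit

def extension_of(url, default='.jpg'):
    # Parse the URL first so query strings ('?w=1920') and fragments
    # do not end up in the extension.
    suffix = os.path.splitext(urlsplit(url).path)[1]
    return suffix if suffix else default

print(extension_of('http://w.wallls.com/uploads/original/a/b/c.123.jpg?w=1920'))  # -> .jpg
```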