def txt_to_dic(name):
    """Load a UTF-8 text file into a list with one entry per line.

    Args:
        name: Path of the text file to read.

    Returns:
        A list holding the file's lines in order, each with its newline
        removed. Blank lines are kept as empty strings.
    """
    with open(name, 'r', encoding='utf-8') as file:
        dic = [line.strip('\n') for line in file]
    print('%s Txt loads fine'%(name))
    print("url has %s "%(len(dic)))
    return dic
说明:输入 name 为文件名;输出 dic 为由文件中各行 url 组成的列表。
解析链接urllib.parse
2.1 获取资源格式类型
1 2 3 4 5 6 7
import re


def get_suffix(name):
    """Return the file extension of ``name``, falling back to ``'.jpg'``.

    The extension is the text from the last dot to the end of the string,
    dot included.

    Args:
        name: A file name or URL.

    Returns:
        The extension when one exists, is at most 5 characters long and
        contains no ``/``; otherwise ``'.jpg'``.
    """
    m = re.search(r'\.[^\.]*$', name)
    if m is None:
        # No dot anywhere in the name, so there is no extension to extract.
        return '.jpg'
    suffix = m.group(0)
    # A '/' means the last dot belongs to an earlier URL component (for
    # example the host name), not to the file name itself.
    if len(suffix) > 5 or '/' in suffix:
        return '.jpg'
    return suffix
import random

# Pool of browser User-Agent strings to pick from. Kept at module level so
# the collection is built once instead of on every call.
_USER_AGENTS = (
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    # Duplicate of an earlier entry, kept so the selection odds stay the same.
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
)


def get_user_agent():
    """Return one User-Agent string chosen at random from the built-in pool."""
    return random.choice(_USER_AGENTS)
3.2 Cookie和timeout
1 2 3
# Build a cookie jar holding one cookie scoped to a specific domain and path.
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
# Send the jar with the request. `url` must be defined by the surrounding
# code. An explicit timeout is passed because requests never times out on
# its own.
r = requests.get(url, cookies=jar, timeout=10)
请求返回的 Cookie 保存在 RequestsCookieJar 对象中,它的行为和字典类似,但接口更为完整,适合跨域名、跨路径使用。你还可以把 Cookie Jar 传给 Requests 的请求方法。
timeout 仅对连接过程有效,与响应体的下载无关。timeout 并不是整个下载响应的时间限制,而是如果服务器在 timeout 秒内没有应答,将会引发一个异常(更精确地说,是在 timeout 秒内没有从基础套接字上接收到任何字节的数据时)。如果没有显式指定 timeout,requests 不会超时(If no timeout is specified explicitly, requests do not time out)。
def save_image(self, url, rsp_data, word):
    """Write downloaded image bytes into the ``./<word>`` directory.

    The file is named after the number of entries already in the directory
    plus one, with the extension that ``get_suffix`` derives from ``url``.

    Args:
        url: Source URL of the image; only used to pick the file extension.
        rsp_data: Raw bytes to write to disk.
        word: Name of the target directory under the current directory.
    """
    directory = "./" + word
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    # Number the new file after the entries already present in the directory.
    self.__counter = len(os.listdir(directory)) + 1
    # Pause between saves for the configured interval.
    time.sleep(self.time_sleep)
    name = directory + "/" + str(self.__counter) + get_suffix(url)
    with open(name, 'wb') as file:
        file.write(rsp_data)
    print("图+1,已有" + str(self.__counter) + "张图片")
    return
5.2 原始套接字响应内容
1 2 3 4 5
# stream=True must be set so the body is fetched lazily, chunk by chunk.
# `filename` and `chunk_size` must be defined by the surrounding code.
# The with-block closes the response once the download is finished.
with requests.get('https://api.github.com/events', stream=True) as r:
    with open(filename, 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)
def refresh(self, count=1, status=None):
    """Advance the progress counter and reprint the progress line.

    Args:
        count: Amount added to ``self.count``.
        status: Optional status text. A falsy value keeps the current
            status, or switches to ``self.fin_status`` once the total is
            reached.
    """
    self.count += count
    self.status = status or self.status
    # "\r" returns the cursor to the line start so the next call overwrites
    # the same line.
    end_str = "\r"
    if self.count >= self.total:
        # Finished: end the line for good and show the final status.
        end_str = '\n'
        self.status = status or self.fin_status
    print(self.__get_info(), end=end_str)
def download_progress():
    """Download a fixed image URL to ``./file.mp3`` with a progress bar.

    The response is streamed in 1024-byte chunks; after each chunk is
    written the progress bar is advanced by the chunk's length. The elapsed
    time is printed at the end.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``content-length`` header.
    """
    start = time.time()
    with closing(requests.get("https://i.imgur.com/YjVeqM9h.jpg", stream=True)) as response:
        # Fail early so an HTTP error page is not saved as the file.
        response.raise_for_status()
        chunk_size = 1024
        content_size = int(response.headers['content-length'])
        progress = ProgressBar("downloading", total=content_size, unit="KB", chunk_size=chunk_size, run_status="正在下载", fin_status="下载完成")
        with open('./file.mp3', "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                progress.refresh(count=len(data))
    end = time.time()
    print("time costs {:.3f} secs".format(end-start))