def get_user_agent():
    """Return one User-Agent header value picked at random.

    The pool is a fixed collection of desktop browser identification
    strings, so successive requests can present different browsers.
    The plain ``MSIE 7.0; Windows NT 5.1`` entry is listed twice, which
    makes it twice as likely to be chosen as any other entry.
    """
    pool = (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    )
    return random.choice(pool)
def save_to_txt(file_name, dic):
    """Append every ``key:value`` pair of *dic* to a text file, one per line.

    Args:
        file_name: Path of the file to append to; it is created if missing.
        dic: Mapping whose keys and values are both strings.

    Raises:
        TypeError: If a key or value is not a string.
        OSError: If the file cannot be opened for appending.
    """
    # Append-only mode is enough (nothing is read back), and the ``with``
    # block already closes the file, so no explicit close() is needed.
    with open(file_name, 'a', encoding='utf-8') as file:
        for key, value in dic.items():
            file.write(key + ":" + value + '\n')
    print("fined writing\n\r")


def find_hotel_url(soup):
    """Collect hotel names and shop URLs from a parsed listing page.

    Every ``<li class="hotel-block">`` contributes one entry: the text of
    its first ``<a>`` is the key and the shop URL built from its
    ``data-poi`` attribute is the value.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        dict mapping hotel name to ``https://www.dianping.com/shop/<data-poi>``.
        Blocks without an ``<a>`` or without ``data-poi`` are skipped.
    """
    link_prefix = 'https://www.dianping.com/shop/'
    dic = {}
    for block in soup.find_all('li', class_='hotel-block'):
        anchor = block.a
        shop_id = block.get('data-poi')
        if anchor is None or not shop_id:
            # Malformed listing entry: skip it instead of aborting the page.
            continue
        dic[anchor.get_text()] = link_prefix + shop_id
    return dic
def scrapy_link_main(fp, tp):
    """Scrape Chengdu hotel listing pages and append the hotel links to disk.

    For every page number from *fp* through *tp* (inclusive) the listing
    page is downloaded with a randomly chosen User-Agent, the hotel
    name -> URL pairs are extracted with ``find_hotel_url`` and appended
    to ``.\\hotel\\hotel_url.txt`` with ``save_to_txt``.

    Args:
        fp: First page number to fetch.
        tp: Last page number to fetch (inclusive).

    Returns:
        None. The links are written to the text file, not returned.
    """
    href_prefix = "https://www.dianping.com/chengdu/hotel/"
    # Raw string: the path contains backslashes that must stay literal
    # ("\h" is not a valid escape sequence).
    filename = r'.\hotel\hotel_url.txt'
    for i in range(fp, tp + 1):
        # Page 0 is requested with a bare "p" suffix, without a number.
        href_suffix = "p" if i == 0 else "p" + str(i)
        href = href_prefix + href_suffix
        headers = {'User-Agent': get_user_agent()}
        Html = requests.get(href, headers=headers)
        soup = BeautifulSoup(Html.content, 'html.parser')
        print(soup.ROOT_TAG_NAME)
        dic = find_hotel_url(soup)
        print(dic)
        # Saved page by page; save_to_txt appends, so nothing is overwritten.
        save_to_txt(filename, dic)
def txt_to_dic(filename=r'.\hotel\hotel_url.txt'):
    """Load ``name:url`` lines from a text file into a dict.

    Each line is split at its first ``:``; the part before it becomes the
    key and everything after it becomes the value. The value keeps the
    line's trailing newline, so callers must strip it themselves.

    Args:
        filename: Path of the file to read. Defaults to the file written
            by the link scraper.

    Returns:
        dict mapping the text before the first ``:`` to the rest of the
        line. Lines that contain no ``:`` (for example blank lines) are
        skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    dic = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            name, separator, link = line.partition(':')
            if not separator:
                # Blank or malformed line: there is no value to store.
                continue
            dic[name] = link
    print('Txt loads fine')
    print("dic has %s "%(len(dic)))
    return dic
def get_soup(name, url):
    """Download one hotel page and return it parsed, or None on failure.

    The request is sent with a random User-Agent and a fixed Cookie
    header. A connection error or any status code other than 200 is
    reported on stdout and yields ``None`` after a short random pause.
    If the page title is "验证中心", the function blocks until the
    operator types ``goon`` on stdin.

    Args:
        name: Hotel name, used only in the printed messages.
        url: Address of the hotel page to download.

    Returns:
        The parsed page, or ``None`` if the request failed or did not
        answer with status 200.
    """
    hotel_link = url
    headers = {
        'User-Agent': get_user_agent(),
        # NOTE(review): hard-coded browser session cookie; it looks
        # account/session specific and will likely expire - confirm whether
        # it should come from configuration instead.
        'Cookie': (
            '__mta=150787721.1541145960944.1541166756059.1541208492067.12; '
            'cy=8; cye=chengdu; '
            '_lxsdk_cuid=166d3570de5c8-07985b70746ce-b79183d-144000-166d3570de69d; '
            '_lxsdk=166d3570de5c8-07985b70746ce-b79183d-144000-166d3570de69d; '
            '_hc.v=c746c1d8-9fb6-4e00-2712-20e2880ac118.1541143993; '
            's_ViewType=10; '
            'cityInfo=%7B%22cityId%22%3A8%2C%22cityEnName%22%3A%22chengdu%22%2C%22cityName%22%3A%22%E6%88%90%E9%83%BD%22%7D; '
            '__utma=1.1900410642.1541164771.1541164771.1541164771.1; '
            '__utmz=1.1541164771.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); '
            '_lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic; '
            '_lxsdk_s=166d72f44b8-8f2-afd-052%7C%7C2'
        ),
    }

    # Only the network call sits in the try block, so unrelated errors
    # are not reported as connection failures.
    try:
        Html = requests.get(hotel_link, headers=headers)
    except Exception as ex:
        time.sleep(random.randint(1, 5))
        print('Connect Error---- %s -----%s'%(name,url))
        print(ex)
        return None

    status_code = Html.status_code
    if status_code != 200:
        print("Name:%s Url: %s ----Response error! %s"%(name,url,status_code))
        Html.close()
        time.sleep(random.randint(1, 3))
        return None

    print("Hotel %s link %s : Response Code is : %s \n ! "%(name,url,Html))
    # Explicit parser, matching the listing scraper; without it the result
    # depends on which parser libraries happen to be installed.
    soup = BeautifulSoup(Html.content, 'html.parser')
    Html.close()

    title_tag = soup.find('title')
    tit = title_tag.get_text() if title_tag is not None else ''
    if tit == "验证中心":
        # The site answered with its verification page; wait for the
        # operator to clear it by hand.
        # NOTE(review): the verification page itself is still what gets
        # returned below - the hotel page is not fetched again.
        print('---Sorry Need to get out of Verification Manually----')
        while True:
            print(' Enter goon to Continue ')
            g = input()
            if g == 'goon':
                break

    time.sleep(random.randint(1, 3))
    return soup
def get_information(soup):
    """Extract the phone number and address from a parsed hotel page.

    The phone number is the text of the first ``<div class="info-value">``
    and the address is the text of the first
    ``<span class="hotel-address">``. When either text cannot be read, a
    message is printed and the string ``"None"`` is used in its place.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        A two-item list ``[phone_num, address]``.
    """
    phone_node = soup.find('div', class_='info-value')
    address_node = soup.find('span', class_='hotel-address')

    lookups = (
        (phone_node, " %s Get information failed Maybe PhoneNum lost;", "None"),
        (address_node, " %s Get information failed Maybe Address lost;", 'None'),
    )
    extracted = []
    for node, failure_note, fallback in lookups:
        try:
            text = node.get_text()
        except Exception as err:
            # A missing element arrives here as None and has no get_text().
            print(failure_note % (err))
            text = fallback
        extracted.append(text)
    return extracted
def write_to_json(dic):
    """Write *dic* to ``data.json`` in the current working directory.

    The file is overwritten. Output is UTF-8, indented by four spaces,
    with non-ASCII characters written as-is rather than escaped.

    Args:
        dic: JSON-serializable object to store.
    """
    with open('data.json', 'w', encoding='utf-8') as out:
        out.write(json.dumps(dic, ensure_ascii=False, indent=4))
def information_scrap():
    """Visit every saved hotel link and collect its phone number and address.

    The name -> link pairs come from ``txt_to_dic``. Each page is fetched
    with ``get_soup`` and parsed with ``get_information``. A running
    counter and every collected record are printed as progress output.

    Returns:
        dict mapping hotel name to ``[link, phone, address]``. Hotels
        whose page could not be fetched are left out.
    """
    dic = txt_to_dic()
    dic_json = {}
    for i, (name, link) in enumerate(dic.items(), start=1):
        # Links read from the text file may still carry the line's trailing
        # newline. strip() removes it without eating a real character when
        # the final line of the file has no newline.
        link = link.strip()
        soup = get_soup(name, link)
        print(i)
        if soup is None:
            print("the url is broken Continue Next")
            continue
        ph, add = get_information(soup)
        sub_dic = [link, ph, add]
        print(sub_dic)
        dic_json[name] = sub_dic
    return dic_json