1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
def get_user_agent():
    """Return one browser User-Agent string picked at random.

    Rotating the User-Agent per request makes the scraper's traffic look
    less uniform to the target site.
    """
    candidates = (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    )
    return random.choice(candidates)
def save_to_txt(file_name, dic):
    """Append every key:value pair of *dic* to *file_name*, one per line.

    file_name -- path of the UTF-8 text file; opened in append mode, so
                 repeated calls accumulate lines.
    dic       -- mapping written as "key:value" lines.
    """
    with open(file_name, 'a+', encoding='utf-8') as file:
        for key, value in dic.items():
            file.write(key + ":" + value)
            file.write('\n')
    # BUG FIX: removed the redundant file.close() that sat inside the
    # 'with' block -- the context manager already closes the file.
    # BUG FIX: log message typo "fined writing" -> "finished writing".
    print("finished writing\n\r")


def find_hotel_url(soup):
    """Extract {hotel name: shop URL} pairs from one hotel-list page.

    soup -- BeautifulSoup of a dianping.com hotel list page; each hotel
            sits in an <li class="hotel-block"> whose 'data-poi'
            attribute is the numeric shop id.
    Returns a dict mapping hotel display name to its shop URL.
    """
    dic = {}
    link_prefix = 'https://www.dianping.com/shop/'
    body = soup.findAll('li', class_='hotel-block')
    for tem in body:
        name = tem.a.get_text()
        num = tem['data-poi']
        link = link_prefix + num
        dic[name] = link
    return dic
""" get hotel's link of chengdu city radomly choose user_agent input: fp: begin page tp: end page (default in function : href) return : dic : keyvalue(hotelname),key(hotel url) """ def scrapy_link_main(fp,tp): for i in range(fp,tp+1): href_prefix="https://www.dianping.com/chengdu/hotel/" href_suffix="p"+str(i) if i == 0: href_suffix="p" href=href_prefix+href_suffix headers = {'User-Agent': get_user_agent()} Html=requests.get(href,headers=headers) soup=BeautifulSoup(Html.content,'html.parser') print(soup.ROOT_TAG_NAME) soup.name dic=find_hotel_url(soup) print(dic) filename='.\hotel\hotel_url.txt' save_to_txt(filename,dic)
""" input: none output: txt """ def txt_to_dic(): filename='.\hotel\hotel_url.txt' dic={} with open(filename,'r',encoding='utf-8') as file: for line in file: tem=line.split(':',maxsplit=1) dic[str(tem[0])]=str(tem[1]) print('Txt loads fine') print("dic has %s "%(len(dic))) return dic
def get_soup(name, url):
    """Fetch one hotel detail page and return it parsed as BeautifulSoup.

    name -- hotel name (used only in log messages).
    url  -- hotel page url.
    Returns the parsed soup, or None when the request raises or the
    server answers with a non-200 status.  When dianping serves its
    verification page (title "验证中心"), this blocks until the operator
    solves the captcha in a browser and types "goon" on stdin.
    Sleeps 1-5 s between requests to throttle the crawl.
    """
    hotel_link = url
    # NOTE(review): hard-coded session cookie copied from a logged-in
    # browser; it will expire -- refresh manually when requests start
    # being rejected.
    headers = {'User-Agent': get_user_agent(),
               'Cookie': '__mta=150787721.1541145960944.1541166756059.1541208492067.12; cy=8; cye=chengdu; _lxsdk_cuid=166d3570de5c8-07985b70746ce-b79183d-144000-166d3570de69d; _lxsdk=166d3570de5c8-07985b70746ce-b79183d-144000-166d3570de69d; _hc.v=c746c1d8-9fb6-4e00-2712-20e2880ac118.1541143993; s_ViewType=10; cityInfo=%7B%22cityId%22%3A8%2C%22cityEnName%22%3A%22chengdu%22%2C%22cityName%22%3A%22%E6%88%90%E9%83%BD%22%7D; __utma=1.1900410642.1541164771.1541164771.1541164771.1; __utmz=1.1541164771.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic; _lxsdk_s=166d72f44b8-8f2-afd-052%7C%7C2'
               }
    try:
        Html = requests.get(hotel_link, headers=headers)
        status_code = Html.status_code
        if status_code != 200:
            print("Name:%s Url: %s ----Response error! %s" % (name, url, status_code))
            time.sleep(random.randint(1, 3))
            return None
    except Exception as ex:
        time.sleep(random.randint(1, 5))
        print('Connect Error---- %s -----%s' % (name, url))
        print(ex)
        return None
    print("Hotel %s link %s : Response Code is : %s \n ! " % (name, url, Html))
    # BUG FIX: name the parser explicitly (same as scrapy_link_main) so
    # bs4 does not emit GuessedAtParserWarning and the parse result does
    # not vary with which parsers happen to be installed.
    soup = BeautifulSoup(Html.content, 'html.parser')
    tit = soup.find('title').get_text()
    if tit == "验证中心":
        print('---Sorry Need to get out of Verification Manually----')
        while True:
            print(' Enter goon to Continue ')
            g = input()
            if g == 'goon':
                break
    Html.close()
    time.sleep(random.randint(1, 3))
    return soup
def get_information(soup):
    """Pull the phone number and the address out of a hotel detail page.

    soup -- parsed hotel page.  Returns [phone_num, address]; each entry
    falls back to the string "None" when its element is missing.
    """
    lookups = [
        (soup.find('div', class_='info-value'),
         " %s Get information failed Maybe PhoneNum lost;"),
        (soup.find('span', class_='hotel-address'),
         " %s Get information failed Maybe Address lost;"),
    ]
    results = []
    for node, fail_msg in lookups:
        try:
            results.append(node.get_text())
        except Exception as ex:
            # node is None when the element is absent; log and fall back.
            print(fail_msg % (ex))
            results.append("None")
    return results
def write_to_json(dic):
    """Serialize *dic* to ./data.json as pretty-printed UTF-8 JSON.

    ensure_ascii=False keeps Chinese hotel names readable in the file.
    Overwrites any existing data.json.
    """
    serialized = json.dumps(dic, indent=4, ensure_ascii=False)
    with open('data.json', 'w', encoding='utf-8') as out:
        out.write(serialized)
def information_scrap():
    """Crawl every hotel url saved by scrapy_link_main and collect info.

    Loads the name->url map via txt_to_dic, fetches each page with
    get_soup, and extracts phone/address with get_information.
    Returns {hotel name: [url, phone_num, address]}; pages that fail to
    load are skipped.
    """
    hotels = txt_to_dic()
    collected = {}
    progress = 0
    for name, raw_link in hotels.items():
        # txt_to_dic values keep their trailing '\n'; trim it here.
        link = raw_link[:-1]
        soup = get_soup(name, link)
        progress += 1
        print(progress)
        if not soup:
            print("the url is broken Continue Next")
            continue
        phone, address = get_information(soup)
        record = [link, phone, address]
        print(record)
        collected[name] = record
    return collected
|