diff --git a/www.tripadvisor.com/deal.py b/www.tripadvisor.com/deal.py
new file mode 100644
index 0000000..811ba2f
--- /dev/null
+++ b/www.tripadvisor.com/deal.py
@@ -0,0 +1,146 @@
+#coding:utf-8
+
+import os
+from bs4 import BeautifulSoup
+import re
+
+
+# Placeholder written whenever a field cannot be found in the page.
+MISSING = '--'
+
+# Errors a chained BeautifulSoup lookup raises when a tag or attribute is absent.
+LOOKUP_ERRORS = (AttributeError, TypeError, IndexError)
+
+
+def _extract(getter):
+    """Return ``getter()``, or ``MISSING`` when the looked-up markup is absent."""
+    try:
+        value = getter()
+    except LOOKUP_ERRORS:
+        return MISSING
+    return MISSING if value is None else value
+
+
+def _parse_review(item):
+    """Build one ``||``-separated record from a ``reviewSelector`` element.
+
+    Returns ``None`` when the user name or the member id cannot be read;
+    every other missing field is replaced by ``MISSING``.
+    """
+    infor = item.find('div', attrs={'class': 'member_info'})
+    try:
+        name = infor.find('div', attrs={'class': 'username mo'}).get_text()
+        member_id = infor.find('div', attrs={'class': 'memberOverlayLink'}).get('id')
+    except AttributeError:
+        return None
+    if member_id is None:
+        return None
+
+    location = _extract(
+        lambda: infor.find('div', attrs={'class': 'location'}).get_text())
+    level = _extract(
+        lambda: item.find('div', attrs={'class': 'levelBadge'}).get('class')[-1])
+    try:
+        content = item.find('div', attrs={'class': 'col2of2'}).find(
+            'div', attrs={'class': 'wrap'})
+    except AttributeError:
+        # No review body: the three lookups below all fall back to MISSING.
+        content = None
+    title = _extract(
+        lambda: content.find('div', attrs={'class': 'quote'}).find('a').get_text())
+    rate = _extract(
+        lambda: content.find('div', attrs={'class': 'rating reviewItemInline'})
+        .find('img').get('alt'))
+    via = _extract(
+        lambda: content.find('a', attrs={'class': 'viaMobile sprite-grayPhone'}).get_text())
+
+    # The spaces around the location are part of the existing record layout.
+    text = (name + '||' + member_id + '|| ' + location + ' ||' + level
+            + '||' + title + '||' + rate + '||' + via)
+    return text.replace('\r', '').replace('\n', '')
+
+
+def Deal():
+    """Extract one record per review from the pages in ``page/`` into ``data.txt``.
+
+    The page's file name is printed when it has no ``REVIEWS`` block, and once
+    for each of its reviews whose user name or member id is missing.
+    """
+    with open('data.txt', 'w') as f:
+        for filename in os.listdir('page'):
+            with open('page/' + filename, 'r') as page:
+                html = page.read()
+            reviews = BeautifulSoup(html, 'lxml').find('div', id='REVIEWS')
+            if reviews is None:
+                print(filename)
+                continue
+            for item in reviews.find_all('div', attrs={'class': 'reviewSelector'}):
+                text = _parse_review(item)
+                if text is None:
+                    print(filename)
+                    continue
+                f.write(text + '\n')
+
+
+def Deal_data():
+    """Normalise the level and rating columns of ``re_data.txt`` into ``result.txt``.
+
+    The ``lvl_`` prefix is removed from the fourth column; the sixth column is
+    cut at the first ``of`` and stripped of spaces.  Lines with fewer than six
+    columns are printed and skipped.
+    """
+    with open('re_data.txt', 'r') as source, open('result.txt', 'w') as f:
+        for line in source:
+            line = line.replace('\n', '')
+            lists = line.split('||')
+            if len(lists) < 6:
+                print('skipped:', line)
+                continue
+            lists[3] = lists[3].replace('lvl_', '')
+            lists[5] = lists[5].split('of')[0].replace(' ', '')
+            text = '||'.join(lists)
+            print(text)
+            f.write(text + '\n')
+
+
+def deal_to_txt():
+    """Write every row of ``result_NEW.txt`` to its own labelled text file.
+
+    Rows are tab-separated.  The sixth column picks the rating folder and the
+    eighth column picks ``hasDate`` (any value but ``--``) or ``nothasDate``.
+    Files are numbered from 1 within each folder.  Rows that are too short or
+    whose rating is not ``1``-``5`` are printed and skipped.
+    """
+    stars = {'5': 'Excellent', '4': 'Verygood', '3': 'Average', '2': 'Poor',
+             '1': 'Terrible'}
+    title = ['Name:', 'Nationality:', 'Level:', 'Age & Gender:', 'Title:',
+             'Rating:', 'Mobile:', 'Visiting time:', 'Detailed review:']
+    # Next file number for each rating, kept separately per top-level folder.
+    counters = {'hasDate': dict.fromkeys(stars, 1),
+                'nothasDate': dict.fromkeys(stars, 1)}
+    # exist_ok lets every folder be created even when some already exist.
+    for folder in counters:
+        for name in stars.values():
+            os.makedirs(folder + '/' + name, exist_ok=True)
+
+    with open('result_NEW.txt', 'r', encoding='utf-16') as source:
+        for line in source:
+            line = line.replace('\n', '')
+            lists = line.split('\t')
+            if len(lists) < 8 or lists[5] not in stars:
+                print('skipped:', line)
+                continue
+            star = lists[5]
+            folder = 'hasDate' if lists[7] != '--' else 'nothasDate'
+            path = '%s/%s/%s%s.txt' % (folder, stars[star], stars[star],
+                                       counters[folder][star])
+            with open(path, 'w') as f:
+                for i, value in enumerate(lists):
+                    # Columns beyond the known labels are written unlabelled.
+                    label = title[i] if i < len(title) else ''
+                    f.write(label + value + '\r\n')
+            counters[folder][star] += 1
+
+
+if __name__ == '__main__':
+    deal_to_txt()
diff --git a/www.tripadvisor.com/getpage.py b/www.tripadvisor.com/getpage.py
new file mode 100644
index 0000000..12048e2
--- /dev/null
+++ b/www.tripadvisor.com/getpage.py
@@ -0,0 +1,57 @@
+#coding:utf-8
+
+import requests
+import os
+import time
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+FIRST_PAGE_URL = 'https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'
+# %s is the review offset of the page.
+PAGE_URL = 'https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'
+PAGE_STEP = 10       # offset increment between two consecutive pages
+END_OFFSET = 8490    # first offset that is NOT downloaded
+REQUEST_TIMEOUT = 50  # seconds
+PAUSE = 2            # seconds slept between requests
+
+
+def _save_page(index, html):
+    """Write ``html`` to ``page/<index>.html``."""
+    with open('page/' + str(index) + '.html', 'w') as f:
+        f.write(html)
+
+
+def main():
+    """Download every review page into ``page/`` as ``0.html``, ``1.html``, ...
+
+    A failure while fetching the first page propagates to the caller; a failed
+    request for any later page is retried after ``PAUSE`` seconds.
+    """
+    os.makedirs('page', exist_ok=True)
+    html = requests.get(FIRST_PAGE_URL, headers=headers, timeout=REQUEST_TIMEOUT).text
+    # The first page belongs inside page/ next to the others, where deal.py
+    # lists its input files.
+    _save_page(0, html)
+    count = 1
+    num = PAGE_STEP
+    while num < END_OFFSET:
+        try:
+            html = requests.get(PAGE_URL % num, headers=headers, timeout=REQUEST_TIMEOUT).text
+        except requests.RequestException:
+            # Pause before retrying so a persistent failure does not turn
+            # into a busy loop of requests.
+            time.sleep(PAUSE)
+            continue
+        _save_page(count, html)
+        num += PAGE_STEP
+        print(num)
+        count += 1
+        time.sleep(PAUSE)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/www.tripadvisor.com/moredata.py b/www.tripadvisor.com/moredata.py
new file mode 100644
index 0000000..0196bd9
--- /dev/null
+++ b/www.tripadvisor.com/moredata.py
@@ -0,0 +1,84 @@
+#coding:utf-8
+
+import requests
+from bs4 import BeautifulSoup
+
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+REVIEWS_URL = 'https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1'
+BATCH_SIZE = 20  # number of review ids sent in one request
+MISSING = '--'   # placeholder for text that is not in the response
+
+
+def _clean_text(tag):
+    """Return the text of ``tag`` without line breaks, or ``MISSING`` if it is None."""
+    if tag is None:
+        return MISSING
+    return tag.get_text().replace('\r', '').replace('\n', '')
+
+
+def getdata(target, viewid):
+    """Fetch expanded reviews and return one ``entry||recommend`` string per review.
+
+    target -- review id passed as the ``target`` query parameter
+    viewid -- comma-separated review ids passed as the ``reviews`` parameter
+
+    A part that is absent from a review is replaced by ``MISSING``.
+    """
+    html = requests.get(REVIEWS_URL % (target, viewid), headers=headers, timeout=50).text
+    table = BeautifulSoup(html, 'lxml').find_all('div', attrs={'class': 'innerBubble'})
+    return [
+        _clean_text(item.find('div', attrs={'class': 'entry'})) + '||'
+        + _clean_text(item.find('div', attrs={'class': 'recommend'}))
+        for item in table
+    ]
+
+
+def _review_id(line):
+    """Return the id that follows ``SRC_`` in the second ``||`` column of ``line``."""
+    return line.split('||')[1].split('-')[-1].replace('SRC_', '')
+
+
+def _write_batch(f, lines):
+    """Fetch the review texts for ``lines`` and append them to each line in ``f``."""
+    viewids = [_review_id(line) for line in lines]
+    result = getdata(viewids[0], ','.join(viewids))
+    print(len(result))
+    if len(result) < len(lines):
+        # Results are matched to lines by position; pad so that every line is
+        # still written with the same number of columns.
+        print('missing reviews:', len(lines) - len(result))
+        result = result + [MISSING + '||' + MISSING] * (len(lines) - len(result))
+    for line, extra in zip(lines, result):
+        f.write(line + '||' + extra + '\n')
+
+
+def main():
+    """Append the review text of every record in ``data.txt`` to ``result.txt``.
+
+    Records are sent in batches of ``BATCH_SIZE``; the final, shorter batch is
+    sent as well.
+    """
+    with open('data.txt', 'r') as source, open('result.txt', 'a') as f:
+        lines = []
+        count = 0
+        for line in source:
+            lines.append(line.replace('\n', ''))
+            if len(lines) < BATCH_SIZE:
+                continue
+            _write_batch(f, lines)
+            lines = []
+            count += 1
+            print(count, '--ok')
+        # Guard against an empty remainder: there is then no id to send.
+        if lines:
+            _write_batch(f, lines)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/www.tripadvisor.com/userinfor.py b/www.tripadvisor.com/userinfor.py
new file mode 100644
index 0000000..9ea18d7
--- /dev/null
+++ b/www.tripadvisor.com/userinfor.py
@@ -0,0 +1,79 @@
+#coding:utf-8
+
+import requests
+from bs4 import BeautifulSoup
+import threading
+
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+OVERLAY_URL = 'https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId='
+BATCH_SIZE = 20  # number of lookups run concurrently
+
+
+class Infor(threading.Thread):
+    """Thread that appends one column of member information to a record.
+
+    After ``run`` the attribute ``line`` holds the input line followed by
+    ``||`` and the looked-up text, or ``--`` when the lookup failed.
+    """
+
+    def __init__(self, line):
+        super(Infor, self).__init__()
+        self.line = line
+        # The uid sits between ``UID_`` and the first ``-`` of column two.
+        self.uid = self.line.split('||')[1].split('-')[0].replace('UID_', '')
+
+    def run(self):
+        """Fetch the member overlay and append its second list item to ``line``."""
+        try:
+            html = requests.get(OVERLAY_URL % self.uid, headers=headers, timeout=50).text
+            items = BeautifulSoup(html, 'lxml').find(
+                'ul', attrs={'class': 'memberdescription'}).find_all('li')
+            self.result = items[1].get_text().replace('\r', '').replace('\n', '')
+        except Exception:
+            # Best effort: whatever goes wrong, the record must still get its
+            # extra column, so the failure is recorded as '--'.
+            self.result = '--'
+        self.line += '||' + self.result
+
+
+def _process_batch(f, lines, count):
+    """Look up ``lines`` concurrently, write them to ``f``, return the new count."""
+    threadings = [Infor(line) for line in lines]
+    for work in threadings:
+        work.start()
+    for work in threadings:
+        work.join()
+    for work in threadings:
+        f.write(work.line + '\n')
+        count += 1
+        print(count, '--ok')
+    return count
+
+
+def main():
+    """Append member information to every record of ``result.txt``.
+
+    Output goes to ``re_data.txt``.  Records are handled ``BATCH_SIZE`` at a
+    time, including the final batch, which may be shorter.
+    """
+    with open('result.txt', 'r') as source, open('re_data.txt', 'a') as f:
+        lines = []
+        count = 0
+        for line in source:
+            lines.append(line.replace('\n', ''))
+            if len(lines) < BATCH_SIZE:
+                continue
+            count = _process_batch(f, lines, count)
+            lines = []
+        if lines:
+            count = _process_batch(f, lines, count)
+
+
+if __name__ == '__main__':
+    main()