Skip to content

Commit

Permalink
new
Browse files Browse the repository at this point in the history
  • Loading branch information
Nyloner committed Apr 9, 2016
1 parent 14f48ae commit c9199e2
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 0 deletions.
93 changes: 93 additions & 0 deletions www.tripadvisor.com/deal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#coding:utf-8

import os
from bs4 import BeautifulSoup
import re


def _field_or_default(getter, default='--'):
    """Return ``getter()``, or *default* when the expected markup is absent.

    A chained BeautifulSoup lookup on a tag that is not there fails with
    ``AttributeError``, ``TypeError`` or ``IndexError``, and a missing
    attribute yields ``None``; all of those are mapped to *default*.
    """
    try:
        value = getter()
    except (AttributeError, TypeError, IndexError):
        return default
    return default if value is None else value


def Deal():
    """Parse every saved page under ``page/`` into ``data.txt``.

    One line is written per review, in the form
    ``username||member id|| location ||level||title||rating||via`` with all
    carriage returns and newlines removed.  Optional fields that cannot be
    found are written as ``--``.  ``data.txt`` is overwritten on each run.

    The name of the page file is printed whenever a page has no ``REVIEWS``
    container, or a review has no usable username / member id; that page or
    review is then skipped instead of aborting the whole run.
    """
    with open('data.txt', 'w') as f:
        for filename in os.listdir('page'):
            with open('page/' + filename, 'r') as page:
                html = page.read()
            reviews = BeautifulSoup(html, 'lxml').find('div', id='REVIEWS')
            if reviews is None:
                # Not a review page (e.g. an error page was saved): report and move on.
                print(filename)
                continue
            for item in reviews.find_all('div', attrs={'class': 'reviewSelector'}):
                infor = item.find('div', attrs={'class': 'member_info'})
                # The lambdas below are invoked immediately, so closing over the
                # loop variables is safe.
                text = _field_or_default(
                    lambda: infor.find('div', attrs={'class': 'username mo'}).get_text()
                    + '||'
                    + infor.find('div', attrs={'class': 'memberOverlayLink'}).get('id'),
                    default=None)
                if text is None:
                    # Without the username and member id the record is unusable.
                    print(filename)
                    continue
                location = _field_or_default(
                    lambda: infor.find('div', attrs={'class': 'location'}).get_text())
                # The last CSS class of the badge carries the level.
                level = _field_or_default(
                    lambda: item.find('div', attrs={'class': 'levelBadge'}).get('class')[-1])
                content = _field_or_default(
                    lambda: item.find('div', attrs={'class': 'col2of2'})
                    .find('div', attrs={'class': 'wrap'}),
                    default=None)
                title = _field_or_default(
                    lambda: content.find('div', attrs={'class': 'quote'}).find('a').get_text())
                rate = _field_or_default(
                    lambda: content.find('div', attrs={'class': 'rating reviewItemInline'})
                    .find('img').get('alt'))
                via = _field_or_default(
                    lambda: content.find('a', attrs={'class': 'viaMobile sprite-grayPhone'})
                    .get_text())
                text += '|| ' + location + ' ||' + level + '||' + title + '||' + rate + '||' + via
                # Keep each record on a single physical line.
                text = text.replace('\r', '').replace('\n', '')
                f.write(text + '\n')

def Deal_data():
    """Normalise the level and rating fields of ``re_data.txt`` into ``result.txt``.

    Each input line is a ``||``-separated record.  Field 3 has its ``lvl_``
    prefix removed, and field 5 is reduced to whatever precedes the first
    ``of`` with all spaces removed (``'5 of 5 stars'`` becomes ``'5'``).
    Every rewritten record is echoed to stdout and written to ``result.txt``,
    which is overwritten on each run.

    Blank lines are skipped.  A non-blank line with fewer than six fields
    raises ``IndexError``.
    """
    # Context managers guarantee both files are closed even if a line fails.
    with open('re_data.txt', 'r') as source, open('result.txt', 'w') as f:
        for line in source:
            line = line.replace('\n', '')
            if not line.strip():
                # A blank line has no fields to index; skip it rather than crash.
                continue
            lists = line.split('||')
            lists[3] = lists[3].replace('lvl_', '')
            lists[5] = lists[5].split('of')[0].replace(' ', '')
            text = '||'.join(lists)
            print(text)
            f.write(text + '\n')

def deal_to_txt():
    """Split ``result_NEW.txt`` into one labelled text file per review.

    The input is a UTF-16, tab-separated file.  Field 5 holds the star rating
    (``'1'``..``'5'``) and field 7 the visiting time, with ``'--'`` meaning
    "no date".  Each row is written to
    ``<hasDate|nothasDate>/<rating name>/<rating name><n>.txt`` where ``n``
    counts up from 1 separately for every folder.  Inside the file every field
    is written on its own line, prefixed with its label from ``title``; fields
    beyond the known labels are written without a label.

    Blank rows are skipped.  Rows with fewer than eight fields or an unknown
    rating are printed and skipped.
    """
    stars = {'5': 'Excellent', '4': 'Verygood', '3': 'Average', '2': 'Poor', '1': 'Terrible'}
    title = ['Name:', 'Nationality:', 'Level:', 'Age & Gender:', 'Title:', 'Rating:',
             'Mobile:', 'Visiting time:', 'Detailed review:']
    roots = ('hasDate', 'nothasDate')
    # Next file number to use, tracked per root folder and per star rating.
    counters = {root: dict.fromkeys(stars, 1) for root in roots}

    # Create every folder independently so that one already existing
    # directory cannot prevent the others from being created.
    for root in roots:
        for folder in stars.values():
            os.makedirs(root + '/' + folder, exist_ok=True)

    with open('result_NEW.txt', 'r', encoding='utf-16') as source:
        for line in source:
            line = line.replace('\n', '')
            if not line.strip():
                continue
            lists = line.split('\t')
            if len(lists) < 8 or lists[5] not in stars:
                # Malformed row: report it and keep processing the rest.
                print(line)
                continue
            star = lists[5]
            root = 'hasDate' if lists[7] != '--' else 'nothasDate'
            path = '%s/%s/%s%s.txt' % (root, stars[star], stars[star], counters[root][star])
            with open(path, 'w') as f:
                for index, value in enumerate(lists):
                    label = title[index] if index < len(title) else ''
                    f.write(label + value + '\r\n')
            counters[root][star] += 1

# Module-level call: deal_to_txt() runs whenever this file is executed or
# imported. Deal() and Deal_data() are defined above but are not called here.
deal_to_txt()
40 changes: 40 additions & 0 deletions www.tripadvisor.com/getpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#coding:utf-8

import requests
import os
import time

# Request headers passed to every requests.get() call in this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
}

def main():
    """Download every review page of the attraction into the ``page`` folder.

    The first page is saved as ``page/0.html``; the following pages are
    requested with review offsets 10, 20, ... 8480 and saved as
    ``page/1.html``, ``page/2.html`` and so on.  After each saved page the
    next offset is printed and the loop pauses for two seconds.

    A failed request for a follow-up page is retried after a two second
    pause.  A failure while fetching the first page propagates to the caller.
    """
    os.makedirs('page', exist_ok=True)

    html = requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS', headers=headers, timeout=50).text
    count = 0
    # The first page belongs inside 'page/' like all the others, otherwise
    # anything that reads the folder never sees it.
    with open('page/' + str(count) + '.html', 'w') as f:
        f.write(html)
    count += 1

    num = 10
    while True:
        try:
            html = requests.get('https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS' % num, headers=headers, timeout=50).text
        except requests.RequestException:
            # Back off before retrying the same offset instead of spinning.
            time.sleep(2)
            continue
        with open('page/' + str(count) + '.html', 'w') as f:
            f.write(html)
        num += 10
        print(num)
        count += 1
        if num >= 8490:
            break
        time.sleep(2)

# Module-level call: the download starts as soon as this file is executed or imported.
main()
60 changes: 60 additions & 0 deletions www.tripadvisor.com/moredata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#coding:utf-8

import requests
from bs4 import BeautifulSoup


# Request headers passed to the requests.get() call in getdata().
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
}


def _single_line(text):
    """Return *text* with carriage returns and newlines removed."""
    return text.replace('\r', '').replace('\n', '')


def getdata(target, viewid):
    """Fetch the expanded review text for a batch of reviews.

    Args:
        target: value sent as the ``target`` query parameter.
        viewid: value sent as the ``reviews`` query parameter (the caller in
            this file passes a comma-separated list of review ids).

    Returns:
        A list with one ``'<entry text>||<recommend text>'`` string per
        ``innerBubble`` div in the response, in document order.  The second
        part is ``'--'`` when the bubble has no ``recommend`` div.

    Raises:
        AttributeError: if a bubble has no ``entry`` div.
        requests.RequestException: if the request fails or exceeds the
            50 second timeout.
    """
    html = requests.get('https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1' % (target, viewid), headers=headers, timeout=50).text
    table = BeautifulSoup(html, 'lxml').find_all('div', attrs={'class': 'innerBubble'})
    result = []
    for item in table:
        text = _single_line(item.find('div', attrs={'class': 'entry'}).get_text()) + '||'
        recommend = item.find('div', attrs={'class': 'recommend'})
        # The recommend div is optional; only its absence maps to the placeholder.
        text += _single_line(recommend.get_text()) if recommend is not None else '--'
        result.append(text)
    return result

def _write_batch(f, lines, viewids):
    """Fetch the review bodies for one batch and append the merged records to *f*.

    Records are paired with the fetched bodies by position.  Returns the
    number of bodies that getdata() returned.
    """
    result = getdata(viewids[0], ','.join(viewids))
    for index, line in enumerate(lines):
        f.write(line + '||' + result[index] + '\n')
    return len(result)


def main():
    """Append the full review text to every record of ``data.txt``.

    Records are read from ``data.txt`` and processed in batches of 20.  The
    review id is taken from the second ``||`` field: the part after the last
    ``-`` with ``SRC_`` removed.  Each record is appended to ``result.txt``
    followed by ``||`` and the matching text returned by getdata().  After
    each full batch the number of fetched bodies and a running batch counter
    are printed.  A final partial batch is written as well; blank input lines
    are skipped.
    """
    with open('data.txt', 'r') as source, open('result.txt', 'a') as f:
        viewids = []
        lines = []
        count = 0
        for line in source:
            line = line.replace('\n', '')
            if not line.strip():
                # A blank line has no id field to parse.
                continue
            lines.append(line)
            viewids.append(line.split('||')[1].split('-')[-1].replace('SRC_', ''))
            if len(viewids) < 20:
                continue
            print(_write_batch(f, lines, viewids))
            viewids.clear()
            lines.clear()
            count += 1
            print(count, '--ok')
        # Flush whatever is left when the record count is not a multiple of
        # 20; nothing to do when the last batch was already complete.
        if viewids:
            _write_batch(f, lines, viewids)

# Module-level call: the enrichment run starts as soon as this file is executed or imported.
main()
59 changes: 59 additions & 0 deletions www.tripadvisor.com/userinfor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#coding:utf-8

import requests
from bs4 import BeautifulSoup
import threading


# Request headers passed to the requests.get() call made by each Infor thread.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
}


class Infor(threading.Thread):
    """Worker thread that appends one member's description to a record line.

    After the thread has run, ``line`` holds the original record followed by
    ``||`` and the looked-up text, and ``result`` holds that text on its own.
    Both use ``'--'`` when the lookup fails for any reason.
    """

    def __init__(self, line):
        """Store *line* and extract the member uid from its second ``||`` field.

        The uid is the part of that field before the first ``-`` with
        ``UID_`` removed.  Raises ``IndexError`` if *line* has no ``||``.
        """
        super(Infor, self).__init__()
        self.line = line
        self.uid = self.line.split('||')[1].split('-')[0].replace('UID_', '')
        # Defined up front so the attribute exists even before run() finishes.
        self.result = '--'

    def run(self):
        """Look up the member description and append it to ``self.line``."""
        self.result = self._fetch_description()
        self.line += '||' + self.result

    def _fetch_description(self):
        """Return the second ``memberdescription`` list item, or ``'--'``."""
        try:
            html = requests.get('https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId=' % self.uid, headers=headers, timeout=50).text
            items = BeautifulSoup(html, 'lxml').find('ul', attrs={'class': 'memberdescription'}).find_all('li')
            return items[1].get_text().replace('\r', '').replace('\n', '')
        except Exception:
            # Thread boundary: any failure must still produce a field, otherwise
            # the record written by the caller would be one column short.
            return '--'


def _process_batch(f, lines):
    """Run one Infor thread per record and append the results to *f* in input order."""
    workers = [Infor(line) for line in lines]
    for work in workers:
        work.start()
    # Wait for the whole batch before writing so the output order matches the input.
    for work in workers:
        work.join()
    for work in workers:
        f.write(work.line + '\n')


def main():
    """Append each member's description to the records of ``result.txt``.

    Records are read from ``result.txt`` and handled in batches of 20, one
    Infor thread per record.  The extended records are appended to
    ``re_data.txt`` and a running batch counter is printed after every batch.
    A final batch of fewer than 20 records is processed too, so no trailing
    records are lost.  Blank input lines are skipped.
    """
    with open('result.txt', 'r') as source, open('re_data.txt', 'a') as f:
        lines = []
        count = 0
        for line in source:
            line = line.replace('\n', '')
            if not line.strip():
                # Infor needs a second '||' field; a blank line has none.
                continue
            lines.append(line)
            if len(lines) < 20:
                continue
            _process_batch(f, lines)
            count += 1
            print(count, '--ok')
            lines.clear()
        if lines:
            # Leftover records when the total is not a multiple of 20.
            _process_batch(f, lines)
            count += 1
            print(count, '--ok')

# Module-level call: the member lookup run starts as soon as this file is executed or imported.
main()

0 comments on commit c9199e2

Please sign in to comment.