new

stamhe · Apr 9, 2016 · c9199e2 · c9199e2
1 parent 14f48ae
commit c9199e2
Show file tree

Hide file tree

Showing 4 changed files with 252 additions and 0 deletions.
diff --git a/www.tripadvisor.com/deal.py b/www.tripadvisor.com/deal.py
@@ -0,0 +1,93 @@
+#coding:utf-8
+
+import os
+from bs4 import BeautifulSoup
+import re
+
+
+def _find_by_class(node, tag, css_class):
+    """Return the first ``tag`` under ``node`` whose class is ``css_class``.
+
+    Returns None when ``node`` is None or nothing matches, so lookups can be
+    chained without raising on missing markup.
+    """
+    if node is None:
+        return None
+    return node.find(tag, attrs={'class': css_class})
+
+
+def _text_or_placeholder(node):
+    """Return the text of ``node``, or '--' when the node is missing."""
+    return '--' if node is None else node.get_text()
+
+
+def Deal():
+    """Extract one record per review from the saved pages into data.txt.
+
+    Every file in the ``page`` directory is parsed; each ``reviewSelector``
+    div inside the ``REVIEWS`` container yields one line of the form
+    ``name||member_id|| location ||level||title||rate||via``.
+    Optional fields that are absent are written as '--'.  A review without a
+    user name or member id, or a page without a ``REVIEWS`` container, is
+    skipped and the file name is printed.
+    """
+    with open('data.txt', 'w') as out:
+        for filename in os.listdir('page'):
+            # Close each page as soon as it is read instead of leaking handles.
+            with open('page/' + filename, 'r') as page:
+                html = page.read()
+            reviews = BeautifulSoup(html, 'lxml').find('div', id='REVIEWS')
+            if reviews is None:
+                # No review container on this page: report it and move on.
+                print(filename)
+                continue
+            for item in reviews.find_all('div', attrs={'class': 'reviewSelector'}):
+                infor = _find_by_class(item, 'div', 'member_info')
+                name_node = _find_by_class(infor, 'div', 'username mo')
+                link_node = _find_by_class(infor, 'div', 'memberOverlayLink')
+                member_id = link_node.get('id') if link_node is not None else None
+                if name_node is None or member_id is None:
+                    # The record is unusable without both identifying fields.
+                    print(filename)
+                    continue
+
+                location = _text_or_placeholder(_find_by_class(infor, 'div', 'location'))
+
+                # The level is the last CSS class on the badge element.
+                badge = _find_by_class(item, 'div', 'levelBadge')
+                badge_classes = badge.get('class') if badge is not None else None
+                level = badge_classes[-1] if badge_classes else '--'
+
+                content = _find_by_class(_find_by_class(item, 'div', 'col2of2'), 'div', 'wrap')
+
+                quote = _find_by_class(content, 'div', 'quote')
+                title = _text_or_placeholder(quote.find('a') if quote is not None else None)
+
+                rating = _find_by_class(content, 'div', 'rating reviewItemInline')
+                img = rating.find('img') if rating is not None else None
+                alt = img.get('alt') if img is not None else None
+                rate = '--' if alt is None else alt
+
+                via = _text_or_placeholder(
+                    _find_by_class(content, 'a', 'viaMobile sprite-grayPhone'))
+
+                text = name_node.get_text() + '||' + member_id
+                text += '|| ' + location + ' ||' + level + '||' + title + '||' + rate + '||' + via
+                # Keep every record on a single physical line.
+                text = text.replace('\r', '').replace('\n', '')
+                out.write(text + '\n')
+
+def Deal_data():
+    """Normalise the level and rating fields of re_data.txt into result.txt.
+
+    Each input line is split on '||'.  Field 3 has the ``lvl_`` prefix
+    removed and field 5 is cut at the first 'of' with spaces removed.  The
+    rewritten line is printed and written to result.txt.  Lines with fewer
+    than six fields are reported and skipped instead of raising IndexError.
+    """
+    # Open the input first so a missing re_data.txt does not truncate result.txt.
+    with open('re_data.txt', 'r') as src, open('result.txt', 'w') as out:
+        for line in src:
+            line = line.replace('\n', '')
+            lists = line.split('||')
+            if len(lists) < 6:
+                print('skipped malformed line: ' + line)
+                continue
+            lists[3] = lists[3].replace('lvl_', '')
+            lists[5] = lists[5].split('of')[0].replace(' ', '')
+            text = '||'.join(lists)
+            print(text)
+            out.write(text + '\n')
+
+def deal_to_txt():
+    """Split result_NEW.txt into one labelled text file per review.
+
+    The input is read as UTF-16, one tab-separated record per line.  Field 5
+    is the star rating ('1'..'5') and selects the rating sub-directory;
+    field 7 selects ``hasDate`` (anything but '--') or ``nothasDate``.
+    Files are named ``<Rating><n>.txt`` with a per-directory counter that
+    starts at 1.  Each field is written on its own line, prefixed with the
+    matching heading from ``title``.
+
+    Records with fewer than eight fields or an unknown rating are reported
+    and skipped; fields beyond the last heading are ignored.
+    """
+    stars = {'5': 'Excellent', '4': 'Verygood', '3': 'Average', '2': 'Poor', '1': 'Terrible'}
+    title = ['Name:', 'Nationality:', 'Level:', 'Age & Gender:', 'Title:',
+             'Rating:', 'Mobile:', 'Visiting time:', 'Detailed review:']
+    # Next file number for every (top-level directory, star rating) pair.
+    counters = {
+        'hasDate': dict.fromkeys(stars, 1),
+        'nothasDate': dict.fromkeys(stars, 1),
+    }
+
+    # Create each directory independently: an existing one must not prevent
+    # its sibling from being created.
+    for folder in counters:
+        for label in stars.values():
+            os.makedirs(folder + '/' + label, exist_ok=True)
+
+    with open('result_NEW.txt', 'r', encoding='utf-16') as src:
+        for line in src:
+            line = line.replace('\n', '')
+            lists = line.split('\t')
+            if len(lists) < 8 or lists[5] not in stars:
+                print('skipped malformed line: ' + line)
+                continue
+            star = lists[5]
+            folder = 'hasDate' if lists[7] != '--' else 'nothasDate'
+            path = '%s/%s/%s%s.txt' % (folder, stars[star], stars[star], counters[folder][star])
+            with open(path, 'w') as f:
+                for heading, value in zip(title, lists):
+                    f.write(heading + value + '\r\n')
+            counters[folder][star] += 1
+
+# Runs on import as well as on direct execution (there is no __main__ guard).
+# Only deal_to_txt() is invoked here; Deal() and Deal_data() are not called.
+deal_to_txt()
diff --git a/www.tripadvisor.com/getpage.py b/www.tripadvisor.com/getpage.py
@@ -0,0 +1,40 @@
+#coding:utf-8
+
+import requests
+import os
+import time
+
+# HTTP headers passed to every requests.get call in this script; the
+# User-Agent is a desktop Firefox 39 string.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+def main():
+    """Download the review listing pages into the ``page`` directory.
+
+    The first listing page is saved as ``page/0.html``.  Further pages are
+    requested with offsets 10, 20, ... and saved as ``page/1.html``,
+    ``page/2.html``, ...; the loop stops once the next offset would be 8490.
+    A failed paged request is retried after a short pause.
+    """
+    first_url = 'https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'
+    paged_url = 'https://www.tripadvisor.com/Attraction_Review-g294212-d325811-Reviews-or%s-Great_Wall_at_Mutianyu-Beijing.html#REVIEWS'
+
+    os.makedirs('page', exist_ok=True)
+
+    html = requests.get(first_url, headers=headers, timeout=50).text
+    count = 0
+    # The first page belongs inside 'page/' like all the others; otherwise
+    # anything that lists that directory never sees it.
+    with open('page/' + str(count) + '.html', 'w') as f:
+        f.write(html)
+    count += 1
+
+    num = 10
+    while True:
+        try:
+            html = requests.get(paged_url % num, headers=headers, timeout=50).text
+        except requests.RequestException:
+            # Back off before retrying the same offset instead of spinning.
+            time.sleep(2)
+            continue
+        with open('page/' + str(count) + '.html', 'w') as f:
+            f.write(html)
+        num += 10
+        print(num)
+        count += 1
+        if num == 8490:
+            break
+        time.sleep(2)
+
+# Runs on import as well as on direct execution (there is no __main__ guard).
+main()
diff --git a/www.tripadvisor.com/moredata.py b/www.tripadvisor.com/moredata.py
@@ -0,0 +1,60 @@
+#coding:utf-8
+
+import requests
+from bs4 import BeautifulSoup
+
+
+# HTTP headers passed to the requests.get call in getdata(); the User-Agent
+# is a desktop Firefox 39 string.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+
+def getdata(target,viewid):
+    """Fetch expanded reviews and return one 'entry||recommend' string each.
+
+    ``target`` and ``viewid`` are substituted into the ``target`` and
+    ``reviews`` query parameters of the ExpandedUserReviews URL.  One string
+    is returned per ``innerBubble`` div, in document order.  It holds the
+    text of the ``entry`` div and of the ``recommend`` div with line breaks
+    removed, joined by '||'.  A missing div is written as '--', so the list
+    always has one item per bubble.
+
+    Raises requests.RequestException if the request fails or times out.
+    """
+    url='https://www.tripadvisor.com/ExpandedUserReviews-g294212-d325811?target=%s&context=1&reviews=%s&servlet=Attraction_Review&expand=1'%(target,viewid)
+    html=requests.get(url,headers=headers,timeout=50).text
+    result=[]
+    for item in BeautifulSoup(html,'lxml').find_all('div',attrs={'class':'innerBubble'}):
+        parts=[]
+        for css_class in ('entry','recommend'):
+            node=item.find('div',attrs={'class':css_class})
+            if node is None:
+                parts.append('--')
+            else:
+                parts.append(node.get_text().replace('\r','').replace('\n',''))
+        result.append('||'.join(parts))
+    return result
+
+def _write_batch(out,lines,viewids):
+    """Fetch the expanded reviews for one batch and append the merged rows.
+
+    Returns the number of reviews getdata() produced.  Nothing is written
+    when that number differs from ``len(lines)``, because rows and reviews
+    are paired by position and a mismatch would attach text to the wrong row.
+    """
+    result=getdata(viewids[0],','.join(viewids))
+    if len(result)!=len(lines):
+        print('skipped batch: expected %d reviews, got %d'%(len(lines),len(result)))
+        return len(result)
+    for line,extra in zip(lines,result):
+        out.write(line+'||'+extra+'\n')
+    return len(result)
+
+
+def main():
+    """Append the expanded review text to every row of data.txt.
+
+    Rows are read from data.txt and processed in batches of 20.  The review
+    id is the part of the second '||' field after the last '-', with
+    ``SRC_`` removed.  Each row is written to result.txt (append mode)
+    followed by '||' and the matching string from getdata().  Rows left over
+    after the last full batch are processed the same way; rows without a
+    second field are skipped.
+    """
+    lines=[]
+    viewids=[]
+    count=0
+    with open('data.txt','r') as src, open('result.txt','a') as out:
+        for line in src:
+            line=line.replace('\n','')
+            fields=line.split('||')
+            if len(fields)<2:
+                # No member/review id field: nothing to look up for this row.
+                continue
+            lines.append(line)
+            viewids.append(fields[1].split('-')[-1].replace('SRC_',''))
+            if len(viewids)<20:
+                continue
+            print(_write_batch(out,lines,viewids))
+            viewids.clear()
+            lines.clear()
+            count+=1
+            print(count,'--ok')
+        # Flush the final, shorter batch; skip it when nothing is left over.
+        if lines:
+            _write_batch(out,lines,viewids)
+
+# Runs on import as well as on direct execution (there is no __main__ guard).
+main()
diff --git a/www.tripadvisor.com/userinfor.py b/www.tripadvisor.com/userinfor.py
@@ -0,0 +1,59 @@
+#coding:utf-8
+
+import requests
+from bs4 import BeautifulSoup
+import threading
+
+
+# HTTP headers passed to the requests.get call in Infor.run(); the
+# User-Agent is a desktop Firefox 39 string.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate'}
+
+
+class Infor(threading.Thread):
+    """Worker thread that appends one member-overlay detail to a record.
+
+    ``line`` is a '||'-separated record.  The member uid is the part of the
+    second field before the first '-', with ``UID_`` removed.  After run()
+    finishes, ``result`` holds the text of the second ``<li>`` of the
+    ``memberdescription`` list ('--' if it could not be obtained) and
+    ``line`` has '||' plus that value appended.
+    """
+
+    def __init__(self,line):
+        super(Infor,self).__init__()
+        self.line=line
+        # Defined up front so the attribute exists even before run().
+        self.result='--'
+        fields=line.split('||')
+        # A record without a second field has no uid to look up.
+        self.uid=fields[1].split('-')[0].replace('UID_','') if len(fields)>1 else ''
+
+    def _fetch_detail(self):
+        """Return the overlay detail text, or '--' on any failure."""
+        if not self.uid:
+            return '--'
+        try:
+            html=requests.get('https://www.tripadvisor.com/MemberOverlay?uid=%s&c=&fus=false&partner=false&LsoId='%self.uid,headers=headers,timeout=50).text
+            items=BeautifulSoup(html,'lxml').find('ul',attrs={'class':'memberdescription'}).find_all('li')
+            return items[1].get_text().replace('\r','').replace('\n','')
+        except Exception:
+            # Best effort: a failed request or unexpected markup must still
+            # produce a row, so it degrades to the placeholder.
+            return '--'
+
+    def run(self):
+        """Look up the detail and append it to ``line``."""
+        self.result=self._fetch_detail()
+        self.line+='||'+self.result
+
+
+def _process_batch(out,lines):
+    """Run one Infor thread per line concurrently and write the results.
+
+    All threads are started, then joined, and the extended lines are written
+    to ``out`` in the same order as ``lines``.
+    """
+    workers=[Infor(line) for line in lines]
+    for work in workers:
+        work.start()
+    for work in workers:
+        work.join()
+    for work in workers:
+        out.write(work.line+'\n')
+
+
+def main():
+    """Extend every row of result.txt with its Infor lookup in re_data.txt.
+
+    Rows are processed in batches of 20, one thread per row, and appended to
+    re_data.txt.  Rows left over after the last full batch are processed as
+    a final, shorter batch so that no input row is dropped.
+    """
+    lines=[]
+    count=0
+    with open('result.txt','r') as src, open('re_data.txt','a') as out:
+        for line in src:
+            lines.append(line.replace('\n',''))
+            if len(lines)<20:
+                continue
+            _process_batch(out,lines)
+            count+=1
+            print(count,'--ok')
+            lines.clear()
+        # Flush whatever is left when the row count is not a multiple of 20.
+        if lines:
+            _process_batch(out,lines)
+            count+=1
+            print(count,'--ok')
+
+# Runs on import as well as on direct execution (there is no __main__ guard).
+main()