Skip to content

Commit

Permalink
Initialize
Browse files Browse the repository at this point in the history
  • Loading branch information
yhslai committed Apr 20, 2012
0 parents commit 0f4680b
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.swp
*.swo
74 changes: 74 additions & 0 deletions crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import tweepy
import pprint
import random
import collections
import re
import requests

# Shared Twitter client used by every function in this module; tweepy.API()
# is constructed without an auth handler.
api = tweepy.API()
# Compared against random.random() in roam(): the walk stops at the current
# user when the draw is below this value. random.random() is always < 1, so
# with the value 1 the walk always stops at the first user.
damping_factor = 1

def fetch_followings(user_id, user_graph):
    """Return the ids followed by `user_id`, memoised in `user_graph`.

    `user_graph` maps a user id to the result of `api.friends_ids` for that
    user. The API is queried only on a cache miss, and the answer is stored
    in `user_graph` before it is returned.
    """
    # Cache hit: answer straight from the graph, no API call.
    if user_id in user_graph:
        return user_graph[user_id]

    followings = api.friends_ids(id=user_id)
    user_graph[user_id] = followings
    return followings


# Compiled once at import time. Non-capturing groups make findall() return the
# whole URL as a plain string. The TLD is "two or more letters" so that hosts
# such as example.info are not silently skipped or truncated.
_URL_RE = re.compile(
    r'\bhttps?://[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}(?:/\S*)?\b'
)


def get_link(text):
    """Return every http/https URL found in `text`, in order of appearance.

    The trailing word boundary in the pattern drops punctuation that directly
    follows a URL (for example the full stop ending a sentence).

    :param text: the text to scan (e.g. the body of a tweet).
    :returns: a list of URL strings; empty when `text` contains no URL.
    """
    return _URL_RE.findall(text)


def roam(user_id, user_graph, links):
    """Random-walk the following graph from `user_id`, then score links.

    At each step the walk stops with probability `damping_factor`; otherwise
    it moves to a randomly chosen user that the current user follows. Every
    link found in the timeline of the user where the walk stops is credited
    with that status's retweet count plus one.

    :param user_id: id of the user the walk starts from.
    :param user_graph: dict used as the followings cache by
        `fetch_followings`.
    :param links: mapping of link -> accumulated weight, updated in place;
        it must supply a default of 0 for unseen links (the caller passes a
        `defaultdict(int)`).
    """
    # Iterative walk: a small damping factor cannot exhaust the recursion
    # limit this way.
    while random.random() >= damping_factor:
        followings = fetch_followings(user_id, user_graph)
        if not followings:
            # Dead end: nobody to move to, so harvest this user's links
            # instead of letting random.choice raise IndexError.
            break
        user_id = random.choice(followings)

    # The walk has stopped at `user_id`.
    for status in api.user_timeline(id=user_id):
        for link in get_link(status.text):
            # TODO: check that the link is still alive and resolve
            # redirections before counting it.
            links[link] += status.retweet_count + 1  # the "retweet rank"


def _fetch_title(link):
    """Return the whitespace-collapsed <title> of the page at `link`.

    Returns an empty string when the page cannot be fetched or has no
    <title> element, so one bad link cannot abort the whole report.
    """
    try:
        response = requests.get(link)
    except requests.RequestException:
        return ''
    match = re.search(r'<title>(?P<title>[^<>]*)</title>', response.text)
    if match is None:
        return ''
    return re.sub(r'[\n ]+', ' ', match.group('title'))


def start():
    """Crawl Twitter for popular links and print the ten highest-ranked.

    Users are taken from the public timeline, and `roam` is called from
    randomly chosen ones until at least ten distinct links have been
    collected. The ten heaviest links are then printed one per line as
    "#1: http://to.c/ABCDEFG [5] -- the title of this page".
    Prints nothing when the public timeline is empty.
    """
    user_graph = {}
    links = collections.defaultdict(int)
    users = [status.user for status in api.public_timeline()]
    if not users:
        # Nothing to start a walk from; random.choice would raise IndexError.
        return

    while len(links) < 10:
        initial_user = random.choice(users)
        roam(initial_user.id, user_graph, links)

    # sorted() works on the items view of both Python 2 and Python 3.
    ranked = sorted(links.items(), key=lambda item: item[1], reverse=True)

    entities = [
        {
            'link': link,
            'weight': weight,
            'title': _fetch_title(link),
        }
        for link, weight in ranked[:10]
    ]

    for position, entity in enumerate(entities, 1):
        # it looks like "#1: http://to.c/ABCDEFG [5] -- the title of this page"
        print("#%d: %s [%s] -- %s" % (
            position, entity['link'], entity['weight'], entity['title']))






16 changes: 16 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#! /usr/bin/env python

from setuptools import setup

# Third-party packages that must be installed alongside this project.
_REQUIREMENTS = [
    'requests>=0.11.1',
    'tweepy>=1.9',
]

# Distribution metadata handed to setuptools.
setup(
    name="TweePops",
    version="0.0.1",
    description="A very simple app to crawl popular news from Twitter",
    author="Raincole Lai",
    author_email="[email protected]",
    install_requires=_REQUIREMENTS,
)

0 comments on commit 0f4680b

Please sign in to comment.