Merge pull request #10 from mdredze/jack
Carmen 2.0
jackjyzhang authored Dec 13, 2022
2 parents 070b974 + 3633101 commit d369fb9
Showing 41 changed files with 521,859 additions and 132 deletions.
22 changes: 22 additions & 0 deletions .gitignore
@@ -1,4 +1,15 @@
twitter-api/*
jack/*
test_examples/*
test_out/*

*.py[cdo]
*.log


# Virtual Environment
env/
venv/

# Packages
*.egg
@@ -8,3 +19,14 @@ dist

# Documentation
doc/_build

# Geonames Data
carmen/data/dump/
/geonames_data
geoname_mapping/Geonames

# Mac OSX Files
.DS_Store

# PyCharm files
/.idea
19 changes: 0 additions & 19 deletions README.markdown

This file was deleted.

60 changes: 60 additions & 0 deletions README.md
@@ -0,0 +1,60 @@
# Carmen

A Python version of [Carmen](https://github.com/mdredze/carmen),
a library for geolocating tweets.

Given a tweet, Carmen will return a `Location` object that represents a
physical location.
Carmen uses both coordinates and other information in a tweet to make
geolocation decisions.
It's not perfect, but this greatly increases the number of geolocated
tweets over what Twitter provides.

To install, simply run:

$ python setup.py install

To run the Carmen frontend, see:

$ python -m carmen.cli --help
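
Carmen can also be used as a library. Below is a minimal sketch of the
Python API, based on the `get_resolver` export and the `resolve_tweet`
call in this repository's CLI; `tweet_json_line` is a placeholder for
one line of a tweet file:

    import json
    import carmen

    resolver = carmen.get_resolver()
    resolver.load_locations()

    tweet = json.loads(tweet_json_line)  # one decoded tweet object
    resolution = resolver.resolve_tweet(tweet)
    if resolution:
        location = resolution[1]  # the resolved Location object
        print(location.country, location.state, location.city)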

### Geonames Mapping

Alternatively, `locations.json` can be swapped out for a version that
uses Geonames IDs instead of the arbitrary IDs used in the original
version of Carmen. This Geonames-based JSON file can be found at
`carmen/data/new.json`.

Below are instructions for generating these mappings.

First, we need to get the data. This can be found at
http://download.geonames.org/export/dump/. The required files are
`countryInfo.txt`, `admin1CodesASCII.txt`, `admin2Codes.txt`, and
`cities1000.txt`. Download these files and move them into
`carmen/data/dump/`.

Next, we need to format our data. Start by deleting the comment lines
from `countryInfo.txt`.
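GeoNames marks comment lines with a leading `#`, so a shell one-liner
along these lines (an illustrative sketch, not a script shipped with
this repository) will do:

    $ grep -v '^#' carmen/data/dump/countryInfo.txt > countryInfo.tmp
    $ mv countryInfo.tmp carmen/data/dump/countryInfo.txt

Afterwards, run the following.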

$ python3 format_admin1_codes.py
$ python3 format_admin2_codes.py

Then, we need to set up a PostgreSQL database, which makes it
significantly easier to find relations between the original Carmen IDs
and Geonames IDs. To set up the database, create a PostgreSQL database
named `carmen` and run the following SQL script:

$ psql -f carmen/sql/populate_db.sql carmen
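
If the `carmen` database does not already exist, PostgreSQL's standard
`createdb` utility can create it first:

    $ createdb carmen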

Now we can begin constructing the mappings from Carmen IDs to
Geonames IDs. Run the following scripts.

$ python3 map_cities.py > ../mappings/cities.txt
$ python3 map_regions.py > ../mappings/regions.txt

With the mappings constructed, we can finally attempt to convert the
`locations.json` file into one that uses Geonames IDs. To do this, run
the following.

$ python3 rewrite_json.py
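
The resulting file can then be loaded in place of the default location
data. A minimal sketch, assuming `load_locations` accepts an open file
of location records through its `location_file` argument, as in the
original Carmen:

    import carmen

    resolver = carmen.get_resolver()
    with open('carmen/data/new.json') as location_file:
        resolver.load_locations(location_file=location_file)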


2 changes: 1 addition & 1 deletion carmen/__init__.py
@@ -1,5 +1,5 @@
"""Carmen, a library for geolocating tweets."""

__version__ = '0.0.3'
__version__ = '0.0.4'

from .resolver import get_resolver
125 changes: 78 additions & 47 deletions carmen/cli.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
import collections
import gzip
import json
import jsonlines
import sys
import warnings

@@ -37,10 +38,17 @@ def parse_args():
nargs='?', default=sys.stdout,
help='file to write geolocated tweets to (defaults to standard '
'output)')
parser.add_argument('--debug', '-d',
action='store_true',
help='turn on debug (verbose) mode')
return parser.parse_args()


def open_file(filename, mode):
    # Already an open stream (e.g. the stdin/stdout defaults from argparse).
    if not isinstance(filename, str):
        return filename
    # Transparently open gzip-compressed files.
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    else:
@@ -63,55 +71,78 @@ def main():
has_place = has_coordinates = has_geo = has_profile_location = 0
resolution_method_counts = collections.defaultdict(int)
skipped_tweets = resolved_tweets = total_tweets = 0

with open_file(args.input_file, 'rb') as input_file, open_file(args.output_file, 'wb') as output_file:
for i, input_line in enumerate(input_file):
# Show warnings from the input file, not the Python source code.
def showwarning(message, category, filename, lineno,
file=sys.stderr, line=None):
sys.stderr.write(warnings.formatwarning(
message, category, input_file.name, i+1,
line=''))
warnings.showwarning = showwarning
try:
if len(input_line.strip()) == 0:

fi = open_file(args.input_file, "rb")
fo = open_file(args.output_file, 'wb')
with jsonlines.Writer(fo) as writer:
with jsonlines.Reader(fi) as reader:
for i, tweet in enumerate(reader.iter(skip_invalid=True, skip_empty=True)):
total_tweets += 1
if args.debug:
# DEBUGGING
print('-'*70)
print(json.dumps(tweet, indent=4, sort_keys=True))
print(type(tweet))
data = tweet.get("data")
includes = tweet.get("includes")
geo = tweet.get("data", {}).get("geo")
print("\ndata")
print(data)
print("\nincludes")
print(includes)
print("\ngeo")
print(geo)
# break
# END DEBUGGING

# Show warnings from the input file, not the Python source code.
def showwarning(message, category, filename, lineno, file=sys.stderr, line=None):
sys.stderr.write(
warnings.formatwarning(message, category, args.input_file, i+1, line='')
)
warnings.showwarning = showwarning
# Skip deleted and status_withheld tweets
if "delete" in tweet or "status_withheld" in tweet:
skipped_tweets += 1
continue
tweet = json.loads(input_line)
except ValueError:
warnings.warn('Invalid JSON object')
skipped_tweets += 1
continue
# Collect statistics on the tweet.
if tweet.get('place'):
has_place += 1
if tweet.get('coordinates'):
has_coordinates += 1
if tweet.get('geo'):
has_geo += 1
if tweet.get('user', {}).get('location', ''):
has_profile_location += 1
# Perform the actual resolution.
resolution = resolver.resolve_tweet(tweet)
if resolution:
location = resolution[1]
tweet['location'] = location
# More statistics.
resolution_method_counts[location.resolution_method] += 1
if location.city:
city_found += 1
elif location.county:
county_found += 1
elif location.state:
state_found += 1
elif location.country:
country_found += 1
resolved_tweets += 1
json_output = json.dumps(tweet, cls=LocationEncoder).encode()
output_file.write(json_output)
output_file.write(bytes('\n'.encode(encoding='ascii')))
total_tweets += 1


# TODO: in APIv2, statistics can't work like before since fields are different.
# Collect statistics on the tweet.
if tweet.get('place'):
has_place += 1
if tweet.get('coordinates'):
has_coordinates += 1
if tweet.get('geo'):
has_geo += 1
if tweet.get('user', {}).get('location', ''):
has_profile_location += 1
# Perform the actual resolution.
resolution = resolver.resolve_tweet(tweet)
if resolution:
location = resolution[1]
tweet['location'] = location
# More statistics.
resolution_method_counts[location.resolution_method] += 1
if location.city:
city_found += 1
elif location.county:
county_found += 1
elif location.state:
state_found += 1
elif location.country:
country_found += 1
resolved_tweets += 1
# Round-trip through LocationEncoder so the resolved Location
# becomes plain JSON types; jsonlines serializes objects itself,
# so writing a pre-encoded string here would double-encode it.
writer.write(json.loads(json.dumps(tweet, cls=LocationEncoder)))
fi.close()
fo.close()

if args.statistics:
# TODO: change the statistics to correspond with the new API v2
print('Skipped %d tweets.' % skipped_tweets, file=sys.stderr)
print('Tweets with "place" key: %d; '
'"coordinates" key: %d; '