
Commit

Merge pull request #2 from aryamccarthy/master
unittests, code formatting with black, and Type inclusion
AADeLucia authored Jun 7, 2020
2 parents b152396 + d235ce5 commit 4a354f3
Showing 6 changed files with 121 additions and 58 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ for tweet in reader.read_tweets():

# Write out filtered Tweets
writer = TweetWriter("filtered.json")
writer.write(tweets)
writer.write(filtered_tweets)
```


@@ -57,11 +57,12 @@ writer.write(tweets)
A basic example using the default Tokenizer settings is below.

```python
from littlebird import TweetReader, TweetTokenier
from littlebird import TweetReader, TweetTokenizer

# File in JSONlines form. Automatically handles GZIP files.
tweet_file = "2014_01_02.json.gz"
reader = TweetReader(tweet_file)
tokenizer = TweetTokenizer()

# Iterate over Tweets
# Make sure to check for the "truncated" field otherwise you will only access the
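The README example above is cut off in this diff view. For context, a minimal end-to-end sketch of the usage it introduces, under stated assumptions: the file name is a placeholder, and the `truncated` / `extended_tweet` handling follows the standard Twitter API payload rather than anything added in this commit.

```python
from littlebird import TweetReader, TweetTokenizer

# Placeholder input file; gzipped JSONlines is handled automatically.
tweet_file = "2014_01_02.json.gz"
reader = TweetReader(tweet_file)
tokenizer = TweetTokenizer()

tokenized_tweets = []
for tweet in reader.read_tweets():
    # Truncated Tweets keep their full text under "extended_tweet"
    # (assumption based on the standard Twitter API payload).
    if tweet.get("truncated", False):
        text = tweet["extended_tweet"]["full_text"]
    else:
        text = tweet["text"]
    tokenized_tweets.append(tokenizer.tokenize(text))
```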
2 changes: 0 additions & 2 deletions littlebird/__init__.py
@@ -1,5 +1,3 @@

from .tweet_utils import TweetReader
from .tweet_utils import TweetWriter
from .tweet_tokenizer import TweetTokenizer

78 changes: 43 additions & 35 deletions littlebird/tweet_tokenizer.py
@@ -4,16 +4,13 @@
Author: Alexandra DeLucia
"""
# Standard imports
import os
import argparse
import logging
import gzip
import zlib
from collections import Counter
import random

from typing import Iterable, List, Optional, Set, Union

# Third-party imports
import jsonlines as jl
import regex

# Local modules
@@ -22,13 +19,9 @@
# Configurations
logging.basicConfig(level=logging.INFO)

# Define erros
class Error(Exception):
pass


class LanguageNotSupportedError(Error):
def __init__(self, lang):
class LanguageNotSupportedError(ValueError):
def __init__(self, lang: str):
self.lang = lang


@@ -37,13 +30,15 @@ class TweetTokenizer:
"""
Open Twitter files and process the text content.
"""
def __init__(self,
language="en",
token_pattern=r"\b\w+\b",
stopwords=None,
remove_hashtags=False,
lowercase=True
):

def __init__(
self,
language: str = "en",
token_pattern: str = r"\b\w+\b",
stopwords: Optional[Iterable[str]] = None,
remove_hashtags: bool = False,
lowercase: bool = True,
):
"""
Currently only English and Arabic are supported languages ("en" and "ar").
There are many options for the token pattern, and it should be chosen to fit your use case.
@@ -67,18 +62,21 @@ def __init__(self,
self.URL_RE = r"http(s)?:\/\/[\w\.\/\?\=]+"
self.RT_RE = r"\bRT\b"
self.HASHTAG_RE = regex.compile(r"#[\p{L}\p{N}_]+")
self.REMOVAL_RE = regex.compile("|".join([self.HANDLE_RE, self.URL_RE, self.RT_RE]))
self.REMOVAL_RE = regex.compile(
"|".join([self.HANDLE_RE, self.URL_RE, self.RT_RE])
)
self.WHITESPACE_RE = regex.compile(r"\s+")
self.TOKEN_RE = regex.compile(token_pattern)
self.remove_hashtags = remove_hashtags
self.lowercase = lowercase
self.stopwords: Optional[Set[str]]
if stopwords is not None:
self.stopwords = set(stopwords)
else:
self.stopwords = None
return

def tokenize(self, tweet):
def tokenize(self, tweet: str) -> List[str]:
"""
:param tweet: Tweet text to tokenize
:return: tokens
@@ -88,11 +86,11 @@ def tokenize(self, tweet):

# Remove URLs, handles, "RT"
tweet = self.REMOVAL_RE.sub(" ", tweet)

# Lowercase
if self.lowercase:
tweet = tweet.lower()

# Tokenize
tokens = self.TOKEN_RE.findall(tweet)

@@ -101,8 +99,9 @@ def tokenize(self, tweet):
tokens = [t for t in tokens if t not in self.stopwords]
return tokens


def tokenize_tweet_file(self, input_file, sample_size=-1, return_tokens=False):
def tokenize_tweet_file(
self, input_file: str, sample_size: int = -1, return_tokens: bool = False
) -> Optional[Union[List[str], List[List[str]]]]:
"""
Return tokenized tweets in file
@@ -123,28 +122,36 @@ def tokenize_tweet_file(self, input_file, sample_size=-1, return_tokens=False):
# Check for empty file
if num_tweets == 0:
logging.warning(f"{input_file} has no tweets.")
return
return None

# Sample from the file's tweets
if sample_size != -1:
if sample_size < num_tweets:
all_tweet_text = random.sample(all_tweet_text, k=sample_size)

# Tokenize the tweets and return
# Some tweets have no valid tokens. Skip them.
tweet_text = map(self.tokenize, all_tweet_text)
tweet_text_ = map(self.tokenize, all_tweet_text)
tweet_text: Union[List[str], List[List[str]]]
if return_tokens:
tweet_text = [t for t in tweet_text if t != []]
tweet_text = [t for t in tweet_text_ if t != []]
else:
tweet_text = [" ".join(t) for t in tweet_text if t != []]
tweet_text = [" ".join(t) for t in tweet_text_ if t != []]
return tweet_text


def parse_args():
def parse_args() -> argparse.Namespace:
"""Command-line parser for use with scripting"""
parser = argparse.ArgumentParser()
parser.add_argument("--input-files", type=str, nargs="+", help="List of GZIP'd Tweet files")
parser.add_argument("--sample", type=int, default=-1, help="Number of tweets to use for the keyword counts. Only for Tweet files.")
parser.add_argument(
"--input-files", type=str, nargs="+", help="List of GZIP'd Tweet files"
)
parser.add_argument(
"--sample",
type=int,
default=-1,
help="Number of tweets to use for the keyword counts. Only for Tweet files.",
)
parser.add_argument("--language", choices=["en", "ar"])
parser.add_argument("--output-dir")
parser.add_argument("--output-file")
@@ -155,7 +162,8 @@ def parse_args():
args = parse_args()

tokenizer = TweetTokenizer(remove_hashtags=True)
tweet_text = tokenizer.tokenize_tweet_file("/home/aadelucia/files/minerva/raw_tweets/tweets_en/2014_01_01_MA.gz", sample_size=10)
tweet_text = tokenizer.tokenize_tweet_file(
"/home/aadelucia/files/minerva/raw_tweets/tweets_en/2014_01_01_MA.gz",
sample_size=10,
)
print(tweet_text)
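To illustrate the newly typed `TweetTokenizer.__init__` and `tokenize` signatures above, a small usage sketch; the stopword list, token pattern, and example Tweet are illustrative values, not part of this commit.

```python
from littlebird import TweetTokenizer

# Illustrative settings; the defaults are language="en", token_pattern=r"\b\w+\b",
# stopwords=None, remove_hashtags=False, lowercase=True.
tokenizer = TweetTokenizer(
    token_pattern=r"\b\w+\b",
    stopwords=["amp", "via"],  # any iterable of strings is accepted
    remove_hashtags=True,
    lowercase=True,
)

# Handles, URLs, and "RT" are stripped before tokenizing, and with
# remove_hashtags=True the hashtag should be dropped as well, so this
# is expected to yield something like ["check", "this", "out"].
tokens = tokenizer.tokenize("RT @user Check this out #tokenizers http://t.co/abc123")
```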


28 changes: 15 additions & 13 deletions littlebird/tweet_utils.py
@@ -6,11 +6,10 @@
# Standard imports
import logging
import sys
import os
import gzip
import zlib
import tarfile
import zipfile

from typing import Any, Iterable, List, Union

# Third-party imports
import jsonlines as jl
@@ -22,7 +21,8 @@

class TweetReader:
"""Iterator to read a Twitter file"""
def __init__(self, filename):

def __init__(self, filename: str):
self.path = filename

try:
@@ -39,8 +39,8 @@ def __init__(self, filename):
except Exception as err:
logging.error(f"Issue opening {filename}:\n{err}")
sys.exit(1)
def read_tweets(self):

def read_tweets(self) -> Iterable[Any]:
try:
with jl.Reader(self.f) as reader:
for tweet in reader.iter(skip_empty=True, skip_invalid=True):
@@ -49,26 +49,27 @@ def __init__(self, filename):
logging.error(f"Error reading {self.path} of type {self.ftype}: {err}")
self.f.close()
sys.exit(1)

# Close file
self.f.close()
return


class TweetWriter:
"""Write Tweets in jsonlines format"""
def __init__(self, filename):

def __init__(self, filename: str):
self.path = filename
try:
if ".gz" in filename:
if filename.endswith(".gz"):
self.f = gzip.open(filename, "w+")
else:
self.f = open(filename, "w+")
except Exception as err:
logging.error(f"Issue opening {filename}:\n{err}")
sys.exit(1)
def write(self, tweets):

def write(self, tweets: Union[Any, List[Any]]) -> None:
"""Write Tweet or list of Tweets to file"""
with jl.Writer(self.f) as writer:
if not isinstance(tweets, list):
@@ -79,8 +80,9 @@ def write(self, tweets):


if __name__ == "__main__":
reader = TweetReader("/home/aadelucia/files/minerva/data/tweets_en/2014_01_01_MA.gz")
reader = TweetReader(
"/home/aadelucia/files/minerva/data/tweets_en/2014_01_01_MA.gz"
)
for tweet in reader.read_tweets():
print(tweet)
break
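A short round-trip sketch of the `TweetReader` / `TweetWriter` pair shown above, mirroring the README; the file names and the filter condition are placeholders.

```python
from littlebird import TweetReader, TweetWriter

# Placeholder path; gzipped input is detected from the file type.
reader = TweetReader("tweets.json.gz")

# Keep only Tweets whose text mentions a keyword (arbitrary example condition).
filtered_tweets = [
    tweet for tweet in reader.read_tweets()
    if "ebola" in tweet.get("text", "").lower()
]

# TweetWriter.write accepts a single Tweet or a list of Tweets.
writer = TweetWriter("filtered.json")
writer.write(filtered_tweets)
```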

8 changes: 2 additions & 6 deletions setup.py
@@ -18,16 +18,12 @@
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(),
install_requires=[
"jsonlines",
"regex",
"filetype",
],
install_requires=["jsonlines", "regex", "filetype"],
license="MIT",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
python_requires=">=3.6",
)
58 changes: 58 additions & 0 deletions tests/test_tweet_tokenizer.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
"""
Tests for littlebird/tweet_tokenizer.py
"""
import unittest
from typing import List

import littlebird
from littlebird import TweetTokenizer


class TestTweetTokenizer(unittest.TestCase):
def test_default_tokenize(self):
tokenizer = TweetTokenizer()
tweet: str = (
"Me: I think I have Ebola "
"Doctor: when did you start feel"
"ing symptoms Me: bout a w"
"eek ago Everyone in hospi"
"tal: http://t.co/LoIPKzvOmT"
)
tokenized = tokenizer.tokenize(tweet)
right_answer: List[str] = [
"me",
"i",
"think",
"i",
"have",
"ebola",
"doctor",
"when",
"did",
"you",
"start",
"feeling",
"symptoms",
"me",
"bout",
"a",
"week",
"ago",
"everyone",
"in",
"hospital",
]
self.assertListEqual(tokenized, right_answer)

def test_supported_langs(self):
with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
tokenizer = TweetTokenizer(language="zxx")
with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
tokenizer = TweetTokenizer(language="es")
with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
tokenizer = TweetTokenizer(language="english")


if __name__ == "__main__":
unittest.main()
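With the package installed, the suite should run with `python -m unittest discover tests`. Below is a sketch of how a further test case might look, e.g. covering `remove_hashtags`; the expected token list is an assumption about the tokenizer's behavior, not a test included in this commit.

```python
import unittest

from littlebird import TweetTokenizer


class TestHashtagRemoval(unittest.TestCase):
    def test_remove_hashtags(self):
        # With remove_hashtags=True, "#Ebola" is assumed to be stripped
        # entirely rather than tokenized as "ebola".
        tokenizer = TweetTokenizer(remove_hashtags=True)
        tokenized = tokenizer.tokenize("Wash your hands #Ebola")
        self.assertListEqual(tokenized, ["wash", "your", "hands"])


if __name__ == "__main__":
    unittest.main()
```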
