Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
  • Loading branch information
ramakrishna232 committed May 13, 2024
1 parent bad291c commit 36677c1
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 7 deletions.
3 changes: 1 addition & 2 deletions language-identification-submission/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ FROM fschlatt/natural-language-processing-exercises:0.0.1
ADD run.py /code/run.py
ADD model.joblib /code/model.joblib

RUN pip3 install spacy \
&& python3 -m spacy download en_core_web_sm


ENTRYPOINT ["python3", "/code/run.py"]
Binary file modified language-identification-submission/model.joblib
Binary file not shown.
10 changes: 5 additions & 5 deletions language-identification-submission/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,17 @@
}

# text preprocess
# Helper function for text preprocessing
# Helper function for text preprocessing
def preprocess_text(text):
# Regex to replace anything that is not alphabet or whitespace with empty string
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Converting the text to all lowercase to remove case mismatches
text = text.lower()
# Tokenizing the text by breaking it up into smaller components (tokens)
# Tokenizing the text by breaking it up into smaller components
text = word_tokenize(text)
# Stemming the text to remove word affixes (prefixes and suffixes)
# Using stemming to remove word prefixes and suffixes
text = [stemmer.stem(token) for token in text]
# Lemmatization to bring words down to their root forms
# Using lemmatization to bring words to their normal (root) form
text = [lemmatizer.lemmatize(token) for token in text]
# Stopword removal to remove words that don’t provide any additional information
text = [word for word in text if word not in stopwords]
Expand Down Expand Up @@ -103,7 +103,7 @@ def preprocess_text(text):
y = le.fit_transform(df["lang"])

final_data = pd.DataFrame(np.c_[df["text"], y], columns=["text", "lang"])
print("Final data:\n", final_data.head())



# Train the model
Expand Down

0 comments on commit 36677c1

Please sign in to comment.