Add progress bar #75

Draft · wants to merge 1 commit into master
33 changes: 19 additions & 14 deletions chaine/crf.py
@@ -13,6 +13,8 @@
 from operator import itemgetter
 from pathlib import Path
 
+from progress.bar import Bar
+
 from chaine._core.crf import Model as _Model
 from chaine._core.crf import Trainer as _Trainer
 from chaine.logging import Logger, set_verbosity
@@ -183,20 +185,23 @@ def train(
             Path to model location.
         """
         LOGGER.info("Loading data set")
-        for i, (sequence, labels_) in enumerate(zip(dataset, labels)):
-            if not is_valid_sequence(sequence):
-                raise ValueError(f"Invalid format: {sequence}")
-
-            # log progress every 100 data points
-            if i > 0 and i % 100 == 0:
-                LOGGER.debug(f"{i} processed data points")
-
-            try:
-                self._trainer.append(sequence, labels_)
-            except Exception as message:
-                LOGGER.error(message)
-                LOGGER.debug(f"Sequence: {json.dumps(sequence)}")
-                LOGGER.debug(f"Labels: {json.dumps(labels_)}")
+        with Bar("Loading data set...", max=len(labels)) as bar:
+            for i, (sequence, labels_) in enumerate(zip(dataset, labels)):
+                if not is_valid_sequence(sequence):
+                    raise ValueError(f"Invalid format: {sequence}")
+
+                # log progress every 100 data points
+                if i > 0 and i % 100 == 0:
+                    LOGGER.debug(f"{i} processed data points")
+
+                try:
+                    self._trainer.append(sequence, labels_)
+                except Exception as message:
+                    LOGGER.error(message)
+                    LOGGER.debug(f"Sequence: {json.dumps(sequence)}")
+                    LOGGER.debug(f"Labels: {json.dumps(labels_)}")
+
+                bar.next()
 
         # fire!
         LOGGER.info("Start training")
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -8,6 +8,7 @@ build = "build.py"
 
 [tool.poetry.dependencies]
 python = "^3.9"
+progress = "^1.6"
Owner: I actually don't want any third-party dependencies. Since progress is quite a lightweight project, can you just migrate it into chaine? As a submodule?

Owner: A refactoring of progress is necessary in the first place, because we don't need all these different stylings of the progress bar:

https://raw.github.com/verigak/progress/master/demo.gif

Just a single, simple and plain progress bar.
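For reference, a bar with the interface this PR already uses (a context manager with a next() method) fits in a few dependency-free lines. This is only a minimal sketch of that idea, not code from this PR or from progress; the class name and rendering details are made up:

```python
import sys


class ProgressBar:
    """A single, plain progress bar that writes to stderr."""

    def __init__(self, message: str, max: int, width: int = 40) -> None:
        self.message = message
        self.max = max
        self.width = width
        self.index = 0

    def next(self) -> None:
        # advance the bar by one item and redraw the current line
        self.index += 1
        filled = int(self.width * self.index / self.max)
        bar = "#" * filled + "-" * (self.width - filled)
        sys.stderr.write(f"\r{self.message} |{bar}| {self.index}/{self.max}")
        sys.stderr.flush()

    def __enter__(self) -> "ProgressBar":
        return self

    def __exit__(self, *exc) -> None:
        # finish the line so subsequent output starts fresh
        sys.stderr.write("\n")
```

Because it mirrors the progress.bar.Bar calls in train() above, swapping it in would leave the training code unchanged.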

Owner: I'd prefer the Bar one.

Collaborator (Author): I also prefer the Bar one; the integration of progress is currently temporary and for testing purposes. The main reason for this very drafty PR is that I want to continue working on it on another computer.

I'm currently not sure whether I will integrate this directly into the C code or as an outer layer within the Python code, but I would prefer the former, to have more control over the bar.
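To illustrate the second option: an outer layer in Python could simply wrap the iterable that feeds the trainer, leaving the C extension untouched. A hypothetical sketch (track is a made-up helper, not part of chaine):

```python
from collections.abc import Iterable, Iterator
from typing import TypeVar

T = TypeVar("T")


def track(items: Iterable[T], bar) -> Iterator[T]:
    """Yield each item unchanged, advancing the bar once per item."""
    for item in items:
        yield item
        bar.next()
```

train() could then iterate over track(zip(dataset, labels), bar) instead of calling bar.next() by hand. Reporting progress from inside the C training loop would, by contrast, presumably require a callback from C back into Python, which is what gives the finer control mentioned above.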

 
 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
157 changes: 157 additions & 0 deletions test.py
@@ -0,0 +1,157 @@
import chaine

TEST_1 = False

if TEST_1:
    tokens = [[{"index": 0, "text": "John"}, {"index": 1, "text": "Lennon"}]]
    labels = [["B-PER", "I-PER"]]
    new_tokens = []
    new_labels = []

    # duplicate the single example 100,000 times
    for _ in range(100000):
        new_tokens.append(tokens[0])
        new_labels.append(labels[0])

    model = chaine.train(new_tokens, new_labels, verbose=0)
    print(model.predict(tokens))
else:
    import datasets
    from seqeval.metrics import classification_report

    from chaine.typing import Dataset, Features, Sentence, Tags

    dataset = datasets.load_dataset("conll2003")

    print(f"Number of sentences for training: {len(dataset['train']['tokens'])}")
    print(f"Number of sentences for evaluation: {len(dataset['test']['tokens'])}")

    def featurize_token(token_index: int, sentence: Sentence, pos_tags: Tags) -> Features:
        """Extract features from a token in a sentence.

        Parameters
        ----------
        token_index : int
            Index of the token to featurize in the sentence.
        sentence : Sentence
            Sequence of tokens.
        pos_tags : Tags
            Sequence of part-of-speech tags corresponding to the tokens in the sentence.

        Returns
        -------
        Features
            Features representing the token.
        """
        token = sentence[token_index]
        pos_tag = pos_tags[token_index]
        features = {
            "token.lower()": token.lower(),
            "token[-3:]": token[-3:],
            "token[-2:]": token[-2:],
            "token.isupper()": token.isupper(),
            "token.istitle()": token.istitle(),
            "token.isdigit()": token.isdigit(),
            "pos_tag": pos_tag,
        }
        if token_index > 0:
            previous_token = sentence[token_index - 1]
            previous_pos_tag = pos_tags[token_index - 1]
            features.update(
                {
                    "-1:token.lower()": previous_token.lower(),
                    "-1:token.istitle()": previous_token.istitle(),
                    "-1:token.isupper()": previous_token.isupper(),
                    "-1:pos_tag": previous_pos_tag,
                }
            )
        else:
            features["BOS"] = True
        if token_index < len(sentence) - 1:
            next_token = sentence[token_index + 1]
            next_pos_tag = pos_tags[token_index + 1]
            features.update(
                {
                    "+1:token.lower()": next_token.lower(),
                    "+1:token.istitle()": next_token.istitle(),
                    "+1:token.isupper()": next_token.isupper(),
                    "+1:pos_tag": next_pos_tag,
                }
            )
        else:
            features["EOS"] = True
        return features


    def featurize_sentence(sentence: Sentence, pos_tags: Tags) -> list[Features]:
        """Extract features from tokens in a sentence.

        Parameters
        ----------
        sentence : Sentence
            Sequence of tokens.
        pos_tags : Tags
            Sequence of part-of-speech tags corresponding to the tokens in the sentence.

        Returns
        -------
        list[Features]
            List of features representing tokens of a sentence.
        """
        return [
            featurize_token(token_index, sentence, pos_tags)
            for token_index in range(len(sentence))
        ]


    def featurize_dataset(dataset: Dataset) -> list[list[Features]]:
        """Extract features from sentences in a dataset.

        Parameters
        ----------
        dataset : Dataset
            Dataset to featurize.

        Returns
        -------
        list[list[Features]]
            Featurized dataset.
        """
        return [
            featurize_sentence(sentence, pos_tags)
            for sentence, pos_tags in zip(dataset["tokens"], dataset["pos_tags"])
        ]


    def preprocess_labels(dataset: Dataset) -> list[list[str]]:
        """Translate raw labels (i.e. integers) to the respective string labels.

        Parameters
        ----------
        dataset : Dataset
            Dataset whose labels to preprocess.

        Returns
        -------
        list[list[str]]
            Preprocessed labels.
        """
        labels = dataset.features["ner_tags"].feature.names
        return [[labels[index] for index in indices] for indices in dataset["ner_tags"]]

    train_sentences = featurize_dataset(dataset["train"])
    train_labels = preprocess_labels(dataset["train"])

    # use only the first 100 sentences to keep the test run fast
    train_sentences = train_sentences[:100]
    train_labels = train_labels[:100]

    model = chaine.train(train_sentences, train_labels, verbose=0)

    test_sentences = featurize_dataset(dataset["test"])
    test_labels = preprocess_labels(dataset["test"])

    predictions = model.predict(test_sentences)

    print(classification_report(test_labels, predictions))