Commit
all done - just need some cleanup of instructions and dataset names
thesteve0 committed Dec 17, 2024
1 parent eff09ae commit b71e4b0
Showing 3 changed files with 217 additions and 8 deletions.
112 changes: 110 additions & 2 deletions 10_final_predictions.py
@@ -1,10 +1,118 @@
import fiftyone as fo
import os
import tempfile
import torch
from fiftyone import Classification
from ultralytics import YOLO

"""
We have trained the model on the < 5% of the original data.
Now we are going to take this newly trained model and train it some more on another chunk of the data.
We are going to have the new model predict the new images and then correct it to the ground truth again.
First step is going to be splitting out more data that is not in the first training set.
Then run the new model over it producing the new predictions
"""

# This is the number we need to make our total sampled data = 10%
FIRST_TRAINING = "first_labeled_dataset"
SECOND_TRAINING = "second_labeled_dataset"
FINAL_OUTPUT = "final_predicted_photos"
IMAGE_SIZE = 704
MODEL_LOCATION = "/home/spousty/git/voxel-photo-album/sp_final_training_photos_yolo11/output/weights/best.pt"


def get_torch_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")


def load_data(datasetname) -> fo.Dataset:
    return fo.load_dataset(datasetname)


def split_again(dataset_with_orig, first_training) -> fo.Dataset:
    # Take a random sample from the original data, excluding the first set of sample data.
    # Sample ID is not generally a good unique ID. For FiftyOne, filepath is what you should use
    # when working with the same dataset over and over again.
    if "second_play_photos" in fo.list_datasets():
        fo.delete_dataset("second_play_photos")
    return dataset_with_orig.exclude_by("filepath", first_training.values("filepath"))


def run_predictions(dataset):
    model = YOLO(MODEL_LOCATION)  # load a custom model
    # naive_model = YOLO(f"yolo11x-cls.pt")

    # "Export" our sample images to disk as symlinks
    data_dir = tempfile.mkdtemp()
    dataset.export(export_dir=data_dir, dataset_type=fo.types.ImageDirectory, export_media="symlink")

    # Predict with the model
    results = model(
        source=data_dir,
        device=get_torch_device(),
        imgsz=IMAGE_SIZE,
        stream=True,
        # save=True,
        project="final_predictions"
    )

    return results


def extract_orig_path(dataset):
    sample = dataset.first()
    result = sample.filepath.replace(sample.filename, "")
    return result


def merge_all_training_data():
    first_data = fo.load_dataset(FIRST_TRAINING).view().clone()
    second_data_view = fo.load_dataset(SECOND_TRAINING).view()
    first_data.add_samples(second_data_view)
    return first_data


if __name__ == '__main__':
    print("starting")
    whole_dataset = load_data("photo_album")
    original_path = extract_orig_path(whole_dataset)
    all_training_data = merge_all_training_data()
    if FINAL_OUTPUT in fo.list_datasets():
        fo.delete_dataset(FINAL_OUTPUT)
    remaining_photos = split_again(whole_dataset, all_training_data).clone(FINAL_OUTPUT, persistent=True)
    print("about to predict")
    results = run_predictions(remaining_photos)

    # Convert the results into a dict so we can iterate through the dataset rather than
    # the results. This will allow for faster saves to the dataset rather than individual
    # sample-by-sample saves. The image's name without the path is the key.
    # This is causing an OOM kill - I think there are too many objects in each results object,
    # so keep only the label and confidence from each one.
    results_dict = {}
    for x in results:
        file_name = os.path.basename(x.path)
        label = x.names[x.probs.top1]
        confidence = float(round(x.probs.top1conf.item(), 2))
        results_dict[file_name] = {"label": label, "confidence": confidence}

    # Now for each sample in the new dataset, add the prediction
    for sample in remaining_photos.iter_samples(progress=True, autosave=True):
        res = results_dict[sample.filename]
        sample["prediction"] = Classification(label=res["label"],
                                              confidence=res["confidence"])

    # Display the new dataset and hold it open with a wait()
    # session = fo.launch_app(remaining_photos)
    # session.wait()
    print("Done")

    print("finished")
90 changes: 84 additions & 6 deletions 9_final_fine_tuning.py
@@ -4,15 +4,22 @@
import torch
from ultralytics import YOLO

FIRST_TRAINING = "low_quality_first_labeled_dataset"
SECOND_TRAINING = ""
FIRST_TRAINING = "first_labeled_dataset"
SECOND_TRAINING = "second_labeled_dataset"

DATASET_NAME = 'labeled_dataset'
DEFAULT_MODEL_SIZE = "x"
DATASET_NAME = ''
DEFAULT_MODEL_SIZE = "l"
DEFAULT_IMAGE_SIZE = 704
DEFAULT_EPOCHS = 12
PROJECT_NAME = 'sp_photos_yolo11'
DEFAULT_EPOCHS = 16
SAVE_RESULTS = True
PROJECT_NAME = 'sp_final_training_photos_yolo11'

def merge_datasets():
    first_data = fo.load_dataset(FIRST_TRAINING).clone()
    second_data = fo.load_dataset(SECOND_TRAINING)
    first_data.add_samples(second_data.view())
    print("Done Merging")
    return first_data


def get_torch_device():
@@ -23,8 +30,79 @@ def get_torch_device():
    else:
        return torch.device("cpu")

def train_classifier(
    dataset_name=None,
    model_size=DEFAULT_MODEL_SIZE,
    image_size=DEFAULT_IMAGE_SIZE,
    epochs=DEFAULT_EPOCHS,
    project_name="mislabel_confidence_noise",
    gt_field="ground_truth",
    train_split=None,
    test_split=None,
    **kwargs
):

    # settings.update({"wandb": False})
    if dataset_name:
        dataset = fo.load_dataset(dataset_name)
        dataset.take(round(0.3 * len(dataset))).tag_samples("test")
        dataset.match_tags("test", bool=False).tag_samples("train")
        train = dataset.match_tags("train")
        test = dataset.match_tags("test")
    else:
        train = train_split
        test = test_split

    if model_size is None:
        model_size = "s"
    elif model_size not in ["n", "s", "m", "l", "x"]:
        raise ValueError("model_size must be one of ['n', 's', 'm', 'l', 'x']")

    splits_dict = {
        "train": train,
        "val": test,
        "test": test,
    }

    data_dir = tempfile.mkdtemp()

    for key, split in splits_dict.items():
        split_dir = os.path.join(data_dir, key)
        os.makedirs(split_dir)
        split.export(
            export_dir=split_dir,
            dataset_type=fo.types.ImageClassificationDirectoryTree,
            label_field=gt_field,
            export_media="symlink",
        )

    # Load a pre-trained YOLO11 model for classification
    model = YOLO(f"yolo11{model_size}-cls.pt")

    # Train the model
    model.train(
        data=data_dir,  # Path to the dataset
        epochs=epochs,  # Number of epochs
        imgsz=image_size,  # Image size
        device=get_torch_device(),
        save=SAVE_RESULTS,
        name="output",
        exist_ok=True,
        project=project_name,
    )

    return model

if __name__ == '__main__':
    print("starting")
    merged_data = merge_datasets()
    train_classifier(
        dataset_name=merged_data.name,
        # model_size=args.model_size,
        # image_size=args.image_size,
        # epochs=args.epochs,
        project_name=PROJECT_NAME,
    )

"""
PREP DATA
23 changes: 23 additions & 0 deletions README.md
@@ -1,3 +1,26 @@
I got lost in the challenge of finding good classes and forgot my main goal.
My goal was to quickly find the raw images I wanted to work with for production.
This is not "given a photo of a dog, which breed is it?" This is "here is a general photo that could be of anything; do I
care about it?"

I only need minimal classes for that:
1. I could simply do - People vs Not people
2. Or I could do people, animal, plant, landscape, buildings

People means there is a person in the photo. Even if the people are small or there is only a single person in the picture,
it should be considered a people picture.

For the second scheme, classification will depend more on my deciding what the intended subject of the photo is. For example,
a plant with an insect centered on it would be animal.

There is an open question about how to handle photos that I am just not interested in, like a picture of a box, or a shot from
a doorway that just is not interesting enough to develop. There are no people in the picture, and it does not contain any of the classes I am interested in. I think this is basically teaching the model my concept of uninteresting.

Doing it with the 16 classes led to overlapping categories and unclean labeling. This in turn led to poor model performance,
but for "understandable" reasons. This was not a problem with the models but a problem with the data prep.

Markus gave the suggestion, which is a really good one, to run this in two stages. First, train a model for people vs. not-people. Then use that model's not-people output to train a multi-class model. A rough sketch is below.
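
A minimal sketch of what selecting the stage-two data could look like with FiftyOne. The `people_or_not` field, the `not_people` label, and the dataset names are placeholder assumptions, not code that exists in this repo yet:

```python
import fiftyone as fo
from fiftyone import ViewField as F

# Assumes the stage-one binary classifier has already written its
# predictions to a "people_or_not" field on every sample
dataset = fo.load_dataset("photo_album")

# Keep only the photos the first model decided contain no people
not_people = dataset.match(F("people_or_not.label") == "not_people")

# Clone them into their own dataset to feed the stage-two multi-class
# training (e.g. train_classifier in 9_final_fine_tuning.py)
stage_two_data = not_people.clone("not_people_photos")
```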

@@TODO I need to rename the datasets, and their references in code, to make more sense
The way to rename a dataset is
```python
dataset = fo.load_dataset("old_dataset_name")
dataset.name = "new_dataset_name"
```
