Skip to content

Commit

Permalink
Merge pull request #323 from roboflow/import-paligemma-format
Browse files Browse the repository at this point in the history
Import paligemma format into text-image-pairs project
  • Loading branch information
tonylampada authored Sep 16, 2024
2 parents f0656cb + 009e322 commit 66bbe0d
Show file tree
Hide file tree
Showing 19 changed files with 82 additions and 15 deletions.
2 changes: 1 addition & 1 deletion roboflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from roboflow.models import CLIPModel, GazeModel # noqa: F401
from roboflow.util.general import write_line

__version__ = "1.1.44"
__version__ = "1.1.45"


def check_key(api_key, model, notebook, num_retries=0):
Expand Down
3 changes: 2 additions & 1 deletion roboflow/roboflowpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def download(args):


def import_dataset(args):
rf = roboflow.Roboflow()
api_key = load_roboflow_api_key(args.workspace)
rf = roboflow.Roboflow(api_key)
workspace = rf.workspace(args.workspace)
workspace.upload_dataset(
dataset_path=args.folder,
Expand Down
41 changes: 32 additions & 9 deletions roboflow/util/folderparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .image_utils import load_labelmap

IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp"}
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv"}
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv", ".jsonl"}
LABELMAPS_EXTENSIONS = {".labels", ".yaml", ".yml"}


Expand Down Expand Up @@ -107,13 +107,14 @@ def _map_annotations_to_images_1tomany(images, annotationFiles):
dirname = image["dirname"]
annotationsInSameDir = annotationsByDirname.get(dirname, [])
if annotationsInSameDir:
if len(annotationsInSameDir) > 1:
print(f"warning: found multiple annotation files on dir {dirname}")
annotationFile = annotationsInSameDir[0]
format = annotationFile["parsedType"]
image["annotationfile"] = _filterIndividualAnnotations(
image, annotationFile, format, imgRefMap, annotationMap
)
for annotationFile in annotationsInSameDir:
format = annotationFile["parsedType"]
filtered_annotations = _filterIndividualAnnotations(
image, annotationFile, format, imgRefMap, annotationMap
)
if filtered_annotations:
image["annotationfile"] = filtered_annotations
break


def _build_image_and_annotation_maps(annotationFiles):
Expand Down Expand Up @@ -182,11 +183,16 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio
return _annotation
else:
return None
elif format == "jsonl":
jsonlLines = [json.dumps(line) for line in parsed if line["image"] == image["name"]]
if jsonlLines:
_annotation = {"name": "annotation.jsonl", "rawText": "\n".join(jsonlLines)}
return _annotation
return None


def _loadAnnotations(folder, annotations):
valid_extensions = {".json", ".csv"}
valid_extensions = {".json", ".csv", ".jsonl"}
annotations = [a for a in annotations if a["extension"] in valid_extensions]
for ann in annotations:
extension = ann["extension"]
Expand All @@ -197,12 +203,29 @@ def _loadAnnotations(folder, annotations):
if parsedType:
ann["parsed"] = parsed
ann["parsedType"] = parsedType
elif extension == ".jsonl":
ann["parsed"] = _read_jsonl(f"{folder}{ann['file']}")
ann["parsedType"] = "jsonl"
elif extension == ".csv":
ann["parsedType"] = "csv"
ann["parsed"] = _parseAnnotationCSV(f"{folder}{ann['file']}")
return annotations


def _read_jsonl(path):
data = []
with open(path) as file:
for linenum, line in enumerate(file, 1):
if not line:
continue
try:
json_object = json.loads(line.strip())
data.append(json_object)
except json.JSONDecodeError:
print(f"Warning: Skipping invalid JSON line in {path}:{linenum}")
return data


def _parseAnnotationCSV(filename):
# TODO: use a proper CSV library?
with open(filename) as f:
Expand Down
5 changes: 5 additions & 0 deletions tests/datasets/paligemma/README.dataset.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# ChartQA > 2024-08-28 7:21pm
https://universe.roboflow.com/roboflow-jvuqo/chartqa-c9zny

Provided by a Roboflow user
License: CC BY 4.0
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions tests/datasets/paligemma/dataset/_annotations.test.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2014?","suffix":"Electronics"}
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
{"image":"eaab023f1ce380c4c9163415facc3c0d_png.rf.01c5a1f19653c056bbb3b0c8fc2d752d.jpg","prefix":"What's the percentage value of leftmost bar?","suffix":"24"}
4 changes: 4 additions & 0 deletions tests/datasets/paligemma/dataset/_annotations.train.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"image":"63a6c783083d5c7c7290bc81877a4ee9_png.rf.5c02d037f48bc3df56e6d0e3e6e053e4.jpg","prefix":"How many research and public policy oriented organizations were there among the registered environmental and conservation organizations in the United States in 2005?","suffix":"372"}
{"image":"63a6c783083d5c7c7290bc81877a4ee9_png.rf.5c02d037f48bc3df56e6d0e3e6e053e4.jpg","prefix":"How many research and public policy oriented organizations were there among the registered environmental and conservation organizations in the United States in 2005?","suffix":"372"}
{"image":"5964b4c268577652f171d52dc317d82d_png.rf.5bf49f8aa575f586001710b1d79968fd.jpg","prefix":"What was the crude birth rate in Costa Rica in 2019?","suffix":"13.69"}
{"image":"5964b4c268577652f171d52dc317d82d_png.rf.5bf49f8aa575f586001710b1d79968fd.jpg","prefix":"What was the crude birth rate in Costa Rica in 2019?","suffix":"13.69"}
3 changes: 3 additions & 0 deletions tests/datasets/paligemma/dataset/_annotations.valid.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"image":"fa68474f5b30c3d647ec1f5cddf41570_png.rf.000949c9aafeb8c594a936a0ef92993f.jpg","prefix":"How many murders and manslaughters were recorded by the Belgian police in 2020?","suffix":"874"}
{"image":"fa68474f5b30c3d647ec1f5cddf41570_png.rf.000949c9aafeb8c594a936a0ef92993f.jpg","prefix":"How many murders and manslaughters were recorded by the Belgian police in 2020?","suffix":"874"}
{"image":"aca6fd05e9b2830518288ba082aa6f76_png.rf.001543e209328197472f6587dfa8a6d6.jpg","prefix":"What was the unemployment rate in Chile in 2020?","suffix":"11.51"}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion tests/manual/debugme.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
# f"import {thisdir}/data/cultura-pepino-yolov8_voc -w wolfodorpythontests -p yellow-auto -c 100".split() # noqa: E501 // docs
# f"import {thisdir}/data/cultura-pepino-yolov5pytorch -w wolfodorpythontests -p yellow-auto -c 100 -n papaiasso".split() # noqa: E501 // docs
# f"import {thisdir}/../datasets/mosquitos -w wolfodorpythontests -p yellow-auto -n papaiasso".split() # noqa: E501 // docs
f"deployment list".split() # noqa: E501 // docs
# f"deployment list".split() # noqa: E501 // docs
f"import -w tonyprivate -p meh-plvrv {thisdir}/../datasets/paligemma/".split() # noqa: E501 // docs
)
args.func(args)
5 changes: 4 additions & 1 deletion tests/manual/uselocal
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/env bash
cp data/.config-staging data/.config
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cp $SCRIPT_DIR/data/.config-staging $SCRIPT_DIR/data/.config
export API_URL=https://localhost.roboflow.one
export APP_URL=https://localhost.roboflow.one
export DEDICATED_DEPLOYMENT_URL=https://staging.roboflow.cloud
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config
# need to set it in /etc/hosts to the IP of host.docker.internal!
4 changes: 3 additions & 1 deletion tests/manual/useprod
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/env bash

cp data/.config-prod data/.config
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cp $SCRIPT_DIR/data/.config-prod $SCRIPT_DIR/data/.config
export API_URL=https://api.roboflow.com
export APP_URL=https://app.roboflow.com
export OBJECT_DETECTION_URL=https://detect.roboflow.one
export DEDICATED_DEPLOYMENT_URL=https://roboflow.cloud
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config
4 changes: 3 additions & 1 deletion tests/manual/usestaging
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/env bash

cp data/.config-staging data/.config
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cp $SCRIPT_DIR/data/.config-staging $SCRIPT_DIR/data/.config
export API_URL=https://api.roboflow.one
export APP_URL=https://app.roboflow.one
export OBJECT_DETECTION_URL=https://lambda-object-detection.staging.roboflow.com
export DEDICATED_DEPLOYMENT_URL=https://staging.roboflow.cloud
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config
14 changes: 14 additions & 0 deletions tests/util/test_folderparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,20 @@ def test_parse_mosquitos_csv(self):
expected += "train_10308.jpeg,1058,943,japonicus/koreicus,28,187,908,815\n"
assert testImage["annotationfile"]["rawText"] == expected

def test_paligemma_format(self):
    """Parse the PaliGemma fixture folder and check one image's JSONL annotation.

    The selected image must receive an ``annotation.jsonl`` payload whose
    ``rawText`` is exactly its two expected rows, newline-joined with no
    trailing newline.
    """
    dataset_root = f"{thisdir}/../datasets/paligemma"
    wanted_file = "/dataset/de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg"

    result = folderparser.parsefolder(dataset_root)
    matching = [img for img in result["images"] if img["file"] == wanted_file]
    annotation = matching[0]["annotationfile"]

    assert annotation["name"] == "annotation.jsonl"

    # The expected rows carry a space after each ':' and ','.
    expected_rows = [
        '{"image": "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg",'
        ' "prefix": "Which sector had the highest ROI in 2013?", "suffix": "Retail"}',
        '{"image": "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg",'
        ' "prefix": "Which sector had the highest ROI in 2014?", "suffix": "Electronics"}',
    ]
    assert annotation["rawText"] == "\n".join(expected_rows)


def _assertJsonMatchesFile(actual, filename):
with open(filename) as file:
Expand Down

0 comments on commit 66bbe0d

Please sign in to comment.