From bcb1d2fb0c6cc513e271840694f1a81d7f445dfa Mon Sep 17 00:00:00 2001 From: capjamesg Date: Tue, 18 Feb 2025 18:23:13 +0000 Subject: [PATCH] Update results --- index.html | 46 +++++++++++++++++------------------------ results/2025-02-18.json | 36 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 45 deletions(-) diff --git a/index.html b/index.html index 3a230ac..8355bab 100644 --- a/index.html +++ b/index.html @@ -35,7 +35,7 @@

How's GPT O1 Doing?

-

This website measures how GPT O1 performs across a range of experiments.

+

This website measures how GPT O1 performs across a range of experiments.

We test tasks we know GPT O1 performs well at (e.g., classification) to measure regressions, as well as tasks GPT O1 struggles with (e.g., odometer OCR) to measure performance improvements and changes.

You can contribute your own tests, too! See the GitHub README for contributing instructions.

@@ -78,7 +78,7 @@

Counting

-

Can GPT-4V count the number of objects within an image?

+

Can GPT count the number of objects within an image?

@@ -132,7 +132,7 @@

Result

Document OCR

-

Can GPT-4V read a document and return the exact characters in the text?

+

Can GPT read a document and return the exact characters in the text?

@@ -186,7 +186,7 @@

Result

Handwriting OCR

-

Can GPT-4V read handwriting?

+

Can GPT read handwriting?

@@ -240,7 +240,7 @@

Result

Structured Data OCR

-

Can GPT-4V extract structured data from an image?

+

Can GPT extract structured data from an image?

@@ -270,7 +270,7 @@

Structured Data OCR

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

-

Today's request cost $0.012

+

Today's request cost $0.014

@@ -284,15 +284,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
Failed to produce a valid JSON output: [
-  {
-    "name": "Mary Thomas",
-    "time_per_day": 1,
-    "medication": "Atenolol",
-    "dosage": 100,
-    "rx_number": "1234567-12345"
-  }
-]
+
Failed to produce a valid JSON output: 

Test submitted by Roboflow

@@ -302,7 +294,7 @@

Result

Math OCR

-

Can GPT-4V recognize math equations?

+

Can GPT recognize math equations?

@@ -356,7 +348,7 @@

Result

Object Detection

-

Can GPT-4V detect objects in an image?

+

Can GPT detect objects in an image?

@@ -392,7 +384,7 @@

Object Detection

Method

-
We provide GPT-4V with an image with a known object. We ask it to provide a normalized bounding box of the object and for scoring, we calculate the intersection over union (IOU) between the predicted bounding box and the correct bounding box.
+
We provide GPT with an image with a known object. We ask it to provide a normalized bounding box of the object and for scoring, we calculate the intersection over union (IOU) between the predicted bounding box and the correct bounding box.

Prompt

                                             If there are banana in this image, return a JSON object with `x`, `y`, `width` and `height` properties of the banana. All values should be normalized between 0-1 and x&y should be the center point.
@@ -410,7 +402,7 @@ 

Result

Graph Understanding

-

Can GPT-4V identify points on a graph?

+

Can GPT identify points on a graph?

@@ -464,7 +456,7 @@

Result

Color Recognition

-

Can GPT-4V identify colors accurately?

+

Can GPT identify colors accurately?

@@ -500,7 +492,7 @@

Color Recognition

Method

-
We provide GPT-4V with an image with multiple shapes with differing colors. We ask it to identify the color of a particular shape in RGB color codes.
+
We provide GPT with an image with multiple shapes with differing colors. We ask it to identify the color of a particular shape in RGB color codes.

Prompt

                                             Guess the RGB color code of the rectangle and return only the result in JSON. The JSON should have three integer properties: 'R', 'G' and 'B'
@@ -518,7 +510,7 @@ 

Result

Annotation Quality Assurance

-

Can GPT-4V identify image labeling mistakes?

+

Can GPT identify image labeling mistakes?

@@ -554,7 +546,7 @@

Annotation Quality Assurance

Method

-
We provide a image from a self driving car dataset with intentionally three missing annotations. We ask GPT-4V to identify the number of missing annotations. We score the result based on the number of missing annotations identfied.
+
We provide an image from a self-driving car dataset with three intentionally missing annotations. We ask GPT to identify the number of missing annotations. We score the result based on the number of missing annotations identified.

Prompt

                                             This is a sample image from a dataset with cars labeled with red bounding boxes. Are there any missing annotations? Return a JSON with a integer property 'missing' for the number of missing annotations.
@@ -626,7 +618,7 @@ 

Result

Easy Captcha

-

Can GPT-4V break an easy CAPTCHA?

+

Can GPT break an easy CAPTCHA?

@@ -724,7 +716,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
i’m sorry, but i can’t comply with that.
+

                                         

Test submitted by Charles Frye

@@ -743,7 +735,7 @@

Zero Shot Classification

-

Can GPT-4V classify an image without being trained on that particular use case?

+

Can GPT classify an image without being trained on that particular use case?

@@ -773,7 +765,7 @@

Zero Shot Classification

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

-

Today's request cost $0.006

+

Today's request cost $0.01

diff --git a/results/2025-02-18.json b/results/2025-02-18.json index 42c38b1..fcb3edf 100644 --- a/results/2025-02-18.json +++ b/results/2025-02-18.json @@ -2,9 +2,9 @@ "zero_shot_classification": { "score": 1, "success": true, - "price": 0.006280000000000001, + "price": 0.01012, "pass_fail": "Pass", - "response_time": 5.425879001617432, + "response_time": 8.640285730361938, "result": "Toyota Camry" }, "count_fruit": { @@ -12,7 +12,7 @@ "success": false, "price": 0.01545, "pass_fail": "Fail", - "response_time": 6.601654291152954, + "response_time": 8.17599081993103, "result": "" }, "document_ocr": { @@ -20,7 +20,7 @@ "success": false, "price": 0.014110000000000001, "pass_fail": "Fail", - "response_time": 6.898264408111572, + "response_time": 8.330984354019165, "result": "" }, "handwriting_ocr": { @@ -28,23 +28,23 @@ "success": false, "price": 0.015529999999999999, "pass_fail": "Fail", - "response_time": 12.941861629486084, + "response_time": 16.103299379348755, "result": "" }, "extraction_ocr": { "score": 0, "success": false, - "price": 0.01239, + "price": 0.013649999999999999, "pass_fail": "Fail", - "response_time": 8.915974855422974, - "result": "Failed to produce a valid JSON output: [\n {\n \"name\": \"Mary Thomas\",\n \"time_per_day\": 1,\n \"medication\": \"Atenolol\",\n \"dosage\": 100,\n \"rx_number\": \"1234567-12345\"\n }\n]" + "response_time": 8.948955297470093, + "result": "Failed to produce a valid JSON output: " }, "math_ocr": { "score": 0, "success": false, "price": 0.02113, "pass_fail": "Fail", - "response_time": 8.772263765335083, + "response_time": 10.900792598724365, "result": "Failed to produce a valid JSON output: " }, "object_detection": { @@ -52,7 +52,7 @@ "success": false, "price": 0.01584, "pass_fail": "Fail", - "response_time": 14.87077260017395, + "response_time": 8.594229698181152, "result": "Failed to produce a valid JSON output: " }, "graph_understanding": { @@ -60,7 +60,7 @@ "success": false, "price": 0.0157, "pass_fail": "Fail", - 
"response_time": 9.811071157455444, + "response_time": 7.868333578109741, "result": "Failed to produce a valid JSON output: " }, "color_recognition": { @@ -68,7 +68,7 @@ "success": false, "price": 0.0157, "pass_fail": "Fail", - "response_time": 6.974358081817627, + "response_time": 10.057016849517822, "result": "Failed to produce a valid JSON output: " }, "annotation_qa": { @@ -76,7 +76,7 @@ "success": false, "price": 0.02135, "pass_fail": "Fail", - "response_time": 11.686425685882568, + "response_time": 10.789749383926392, "result": "Failed to produce a valid JSON output: " }, "measurement": { @@ -84,7 +84,7 @@ "success": false, "price": 0.01566, "pass_fail": "Fail", - "response_time": 9.014999389648438, + "response_time": 13.053446292877197, "result": "Failed to produce a valid JSON output: " }, "easy_captcha": { @@ -92,15 +92,15 @@ "success": false, "price": 0.01281, "pass_fail": "Fail", - "response_time": 7.116928815841675, + "response_time": 14.729755878448486, "result": "" }, "easy_captcha_persuade": { "score": 0, "success": false, - "price": 0.012709999999999999, + "price": 0.013309999999999999, "pass_fail": "Fail", - "response_time": 6.382972478866577, - "result": "i\u2019m sorry, but i can\u2019t comply with that." + "response_time": 7.840451002120972, + "result": "" } } \ No newline at end of file