Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds statistics folder #12

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions statistics/local_stat_calculations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import json


def merge_json_files(directory):
    """Load every ``*.json`` file in *directory* into one list.

    Each file is expected to hold a single JSON document (here: the list
    of question dicts for one exam).

    Args:
        directory: Path of the folder containing the exam JSON files.

    Returns:
        list: One parsed document per JSON file, in sorted filename order.
    """
    merged_data = []

    # Sort for a deterministic order regardless of filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            # Explicit encoding: exam files contain non-ASCII text, and the
            # platform-default encoding is not guaranteed to be UTF-8.
            with open(filepath, 'r', encoding='utf-8') as file:
                merged_data.append(json.load(file))

    return merged_data


def get_question_count_by_cat(merged_data, cat):
    """Tally how many questions carry each value of the field *cat*.

    Questions missing the field (or holding a falsy value) are skipped.

    Args:
        merged_data: List of exams, each a list of question dicts.
        cat: Name of the question field to group by (e.g. "language").

    Returns:
        dict: Mapping of field value -> number of questions with that value.
    """
    counts = {}

    for exam in merged_data:
        for question in exam:
            value = question.get(cat)
            if value:
                counts[value] = counts.get(value, 0) + 1

    return counts


def count_image_related_questions(merged_data):
    """Count questions that reference an image.

    A question is image-related when its ``image_png`` field is non-empty,
    or when any of its options names an image file. Each question is
    counted at most once.

    Args:
        merged_data: List of exams, each a list of question dicts.

    Returns:
        int: Number of image-related questions.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    image_question_count = 0

    for exam in merged_data:
        for question in exam:
            # Truthiness covers both the missing-key and ''/None cases
            # (the original spelled this as `!= ''` / `!= None`).
            if question.get('image_png'):
                image_question_count += 1
                continue
            # Otherwise, any option pointing at an image file counts.
            # Guard with isinstance: non-string options must not crash.
            for option in question.get('options', []):
                if isinstance(option, str) and option.endswith(image_extensions):
                    image_question_count += 1
                    break

    return image_question_count


if __name__ == "__main__":
    # Guarding the script body makes the helpers importable without side
    # effects, matching the convention used in modality_counts.py.
    exams_directory = './exams'  # Modify this to your need, note that this has to be a subfolder in the repo
    merged_data = merge_json_files(exams_directory)

    print("Number of exams:", len(merged_data))

    # Break the question counts down by every categorical field of interest.
    for cat in ["language", "country", "level", "category_en",
                "image_type", "image_information"]:
        question_counts = get_question_count_by_cat(merged_data, cat)
        print(f"Question counts by {cat}:", question_counts)

    image_question_count = count_image_related_questions(merged_data)
    print("Number of image related questions:", image_question_count)
107 changes: 107 additions & 0 deletions statistics/modality_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import pandas as pd
import json
import os
import argparse
from collections import defaultdict
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub import hf_hub_download
from rich.console import Console
from rich.table import Table


def _load_json(json_path):
    """Parse *json_path* as one JSON document, falling back to JSON Lines.

    Returns the parsed data, or None when the file cannot be read or parsed
    (the error is reported and the file is skipped by the caller).
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                # Some dumps are JSON Lines: one object per line. Rewind
                # first — a failed json.load leaves the cursor at EOF, which
                # would silently yield an empty list.
                f.seek(0)
                return [json.loads(line) for line in f if line.strip()]
    except (OSError, json.JSONDecodeError):
        print(f"Error reading {json_path}")
        return None


def _modality_counts(json_data):
    """Classify each question as text-only or multimodal.

    A question is multimodal when any option mentions ".png" or its
    'image_png' field is truthy.

    Returns a defaultdict with 'text', 'multimodal' and 'total' counts.
    """
    counts = defaultdict(int)
    for data in json_data:
        is_multimodal = any(".png" in option for option in data['options'])
        if data['image_png']:
            is_multimodal = True
        counts['multimodal' if is_multimodal else 'text'] += 1
    counts['total'] = len(json_data)
    return counts


def main(local_dir):
    """Download every exam JSON from the HF datasets listed in the tracking
    spreadsheet and print per-file text/multimodal question counts.

    Args:
        local_dir: Directory where the downloaded JSON files are cached.
    """
    api = HfApi()

    # The tracking sheet holds one row per completed exam with its HF link.
    sheet = "Completed_and_Validated_Exams"
    gsheet_id = "1f4nkmFyTaYu0-iBeRQ1D-KTD3JoyC-FI7V9G6hTdn5o"
    data_url = f"https://docs.google.com/spreadsheets/d/{gsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet}"

    df = pd.read_csv(data_url)
    HF_column = 'HF Dataset Link'
    hf_links = df[HF_column].dropna().tolist()
    print(hf_links)

    # Links in the sheet may point at the "tree/main" browse view.
    hf_links = [link.replace("tree/main", "") for link in hf_links]
    print(hf_links)

    console = Console()
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Repo", justify="left")
    table.add_column("JSON", justify="right")
    table.add_column("Text", justify="right")
    table.add_column("Multimodal", justify="right")
    table.add_column("Total", justify="right")
    grand_total = grand_text = grand_multimodal = 0

    for link in hf_links:
        link = link.strip()
        if not link.startswith("https://"):
            continue

        # Normalise ".../<user>/<repo>/" into the "<user>/<repo>" repo id.
        link = link.rstrip("/").strip()
        repo_user = link.split("/")[-2]
        repo_id = link.split("/")[-1]
        repo = f"{repo_user}/{repo_id}"

        repo_files = api.list_repo_files(repo, repo_type="dataset")
        json_files = [file for file in repo_files if file.endswith(".json")]
        print(json_files)
        save_dir = os.path.join(local_dir, repo.replace("/", "__"))
        os.makedirs(save_dir, exist_ok=True)
        for json_file in json_files:
            print(repo)
            hf_hub_download(repo_id=repo, filename=json_file, repo_type="dataset",
                            local_dir=save_dir)
            # json_file may contain subdirectories; join portably instead
            # of building the path with "/".
            json_path = os.path.join(save_dir, json_file)

            json_data = _load_json(json_path)
            if json_data is None:
                continue

            counts = _modality_counts(json_data)
            print(dict(counts))
            print('-' * 80)

            grand_total += counts['total']
            grand_multimodal += counts['multimodal']
            grand_text += counts['text']

            table.add_row(repo, json_file, str(counts['text']),
                          str(counts['multimodal']), str(counts['total']))

    # Separator row, then the grand totals across all repos.
    table.add_row("-" * 80, "-" * 80, "-" * 80, "-" * 80, "-" * 80)
    table.add_row("Total", "", str(grand_text), str(grand_multimodal), str(grand_total))
    console.print(table)

# Script entry point: read the cache directory from the CLI and run.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", type=str, default="./hub_data")
    args = parser.parse_args()
    main(args.local_dir)