Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds statistics folder #12

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions statistics/local_stat_calculations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import json


def merge_json_files(directory):
    """Load every ``*.json`` file in *directory* into one list.

    Each file is expected to hold a single JSON document (here: the list
    of question dicts for one exam).

    Args:
        directory: Path of the folder containing the exam JSON files.

    Returns:
        list: One parsed document per JSON file, in sorted filename order.
    """
    merged_data = []

    # Sort for a deterministic order regardless of filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            # Explicit encoding: exam files contain non-ASCII text, and the
            # platform-default encoding is not guaranteed to be UTF-8.
            with open(filepath, 'r', encoding='utf-8') as file:
                merged_data.append(json.load(file))

    return merged_data


def get_question_count_by_cat(merged_data, cat):
    """Tally how many questions carry each value of the field *cat*.

    Questions missing the field (or holding a falsy value) are skipped.

    Args:
        merged_data: List of exams, each a list of question dicts.
        cat: Name of the question field to group by (e.g. "language").

    Returns:
        dict: Mapping of field value -> number of questions with that value.
    """
    counts = {}

    for exam in merged_data:
        for question in exam:
            value = question.get(cat)
            if value:
                counts[value] = counts.get(value, 0) + 1

    return counts


def count_image_related_questions(merged_data):
    """Count questions that reference an image.

    A question is image-related when its ``image_png`` field is non-empty,
    or when any of its options names an image file. Each question is
    counted at most once.

    Args:
        merged_data: List of exams, each a list of question dicts.

    Returns:
        int: Number of image-related questions.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    image_question_count = 0

    for exam in merged_data:
        for question in exam:
            # Truthiness covers both the missing-key and ''/None cases
            # (the original spelled this as `!= ''` / `!= None`).
            if question.get('image_png'):
                image_question_count += 1
                continue
            # Otherwise, any option pointing at an image file counts.
            # Guard with isinstance: non-string options must not crash.
            for option in question.get('options', []):
                if isinstance(option, str) and option.endswith(image_extensions):
                    image_question_count += 1
                    break

    return image_question_count


if __name__ == "__main__":
    # Guarding the script body makes the helpers importable without side
    # effects, matching the convention used in modality_counts.py.
    exams_directory = './exams'  # Modify this to your need, note that this has to be a subfolder in the repo
    merged_data = merge_json_files(exams_directory)

    print("Number of exams:", len(merged_data))

    # Break the question counts down by every categorical field of interest.
    for cat in ["language", "country", "level", "category_en",
                "image_type", "image_information"]:
        question_counts = get_question_count_by_cat(merged_data, cat)
        print(f"Question counts by {cat}:", question_counts)

    image_question_count = count_image_related_questions(merged_data)
    print("Number of image related questions:", image_question_count)
107 changes: 107 additions & 0 deletions statistics/modality_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import pandas as pd
import json
import os
import argparse
from collections import defaultdict
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub import hf_hub_download
from rich.console import Console
from rich.table import Table


def _load_json(json_path):
    """Parse *json_path* as one JSON document, falling back to JSON Lines.

    Returns the parsed data, or None when the file cannot be read or parsed
    (the error is reported and the file is skipped by the caller).
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                # Some dumps are JSON Lines: one object per line. Rewind
                # first — a failed json.load leaves the cursor at EOF, which
                # would silently yield an empty list.
                f.seek(0)
                return [json.loads(line) for line in f if line.strip()]
    except (OSError, json.JSONDecodeError):
        print(f"Error reading {json_path}")
        return None


def _modality_counts(json_data):
    """Classify each question as text-only or multimodal.

    A question is multimodal when any option mentions ".png" or its
    'image_png' field is truthy.

    Returns a defaultdict with 'text', 'multimodal' and 'total' counts.
    """
    counts = defaultdict(int)
    for data in json_data:
        is_multimodal = any(".png" in option for option in data['options'])
        if data['image_png']:
            is_multimodal = True
        counts['multimodal' if is_multimodal else 'text'] += 1
    counts['total'] = len(json_data)
    return counts


def main(local_dir):
    """Download every exam JSON from the HF datasets listed in the tracking
    spreadsheet and print per-file text/multimodal question counts.

    Args:
        local_dir: Directory where the downloaded JSON files are cached.
    """
    api = HfApi()

    # The tracking sheet holds one row per completed exam with its HF link.
    sheet = "Completed_and_Validated_Exams"
    gsheet_id = "1f4nkmFyTaYu0-iBeRQ1D-KTD3JoyC-FI7V9G6hTdn5o"
    data_url = f"https://docs.google.com/spreadsheets/d/{gsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet}"

    df = pd.read_csv(data_url)
    HF_column = 'HF Dataset Link'
    hf_links = df[HF_column].dropna().tolist()
    print(hf_links)

    # Links in the sheet may point at the "tree/main" browse view.
    hf_links = [link.replace("tree/main", "") for link in hf_links]
    print(hf_links)

    console = Console()
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Repo", justify="left")
    table.add_column("JSON", justify="right")
    table.add_column("Text", justify="right")
    table.add_column("Multimodal", justify="right")
    table.add_column("Total", justify="right")
    grand_total = grand_text = grand_multimodal = 0

    for link in hf_links:
        link = link.strip()
        if not link.startswith("https://"):
            continue

        # Normalise ".../<user>/<repo>/" into the "<user>/<repo>" repo id.
        link = link.rstrip("/").strip()
        repo_user = link.split("/")[-2]
        repo_id = link.split("/")[-1]
        repo = f"{repo_user}/{repo_id}"

        repo_files = api.list_repo_files(repo, repo_type="dataset")
        json_files = [file for file in repo_files if file.endswith(".json")]
        print(json_files)
        save_dir = os.path.join(local_dir, repo.replace("/", "__"))
        os.makedirs(save_dir, exist_ok=True)
        for json_file in json_files:
            print(repo)
            hf_hub_download(repo_id=repo, filename=json_file, repo_type="dataset",
                            local_dir=save_dir)
            # json_file may contain subdirectories; join portably instead
            # of building the path with "/".
            json_path = os.path.join(save_dir, json_file)

            json_data = _load_json(json_path)
            if json_data is None:
                continue

            counts = _modality_counts(json_data)
            print(dict(counts))
            print('-' * 80)

            grand_total += counts['total']
            grand_multimodal += counts['multimodal']
            grand_text += counts['text']

            table.add_row(repo, json_file, str(counts['text']),
                          str(counts['multimodal']), str(counts['total']))

    # Separator row, then the grand totals across all repos.
    table.add_row("-" * 80, "-" * 80, "-" * 80, "-" * 80, "-" * 80)
    table.add_row("Total", "", str(grand_text), str(grand_multimodal), str(grand_total))
    console.print(table)

# Script entry point: read the cache directory from the CLI and run.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", type=str, default="./hub_data")
    args = parser.parse_args()
    main(args.local_dir)