-
Notifications
You must be signed in to change notification settings - Fork 9
/
extract_photo_metadata.py
63 lines (51 loc) · 1.6 KB
/
extract_photo_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Extract photo URLs and captions from photos.json files into a tab-separated file.
"""
from pathlib import Path
from typing import Set, List, Dict
import csv
import json
import tap
from tqdm.auto import tqdm
from helpers import load_json # type: ignore
class Arguments(tap.Tap):
    """Command-line arguments of the photo metadata extraction script.

    The trailing comment on each field doubles as that argument's ``--help``
    text (typed-argument-parser reads field comments as help strings).
    """

    merlin: Path  # Directory searched recursively for photos.json files
    output: Path = Path("photos.csv")  # Tab-separated file the extracted photo rows are written to
if __name__ == "__main__":
    # Script entry point: walk the `merlin` directory for photos.json files,
    # flatten every image they describe into one row, and dump the rows to
    # the tab-separated `output` file.
    args = Arguments().parse_args()
    print(args)

    # Extract photos
    photos: List[Dict] = []
    # Listings whose photos.json has already been parsed successfully.
    listing_ids: Set[int] = set()
    # Number of photos.json files we attempted to load (this includes files
    # that turn out to be malformed or to carry no photo tour).
    counter = 0
    for filename in tqdm(args.merlin.rglob("photos.json")):
        # The name of the directory holding photos.json is used as listing id.
        listing_id = int(filename.parent.name)
        if listing_id in listing_ids:
            continue
        counter += 1

        try:
            data = load_json(filename)
        except json.decoder.JSONDecodeError:
            # Best effort: skip malformed files. The listing is deliberately
            # not recorded, so another photos.json found later under a
            # directory with the same name can still be used.
            continue
        listing_ids.add(listing_id)

        if "data" not in data:
            continue
        photo_tour = data["data"]["merlin"]["pdpPhotoTour"]
        if photo_tour is None:
            continue

        photos.extend(
            {
                "listing_id": listing_id,
                "photo_id": image["id"],
                "url": image["baseUrl"],
                "caption": image["imageMetadata"]["caption"],
            }
            for image in photo_tour["images"]
        )

    print(f"Extracted {len(photos)} photos from {counter} listings")
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would turn an empty result into an IndexError below.
    if not photos:
        raise SystemExit("No photos were extracted; nothing to write")

    # Store photos
    # UTF-8 is requested explicitly so that free-text captions are written the
    # same way regardless of the platform's default locale encoding.
    with open(args.output, "w", newline="", encoding="utf-8") as fid:
        fieldnames = list(photos[0])
        writer = csv.DictWriter(fid, fieldnames=fieldnames, delimiter="\t")
        # NOTE: no header row is written; the column order is
        # listing_id, photo_id, url, caption.
        writer.writerows(photos)