# assemble_data.py

from collections import defaultdict
import glob
import hashlib
import os
import random
import requests
import shutil
import sys
import time

import cv2
from skimage import io

# Style table: one (class_id, style_name, group_ids) tuple per style.
#   class_id    integer label written next to every image URL of the style
#   style_name  human-readable name, only used in progress output
#   group_ids   Flickr group ids that are queried for photos of the style;
#               a style may draw from more than one group
STYLE_MAPPING = [
    (0, 'Bokeh', ['1543486@N25']),
    (1, 'Bright', ['799643@N24']),
    (2, 'Depth_of_Field', ['75418467@N00', '407825@N20']),
    (3, 'Detailed', ['1670588@N24', '1131378@N23']),
    (4, 'Ethereal', ['907784@N22']),
    (5, 'Geometric_Composition', ['46353124@N00']),
    (6, 'Hazy', ['38694591@N00']),
    (7, 'HDR', ['99275357@N00']),
    (8, 'Horror', ['29561404@N00']),
    (9, 'Long_Exposure', ['52240257802@N01']),
    (10, 'Macro', ['52241335207@N01']),
    (11, 'Melancholy', ['70495179@N00']),
    (12, 'Minimal', ['42097308@N00']),
    (13, 'Noir', ['42109523@N00']),
    (14, 'Romantic', ['54284561@N00']),
    (15, 'Serene', ['1081625@N25']),
    (16, 'Pastel', ['1055565@N24', '1371818@N25']),
    (17, 'Sunny', ['1242213@N23']),
    (18, 'Texture', ['70176273@N00']),
    (19, 'Vintage', ['1222306@N25', "1176551@N24"]),
]


def main():
    """Command-line entry point: collect URLs, download images, write the split.

    Expects exactly four arguments after the script name: the image
    directory, the train list path, the test list path and the number of
    images to collect per style. With any other argument count the usage
    line is printed and nothing else is done.
    """
    if len(sys.argv) != 5:
        print('Usage: python assemble_data.py image_path train_file test_file images_per_style')
        return

    _, image_dir, train_file, test_file, per_style = sys.argv
    image_path = os.path.abspath(image_dir)
    images_per_style = int(per_style)

    # Both intermediate files are kept in the directory of this script.
    script_dir = os.path.dirname(__file__)
    url_file = os.path.join(script_dir, 'flickr_style_url.txt')
    img_info_file = os.path.join(script_dir, 'flickr_style_img_info.txt')

    # Three stages, each one consuming the file written by the previous one.
    collect_image_style_url(url_file, images_per_style)
    fetch_images(url_file, img_info_file, image_path)
    generate_train_test_dataset(img_info_file, train_file, test_file, train_ratio=0.8)


def collect_image_style_url(url_file, photos_per_style):
    """Write one ``<url> <class_id>`` line per collected photo to ``url_file``.

    For every entry of ``STYLE_MAPPING`` the image URLs are obtained from
    ``get_image_url_from_group`` and written together with the style's
    class id.

    url_file: path of the text file to create. If it already exists the
        function prints a skip message and returns without doing anything.
    photos_per_style: number of image URLs requested for every style.

    Any exception raised while collecting URLs propagates to the caller; in
    that case ``url_file`` is not created.
    """
    if os.path.exists(url_file):
        print('[Skip] Url file exists: {}'.format(url_file))
        return

    # Collect everything before touching the file system. If the file were
    # opened first, a failure half-way through would leave a partial file
    # behind and the exists-check above would treat it as complete on the
    # next run.
    lines = []
    for class_id, style, groups in STYLE_MAPPING:
        print('Get_photos_for_style: {}'.format(style))
        for url in get_image_url_from_group(groups, photos_per_style):
            lines.append('{} {}'.format(url, class_id))

    with open(url_file, 'w') as f:
        for line in lines:
            print(line, file=f)

    print('[Done] Url file saves to: {}'.format(url_file))


def get_image_url_from_group(groups, num_images):
    """Return ``num_images`` distinct image URLs found in the given Flickr groups.

    The ``flickr.photos.search`` REST method is queried page by page (up to
    10 pages of 500 photos) for every group id in ``groups`` until enough
    URLs have been gathered.

    groups: iterable of Flickr group ids.
    num_images: number of URLs wanted; a value <= 0 yields an empty list
        without any request being made.

    Returns a list with exactly ``num_images`` URLs, in the order found.

    Raises Exception when the API answers with a status other than 'ok' or
    when fewer than ``num_images`` distinct photos are found. Network errors
    from ``requests`` and JSON decoding errors propagate unchanged.
    """
    if num_images <= 0:
        return []

    endpoint = 'https://api.flickr.com/services/rest/'
    # NOTE(review): the API key is committed in source. Consider loading it
    # from configuration or the environment instead.
    base_params = {
        'method': 'flickr.photos.search',
        'format': 'json',
        'nojsoncallback': 1,
        'api_key': "d31c7cb60c57aa7483c5c80919df5371",
        'per_page': 500,  # 500 is the maximum allowed
        'content_type': 1,  # only photos
    }

    image_urls = []
    seen_urls = set()
    # Flickr documents result pages as starting at 1, so begin there rather
    # than at 0.
    for page in range(1, 11):
        for group in groups:
            query = dict(base_params, group_id=group, page=page)

            # The timeout keeps an unresponsive server from hanging the run.
            response = requests.get(endpoint, params=query, timeout=30)
            try:
                page_data = response.json()
            except ValueError:
                # Show the response that could not be decoded, without
                # issuing a second request.
                print(response)
                raise
            if page_data['stat'] != 'ok':
                raise Exception(
                    "Something is wrong: API returned {} ({})".format(
                        page_data['stat'], page_data.get('message')))

            for photo_item in page_data['photos']['photo']:
                image_url = _get_image_url(photo_item)
                # A photo can appear in several groups or pages. Keeping it
                # once avoids the same file being listed twice downstream.
                if image_url not in seen_urls:
                    seen_urls.add(image_url)
                    image_urls.append(image_url)

            if len(image_urls) >= num_images:
                return image_urls[:num_images]

    raise Exception('Not enough images, only find {}'.format(len(image_urls)))


def _get_image_url(photo_item, size_flag=''):
    """
    size_flag: string ['']
        See http://www.flickr.com/services/api/misc.urls.html for options.
            '': 500 px on longest side
            '_m': 240px on longest side
    """
    url = "http://farm{farm}.staticflickr.com/{server}/{id}_{secret}{size}.jpg"
    return url.format(size=size_flag, **photo_item)


def fetch_images(url_file, img_info_file, image_folder):
    """Download every image listed in ``url_file`` and record the usable ones.

    url_file: text file with one ``<url> <class_id>`` pair per line; blank
        lines are ignored.
    img_info_file: output text file receiving one ``<image_path> <class_id>``
        line per image that was downloaded and passed ``verify_image``. If
        this file already exists the function prints a skip message and
        returns immediately.
    image_folder: directory the images are stored in; created when missing.

    Images already present in ``image_folder`` are not downloaded again. A
    file that fails verification is deleted so that a later run can fetch it
    again instead of reusing the broken copy.
    """
    if os.path.exists(img_info_file):
        print('[Skip] Image info file exists: {}'.format(img_info_file))
        return

    os.makedirs(image_folder, exist_ok=True)

    with open(url_file, 'r') as f:
        # Skip blank lines: they cannot be unpacked into (url, class_id).
        entries = [line.split() for line in f if line.strip()]

    image_info = []
    for url, class_id in entries:
        image_name = _get_image_name(url, class_id)
        image_file = os.path.join(image_folder, image_name)

        # Download (unless cached) and verify. The verification result alone
        # decides success, because a cached file may be unreadable too.
        if not os.path.exists(image_file):
            download_image(url, image_file)

        if verify_image(image_file):
            image_info.append((image_file, class_id))
            print('[SUCCESS] {}'.format(url))
        else:
            if os.path.exists(image_file):
                try:
                    os.remove(image_file)
                except OSError:
                    # Best effort only; the image is reported as failed anyway.
                    pass
            print('[FAILURE] {}'.format(url))

    with open(img_info_file, 'w') as f:
        for image_file, class_id in image_info:
            print('{} {}'.format(image_file, class_id), file=f)

    print('Success: {}, Failure: {}'.format(len(image_info), len(entries) - len(image_info)))
    print('[Done] Image info file saves to: {}'.format(img_info_file))


def _get_image_name(url, class_id):
    return '{}_{}.jpg'.format(hashlib.sha1(url.encode()).hexdigest(), class_id)


def download_image(url, file):
    """Download ``url`` to the path ``file`` and report whether it succeeded.

    url: address of the image.
    file: destination path. When it already exists nothing is downloaded
        and True is returned.

    Returns True when ``file`` exists afterwards, False when the server did
    not answer with status 200 or any error occurred. The data is written to
    ``file + '.part'`` first and only moved into place once the transfer has
    finished, so an interrupted download never leaves a truncated ``file``
    that a later call would mistake for a finished one.

    Raises Exception when interrupted with Ctrl-C (see the note below).
    """
    tmp_file = None
    try:
        if os.path.exists(file):
            return True

        tmp_file = file + '.part'
        # The timeout keeps a stalled connection from blocking forever.
        r = requests.get(url, stream=True, timeout=30)
        try:
            if r.status_code != 200:
                return False
            with open(tmp_file, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        finally:
            r.close()

        os.replace(tmp_file, file)
        return True
    except KeyboardInterrupt:
        raise Exception()  # multiprocessing doesn't catch keyboard exceptions
    except Exception:
        return False
    finally:
        # Remove a leftover partial download. After a successful move the
        # temporary file no longer exists and this is a no-op.
        if tmp_file is not None and os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except OSError:
                pass


def verify_image(img_file):
    """Return True when ``img_file`` can be loaded with ``io.imread``, else False.

    Any ordinary error while reading (missing file, undecodable data, ...)
    counts as a failed verification. ``KeyboardInterrupt`` and ``SystemExit``
    are not swallowed, so a long run can still be aborted.
    """
    try:
        io.imread(img_file)
    except Exception:
        return False
    return True


def generate_train_test_dataset(img_info_file, train_file, test_file, train_ratio=0.8):
    """Shuffle the image list and split it into a train file and a test file.

    img_info_file: text file with one sample per line; blank lines are
        ignored.
    train_file: output path receiving the first ``train_ratio`` share of the
        shuffled lines (rounded down).
    test_file: output path receiving the remaining lines.
    train_ratio: fraction of the samples that goes to the train file.

    The shuffle uses the fixed seed 1211 on the module-level ``random``
    generator, so the same input always yields the same split.
    """
    with open(img_info_file, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    random.seed(1211)
    random.shuffle(lines)
    train_size = int(len(lines) * train_ratio)

    with open(train_file, 'w') as f:
        for line in lines[:train_size]:
            print(line, file=f)

    with open(test_file, 'w') as f:
        for line in lines[train_size:]:
            print(line, file=f)

    # Each message names the split that was actually written to that file.
    print('[Done] Train file (size={}) saves to: {}'.format(train_size, train_file))
    print('[Done] Test file (size={}) saves to: {}'.format(len(lines) - train_size, test_file))

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()