# tests/test_data_epic_kitchen_recognition.py

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import unittest
import unittest.mock

import torch
from pytorchvideo.data import EpicKitchenRecognition
from pytorchvideo.data.epic_kitchen import ActionData
from pytorchvideo.data.epic_kitchen_recognition import ClipSampling
from pytorchvideo.data.frame_video import FrameVideo


class TestEpicKitchenRecognition(unittest.TestCase):
    """Unit tests for the private generator helpers of ``EpicKitchenRecognition``.

    Each test builds one helper (``_transform_generator``,
    ``_frame_filter_generator`` or ``_define_clip_structure_generator``) and
    checks the callable it returns against small, hand-built inputs.
    """

    def test_transform_generator(self):
        """Check the clip transform produced by ``_transform_generator``.

        The clip spans 2.5s to 6.5s and carries four annotated actions. Only
        "turn on light" and "close door" are expected to remain afterwards
        (presumably because they are the ones overlapping the clip window).
        The clip times must be left untouched and the user-supplied transform
        must have been applied to the video tensor.
        """
        # One tuple of positional ``ActionData`` arguments per annotated action.
        action_rows = [
            (
                "P01",
                "P01_01",
                "turn off light",
                "00:00:01.00",
                "00:00:02.00",
                262,
                370,
                "turn-off",
                12,
                "light",
                113,
                "['light']",
                "[113]",
            ),
            (
                "P01",
                "P01_01",
                "turn on light",
                "00:00:04.00",
                "00:00:06.00",
                262,
                370,
                "turn-on",
                12,
                "light",
                113,
                "['light']",
                "[113]",
            ),
            (
                "P01",
                "P01_01",
                "close door",
                "00:00:06.00",
                "00:00:07.00",
                418,
                569,
                "close",
                3,
                "door",
                8,
                "['door']",
                "[8]",
            ),
            (
                "P01",
                "P01_01",
                "slam door",
                "00:00:10.00",
                "00:00:11.00",
                408,
                509,
                "slam",
                3,
                "door",
                8,
                "['door']",
                "[8]",
            ),
        ]
        clip = {
            "start_time": 2.5,
            "stop_time": 6.5,
            "video": torch.rand(3, 4, 10, 20),
            "actions": [ActionData(*row) for row in action_rows],
        }

        def additional_transform(clip):
            # Move the first video axis to the end so the test can tell, from
            # the output shape alone, that this transform was applied.
            clip["video"] = clip["video"].permute(1, 2, 3, 0)
            return clip

        transform_fn = EpicKitchenRecognition._transform_generator(additional_transform)
        transformed_clip = transform_fn(clip)

        kept_actions = transformed_clip["actions"]
        self.assertEqual(len(kept_actions), 2)
        # Sort so the narration checks do not depend on the output ordering.
        kept_actions = sorted(kept_actions, key=lambda action: action.start_time)
        self.assertEqual(kept_actions[0].narration, "turn on light")
        self.assertEqual(kept_actions[1].narration, "close door")

        self.assertEqual(transformed_clip["start_time"], 2.5)
        self.assertEqual(transformed_clip["stop_time"], 6.5)

        self.assertEqual(transformed_clip["video"].size(), torch.Size([4, 10, 20, 3]))

    def test_frame_filter_generator(self):
        """Check the frame filter produced by ``_frame_filter_generator``.

        For a 10-element input: a filter built with 10 returns the input
        unchanged, one built with 5 returns every second element, and one
        built with 1 returns only the first element.
        """
        input_list = list(range(10))

        keep_all_fn = EpicKitchenRecognition._frame_filter_generator(10)
        all_elements = keep_all_fn(input_list)
        self.assertEqual(all_elements, input_list)

        keep_half_fn = EpicKitchenRecognition._frame_filter_generator(5)
        half_elements = keep_half_fn(input_list)
        self.assertEqual(len(half_elements), 5)
        self.assertEqual(half_elements, input_list[::2])

        keep_one_fn = EpicKitchenRecognition._frame_filter_generator(1)
        single_element = keep_one_fn(input_list)
        self.assertEqual(len(single_element), 1)
        self.assertEqual(single_element[0], 0)

    def test_define_clip_structure_generator(self):
        """Check clip layout under ``ClipSampling.RandomOffsetUniform``.

        ``random.random`` is patched to a fixed value so the layout is
        deterministic. Every clip must be exactly ``seconds_per_clip`` long,
        and the i-th clip of each video must start at
        ``seconds_per_clip * (i + random_value)``.
        """
        seconds_per_clip = 5
        random_value = 0.5
        # Build the helper from the same ``seconds_per_clip`` the assertions
        # use, so the two cannot drift apart.
        define_clip_structure_fn = (
            EpicKitchenRecognition._define_clip_structure_generator(
                seconds_per_clip=seconds_per_clip,
                clip_sampling=ClipSampling.RandomOffsetUniform,
            )
        )

        # video_id -> (number of frame paths, second positional argument to
        # ``FrameVideo.from_frame_paths`` (presumably fps -- confirm against
        # FrameVideo), expected number of clips).
        video_specs = {
            "P01_003": (100, 10, 1),
            "P02_004": (300, 10, 5),
            "P11_010": (600, 30, 3),
        }
        frame_videos = {
            video_id: FrameVideo.from_frame_paths(
                [f"root/{video_id}/frame_{i}" for i in range(num_frames)], rate
            )
            for video_id, (num_frames, rate, _) in video_specs.items()
        }
        actions = {video_id: [] for video_id in frame_videos}

        with unittest.mock.patch("random.random", return_value=random_value):
            clips = define_clip_structure_fn(frame_videos, actions)
            # Materialise inside the patch in case ``clips`` is produced lazily.
            sorted_clips = sorted(clips, key=lambda c: c.start_time)  # For stability

        for clip in sorted_clips:
            self.assertEqual(clip.stop_time - clip.start_time, seconds_per_clip)

        for video_id, (_, _, expected_clip_count) in video_specs.items():
            with self.subTest(video_id=video_id):
                video_clips = [c for c in sorted_clips if c.video_id == video_id]
                self.assertEqual(len(video_clips), expected_clip_count)
                for i, clip in enumerate(video_clips):
                    self.assertEqual(
                        clip.start_time, seconds_per_clip * (i + random_value)
                    )