From 6ea1df5d696a33ab3c8555d759c6b626d33f6d88 Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Mon, 30 Dec 2024 15:05:32 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=97=91=EF=B8=8F=20remove=20unused=20text?= =?UTF-8?q?=20processing=20notebook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lmms_eval/models/text.ipynb | 156 ------------------------------------ 1 file changed, 156 deletions(-) delete mode 100644 lmms_eval/models/text.ipynb diff --git a/lmms_eval/models/text.ipynb b/lmms_eval/models/text.ipynb deleted file mode 100644 index de40cc2c0..000000000 --- a/lmms_eval/models/text.ipynb +++ /dev/null @@ -1,156 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "def split_media_tags(content):\n", - " # 用正则表达式匹配 \n", - " # (\\d+) 捕获组用来提取数字\n", - " pattern = r''\n", - " \n", - " # 用 split 方法分割文本,同时保留匹配到的数字\n", - " # re.split 会返回一个列表,包含分割后的文本和匹配到的捕获组\n", - " parts = re.split(pattern, content)\n", - " \n", - " # 处理结果列表,将数字转换为整型\n", - " result = []\n", - " for i, part in enumerate(parts):\n", - " if i % 2 == 0: # 偶数索引是文本\n", - " result.append(part)\n", - " else: # 奇数索引是数字\n", - " result.append(int(part))\n", - " \n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['', 1, 'world', 2, '!']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "split_media_tags('world!') # ['hello', 1, 'world', 2, '!']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input: Select the instrument represented in images that corresponds to the audio [audio1] from [img1] [img2] [img3] [img4].\n", - "Output: ['Select the instrument represented in images that corresponds to the audio ', ('audio', 1), ' from ', ('img', 1), ' ', ('img', 2), ' ', ('img', 3), ' ', ('img', 4), '.']\n", - "\n", - "Input: [video1] is a great video\n", - "Output: [('video', 1), ' is a great video']\n", - "\n", - "Input: Compare [img1] and [img2]\n", - "Output: ['Compare ', ('img', 1), ' and ', ('img', 2)]\n", - "\n", - "Input: Listen to [audio1] and watch [video1]\n", - "Output: ['Listen to ', ('audio', 1), ' and watch ', ('video', 1)]\n", - "\n", - "Input: [img1] at the beginning and [img2] at the end\n", - "Output: [('img', 1), ' at the beginning and ', ('img', 2), ' at the end']\n", - "\n" - ] - } - ], - "source": [ - "import re\n", - "\n", - "def split_media_tags(content):\n", - " # 匹配 [类型数字] 格式的标签\n", - " # 捕获组 1 捕获类型 (audio|video|img)\n", - " # 捕获组 2 捕获数字\n", - " pattern = r'\\[(audio|video|img)(\\d+)\\]'\n", - " \n", - " # 用 finditer 找到所有匹配\n", - " matches = list(re.finditer(pattern, content))\n", - " if not matches:\n", - " return [content]\n", - " \n", - " result = []\n", - " last_end = 0\n", - " \n", - " for match in matches:\n", - " # 添加标签之前的文本(如果有)\n", - " if match.start() > last_end:\n", - " result.append(content[last_end:match.start()])\n", - " \n", - " # 添加标签信息为元组 (类型, 数字)\n", - " media_type = match.group(1) # audio, video 或 img\n", - " media_num = int(match.group(2)) # 数字\n", - " result.append((media_type, media_num))\n", - " \n", - " last_end = match.end()\n", - " \n", - " # 添加最后一个标签之后的文本(如果有)\n", - " if last_end < len(content):\n", - " result.append(content[last_end:])\n", - " \n", - " return result\n", - "\n", - "# 测试\n", - "test_cases = [\n", - " \"Select the instrument represented in images that corresponds to the audio [audio1] from [img1] [img2] [img3] [img4].\",\n", - " \"[video1] is a great video\",\n", - " \"Compare [img1] and [img2]\",\n", - " \"Listen to [audio1] and watch [video1]\",\n", - " \"[img1] at the beginning and [img2] at the end\",\n", - "]\n", - "\n", - "for test in test_cases:\n", - " print(f\"Input: {test}\")\n", - " print(f\"Output: {split_media_tags(test)}\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "av-odyssey", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}