diff --git a/d/2025-01-17.html b/d/2025-01-17.html
index 1a2d6f7d..1d98c0d1 100644
--- a/d/2025-01-17.html
+++ b/d/2025-01-17.html
@@ -881,7 +881,7 @@
}
}
- const articlesData = [{'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 34, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. 
Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 29, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. 
Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 19, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. 
ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. 
We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. 
Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 14, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. 
Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 12, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. 
However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 11, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. 
The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 9, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. 
This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 9, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. 
CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 8, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. 
While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 7, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. 
Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 6, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. 
The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}];
+ const articlesData = [{'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 40, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. 
Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 31, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. 
Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 22, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. 
ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. 
We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 20, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. 
By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. 
Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 14, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. 
However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 13, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. 
The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 11, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. 
This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 10, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. 
CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 9, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. 
While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 7, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. 
Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 6, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. 
The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}];
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
@@ -1184,7 +1184,7 @@
function updateTimeDiffs() {
const timeDiff = document.getElementById('timeDiff');
- timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 06:26',lang=currentLang);
+ timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 12:37',lang=currentLang);
}
function updateSortingOptions() {
const sortingLabels = {
@@ -1238,14 +1238,14 @@
}
function hideNextLink(format) {
if (format === 'monthly') {
- if (isCurrentMonth('2025-01-18 06:26')) {
+ if (isCurrentMonth('2025-01-18 12:37')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';
}
}
} else {
- if (isToday('2025-01-18 06:26')) {
+ if (isToday('2025-01-18 12:37')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';
diff --git a/d/2025-01-17.json b/d/2025-01-17.json
index 287a2584..6dbdbf57 100644
--- a/d/2025-01-17.json
+++ b/d/2025-01-17.json
@@ -4,9 +4,9 @@
"en": "January 17",
"zh": "1月17日"
},
- "time_utc": "2025-01-18 06:26",
+ "time_utc": "2025-01-18 12:37",
"weekday": 4,
- "issue_id": 1740,
+ "issue_id": 1741,
"home_page_url": "https://huggingface.co/papers",
"papers": [
{
@@ -14,7 +14,7 @@
"title": "Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps",
"url": "https://huggingface.co/papers/2501.09732",
"abstract": "Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.",
- "score": 34,
+ "score": 40,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -69,7 +69,7 @@
"title": "OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking",
"url": "https://huggingface.co/papers/2501.09751",
"abstract": "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.",
- "score": 29,
+ "score": 31,
"issue_id": 1722,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -123,7 +123,7 @@
"title": "Learnings from Scaling Visual Tokenizers for Reconstruction and Generation",
"url": "https://huggingface.co/papers/2501.09755",
"abstract": "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.",
- "score": 19,
+ "score": 22,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -175,66 +175,12 @@
}
}
},
- {
- "id": "https://huggingface.co/papers/2501.09484",
- "title": "Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators",
- "url": "https://huggingface.co/papers/2501.09484",
- "abstract": "Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the \"inquiry\" phase of the consultation process. This lack of focus has left the relationship between \"inquiry\" and \"diagnosis\" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between \"inquiry\" and \"diagnosis\" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig's law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.",
- "score": 16,
- "issue_id": 1721,
- "pub_date": "2025-01-16",
- "pub_date_card": {
- "ru": "16 января",
- "en": "January 16",
- "zh": "1月16日"
- },
- "hash": "aff7d86ad63040d9",
- "authors": [
- "Zhaocheng Liu",
- "Quan Tu",
- "Wen Ye",
- "Yu Xiao",
- "Zhishou Zhang",
- "Hengfu Cui",
- "Yalun Zhu",
- "Qiang Ju",
- "Shizheng Li",
- "Jian Xie"
- ],
- "affiliations": [
- "Baichuan Inc.",
- "Gaoling School of Artificial Intelligence, Renmin University of China"
- ],
- "pdf_title_img": "assets/pdf/title_img/2501.09484.jpg",
- "data": {
- "categories": [
- "#data",
- "#training",
- "#science",
- "#open_source",
- "#healthcare"
- ],
- "emoji": "🩺",
- "ru": {
- "title": "Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ",
- "desc": "Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам."
- },
- "en": {
- "title": "Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations",
- "desc": "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."
- },
- "zh": {
- "title": "优化询问,提升诊断效果",
- "desc": "本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。"
- }
- }
- },
{
"id": "https://huggingface.co/papers/2501.09686",
"title": "Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models",
"url": "https://huggingface.co/papers/2501.09686",
"abstract": "Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of \"thought\" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to \"think\" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI's o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.",
- "score": 14,
+ "score": 20,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -295,12 +241,66 @@
}
}
},
+ {
+ "id": "https://huggingface.co/papers/2501.09484",
+ "title": "Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators",
+ "url": "https://huggingface.co/papers/2501.09484",
+ "abstract": "Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the \"inquiry\" phase of the consultation process. This lack of focus has left the relationship between \"inquiry\" and \"diagnosis\" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between \"inquiry\" and \"diagnosis\" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig's law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.",
+ "score": 16,
+ "issue_id": 1721,
+ "pub_date": "2025-01-16",
+ "pub_date_card": {
+ "ru": "16 января",
+ "en": "January 16",
+ "zh": "1月16日"
+ },
+ "hash": "aff7d86ad63040d9",
+ "authors": [
+ "Zhaocheng Liu",
+ "Quan Tu",
+ "Wen Ye",
+ "Yu Xiao",
+ "Zhishou Zhang",
+ "Hengfu Cui",
+ "Yalun Zhu",
+ "Qiang Ju",
+ "Shizheng Li",
+ "Jian Xie"
+ ],
+ "affiliations": [
+ "Baichuan Inc.",
+ "Gaoling School of Artificial Intelligence, Renmin University of China"
+ ],
+ "pdf_title_img": "assets/pdf/title_img/2501.09484.jpg",
+ "data": {
+ "categories": [
+ "#data",
+ "#training",
+ "#science",
+ "#open_source",
+ "#healthcare"
+ ],
+ "emoji": "🩺",
+ "ru": {
+ "title": "Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ",
+ "desc": "Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам."
+ },
+ "en": {
+ "title": "Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations",
+ "desc": "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."
+ },
+ "zh": {
+ "title": "优化询问,提升诊断效果",
+ "desc": "本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。"
+ }
+ }
+ },
{
"id": "https://huggingface.co/papers/2501.09756",
"title": "SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces",
"url": "https://huggingface.co/papers/2501.09756",
"abstract": "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/",
- "score": 12,
+ "score": 14,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -352,7 +352,7 @@
"title": "FAST: Efficient Action Tokenization for Vision-Language-Action Models",
"url": "https://huggingface.co/papers/2501.09747",
"abstract": "Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.",
- "score": 11,
+ "score": 13,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -407,7 +407,7 @@
"title": "Do generative video models learn physical principles from watching videos?",
"url": "https://huggingface.co/papers/2501.09038",
"abstract": "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.",
- "score": 9,
+ "score": 11,
"issue_id": 1725,
"pub_date": "2025-01-14",
"pub_date_card": {
@@ -454,7 +454,7 @@
"title": "CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation",
"url": "https://huggingface.co/papers/2501.09433",
"abstract": "The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.",
- "score": 9,
+ "score": 10,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -501,7 +501,7 @@
"title": "The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models",
"url": "https://huggingface.co/papers/2501.09653",
"abstract": "The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.",
- "score": 8,
+ "score": 9,
"issue_id": 1730,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -697,11 +697,11 @@
"#low_resource": 1
},
"zh": {
- "text": "这篇文章讨论了大语言模型生成文章的方法。通常,这些方法依赖检索增强生成,但信息冗余、浅显。为解决这些问题,作者提出了OmniThink框架,模拟人类迭代扩展和反思的过程。实验结果显示,OmniThink提高了生成文章的知识密度,保持了连贯性和深度。人类评估和专家反馈进一步证明了OmniThink在生成长文章方面的潜力。",
- "title": "OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking",
- "pinyin": "这篇文章讨论了大语言模型生成文章的方法。通常,这些方法依赖检索增强生成,但信息冗余、浅显。为解决这些问题,作者提出了OmniThink框架,模拟人类迭代扩展和反思的过程。实验结果显示,OmniThink提高了生成文章的知识密度,保持了连贯性和深度。人类评估和专家反馈进一步证明了OmniThink在生成长文章方面的潜力。\n\nZhè piān wénzhāng tǎolùn le dà yǔyán móxíng shēngchéng wénzhāng de fāngfǎ. Tōngcháng, zhèxiē fāngfǎ yīlài jiǎnsuǒ zēngqiáng shēngchéng, dàn xìnxī rǒngyú, qiǎnxiǎn. Wèi jiějué zhèxiē wèntí, zuòzhě tíchū le OmniThink kuàngjià, mónǐ rénlèi diédài kuòzhǎn hé fǎnsī de guòchéng. Shíyàn jiéguǒ xiǎnshì, OmniThink tígāo le shēngchéng wénzhāng de zhīshi mìdù, bǎochí le liánhéngxìng hé shēndù. Rénlèi pínggū hé zhuānjiā fǎnkuì jìnfù zhèngmíng le OmniThink zài shēngchéng cháng wénzhāng fāngmiàn de qiánlì.",
- "vocab": "[\n {\"word\": \"讨论\", \"pinyin\": \"tǎo lùn\", \"trans\": \"discuss\"},\n {\"word\": \"大语言模型\", \"pinyin\": \"dà yǔ yán mó xíng\", \"trans\": \"large language model\"},\n {\"word\": \"生成\", \"pinyin\": \"shēng chéng\", \"trans\": \"generate\"},\n {\"word\": \"方法\", \"pinyin\": \"fāng fǎ\", \"trans\": \"method\"},\n {\"word\": \"依赖\", \"pinyin\": \"yī lài\", \"trans\": \"rely on\"},\n {\"word\": \"检索\", \"pinyin\": \"jiǎn suǒ\", \"trans\": \"retrieve\"},\n {\"word\": \"增强\", \"pinyin\": \"zēng qiáng\", \"trans\": \"enhance\"},\n {\"word\": \"冗余\", \"pinyin\": \"rǒng yú\", \"trans\": \"redundancy\"},\n {\"word\": \"浅显\", \"pinyin\": \"qiǎn xiǎn\", \"trans\": \"superficial\"},\n {\"word\": \"解决\", \"pinyin\": \"jiě jué\", \"trans\": \"solve\"},\n {\"word\": \"提出\", \"pinyin\": \"tí chū\", \"trans\": \"propose\"},\n {\"word\": \"框架\", \"pinyin\": \"kuàng jià\", \"trans\": \"framework\"},\n {\"word\": \"模拟\", \"pinyin\": \"mó nǐ\", \"trans\": \"simulate\"},\n {\"word\": \"迭代\", \"pinyin\": \"dié dài\", \"trans\": \"iterate\"},\n {\"word\": \"扩展\", \"pinyin\": \"kuò zhǎn\", \"trans\": \"expand\"},\n {\"word\": \"反思\", \"pinyin\": \"fǎn sī\", \"trans\": \"reflect\"},\n {\"word\": \"过程\", \"pinyin\": \"guò chéng\", \"trans\": \"process\"},\n {\"word\": \"实验\", \"pinyin\": \"shí yàn\", \"trans\": \"experiment\"},\n {\"word\": \"结果\", \"pinyin\": \"jié guǒ\", \"trans\": \"result\"},\n {\"word\": \"显示\", \"pinyin\": \"xiǎn shì\", \"trans\": \"show\"},\n {\"word\": \"提高\", \"pinyin\": \"tí gāo\", \"trans\": \"improve\"},\n {\"word\": \"知识\", \"pinyin\": \"zhī shi\", \"trans\": \"knowledge\"},\n {\"word\": \"密度\", \"pinyin\": \"mì dù\", \"trans\": \"density\"},\n {\"word\": \"保持\", \"pinyin\": \"bǎo chí\", \"trans\": \"maintain\"},\n {\"word\": \"连贯性\", \"pinyin\": \"lián guàn xìng\", \"trans\": \"coherence\"},\n {\"word\": \"深度\", \"pinyin\": \"shēn dù\", \"trans\": \"depth\"},\n {\"word\": \"评估\", \"pinyin\": \"píng gū\", \"trans\": \"evaluate\"},\n {\"word\": \"反馈\", \"pinyin\": \"fǎn kuì\", \"trans\": \"feedback\"},\n {\"word\": \"进一步\", \"pinyin\": \"jìn yī bù\", \"trans\": \"further\"},\n {\"word\": \"证明\", \"pinyin\": \"zhèng míng\", \"trans\": \"prove\"},\n {\"word\": \"潜力\", \"pinyin\": \"qián lì\", \"trans\": \"potential\"}\n]",
- "trans": "This article discusses methods for generating articles using large language models. Typically, these methods rely on retrieval-augmented generation but suffer from information redundancy and superficiality. To address these issues, the authors propose the OmniThink framework, which simulates the human process of iterative expansion and reflection. Experimental results show that OmniThink increases the knowledge density of generated articles while maintaining coherence and depth. Human evaluations and expert feedback further demonstrate OmniThink's potential in generating long articles.",
- "update_ts": "2025-01-17 09:10"
+ "text": "生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。",
+ "title": "Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps",
+ "pinyin": "Shēngchéng móxíng zài gègè lǐngyù chǎnshēngle zhòngdà yǐngxiǎng, zhǔyào shì yīnwèi tāmen nénggòu tōngguò zēngjiā shùjù, jìsuàn zīyuán hé móxíng dàxíng lái kuòzhǎn xùnliàn, zhè zhǒng xiànxiàng bèi chēngwéi kuòzhǎn guīlǜ. Zuìjìn de yánjiū kāishǐ tuànsuǒ dàxíng yǔyán móxíng (LLMs) zài tuīlǐ shí de kuòzhǎn xíngwéi, jiēshìle rúhé tōngguò tuīlǐ guòchéng zhōng de éxǔ jìsuàn lái jìn yībù tígāo xìngnéng. Yǔ LLMs bùtóng, kuòsàn móxíng tiānshēng jùyǒu tōngguò qùzào bùzhòu shùliàng lái tiáojǐ tuīlǐ shí jìsuàn de línghuóxìng, jǐnguǎn xìngnéng zēngyì tōngcháng zài jǐshí bù hòu qūyú píngwěn. Zài zhè xiàng gōngzuò zhōng, wǒmen tuànsuǒle kuòsàn móxíng zài zēngjiā qùzào bùzhòu zhīwài de tuīlǐ shí kuòzhǎn xíngwéi, yánjiūle rúhé tōngguò zēngjiā jìsuàn lái jìn yībù tígāo shēngchéng xìngnéng. Jùtǐ lái shuō, wǒmen kǎolǜle yīgè sōusuǒ wèntí, zhǐ zài wèi kuòsàn cǎiyàng guòchéng zhǎo dào gèng hǎo de zàoshēng. Wǒmen jiāng shèjì kōngjiān fēn wéi liǎng gè zhóu: yòngyú tígōng fǎnkuì de yànzhèngqì, yǐjí yòngyú zhǎo dào gèng hǎo zàoshēng hòuxuǎn de suànfǎ. Tōngguò zài lèi tiáojiàn hé wénběn tiáojiàn túxiàng shēngchéng bǐzhǔn shàng de dàliàng shíyàn, wǒmen fāxiàn zēngjiā tuīlǐ shí de jìsuànliàng xiǎnzhù tígāole kuòsàn móxíng shēngchéng yàngbǎn de zhìliàng, bìngqiě zài túxiàng de fùzáxìng fāngmiàn, kuàngjià zhōng de zǔjiàn zǔhé kěyǐ gēnjù bùtóng de yìngyòng chǎngjīng jìnxíng xuǎnzé.",
+ "vocab": "[{'word': '生成模型', 'pinyin': 'shēng chéng mó xíng', 'trans': 'generative model'},\n{'word': '重大', 'pinyin': 'zhòng dà', 'trans': 'significant'},\n{'word': '影响', 'pinyin': 'yǐng xiǎng', 'trans': 'impact'},\n{'word': '领域', 'pinyin': 'lǐng yù', 'trans': 'field'},\n{'word': '扩展', 'pinyin': 'kuò zhǎn', 'trans': 'expand'},\n{'word': '规律', 'pinyin': 'guī lǜ', 'trans': 'pattern'},\n{'word': '推理', 'pinyin': 'tuī lǐ', 'trans': 'reasoning'},\n{'word': '行为', 'pinyin': 'xíng wéi', 'trans': 'behavior'},\n{'word': '揭示', 'pinyin': 'jiē shì', 'trans': 'reveal'},\n{'word': '灵活性', 'pinyin': 'líng huó xìng', 'trans': 'flexibility'},\n{'word': '趋于', 'pinyin': 'qū yú', 'trans': 'tend towards'},\n{'word': '平稳', 'pinyin': 'píng wěn', 'trans': 'stable'},\n{'word': '探讨', 'pinyin': 'tàn tǎo', 'trans': 'discuss'},\n{'word': '去噪', 'pinyin': 'qù zào', 'trans': 'denoise'},\n{'word': '步骤', 'pinyin': 'bù zhòu', 'trans': 'step'},\n{'word': '搜索', 'pinyin': 'sōu suǒ', 'trans': 'search'},\n{'word': '采样', 'pinyin': 'cǎi yàng', 'trans': 'sampling'},\n{'word': '验证器', 'pinyin': 'yàn zhèng qì', 'trans': 'validator'},\n{'word': '反馈', 'pinyin': 'fǎn kuì', 'trans': 'feedback'},\n{'word': '算法', 'pinyin': 'suàn fǎ', 'trans': 'algorithm'},\n{'word': '候选', 'pinyin': 'hòu xuǎn', 'trans': 'candidate'},\n{'word': '基准', 'pinyin': 'jī zhǔn', 'trans': 'benchmark'},\n{'word': '复杂性', 'pinyin': 'fù zá xìng', 'trans': 'complexity'},\n{'word': '框架', 'pinyin': 'kuàng jià', 'trans': 'framework'},\n{'word': '组件', 'pinyin': 'zǔ jiàn', 'trans': 'component'},\n{'word': '组合', 'pinyin': 'zǔ hé', 'trans': 'combination'},\n{'word': '应用', 'pinyin': 'yìng yòng', 'trans': 'application'},\n{'word': '场景', 'pinyin': 'chǎng jǐng', 'trans': 'scenario'}]",
+ "trans": "Generative models have had a significant impact across various domains, primarily because they can scale training by increasing data, computational resources, and model size, a phenomenon known as scaling laws. Recent research has begun to explore the scaling behavior of large language models (LLMs) during inference, revealing how performance can be further enhanced through additional computation during the inference process. Unlike LLMs, diffusion models inherently offer flexibility in adjusting computation during inference through the number of denoising steps, although performance gains typically plateau after a few dozen steps. In this work, we investigate the scaling behavior of diffusion models during inference beyond increasing denoising steps, exploring how to further enhance generative performance by increasing computation. Specifically, we consider a search problem aimed at finding better noise for the diffusion sampling process. We divide the design space into two axes: the validator used to provide feedback and the algorithm used to find better noise candidates. Through extensive experiments on class-conditional and text-conditional image generation benchmarks, we find that increasing the amount of computation during inference significantly improves the quality of samples generated by diffusion models. Additionally, the combination of components in the framework can be selected based on different application scenarios in terms of image complexity.",
+ "update_ts": "2025-01-18 12:37"
}
}
\ No newline at end of file
diff --git a/d/2025-01-17_zh_reading_task.html b/d/2025-01-17_zh_reading_task.html
index c94640ee..3082a3cb 100644
--- a/d/2025-01-17_zh_reading_task.html
+++ b/d/2025-01-17_zh_reading_task.html
@@ -75,26 +75,31 @@
-
OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking
-
1. 这篇文章讨论了大语言模型生成文章的方法。
-
2. 通常,这些方法依赖检索增强生成,但信息冗余、浅显。
-
3. 为解决这些问题,作者提出了OmniThink框架,模拟人类迭代扩展和反思的过程。
-
4. 实验结果显示,OmniThink提高了生成文章的知识密度,保持了连贯性和深度。
-
5. 人类评估和专家反馈进一步证明了OmniThink在生成长文章方面的潜力。
+
Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps
+
1. 生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。
+
2. 最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。
+
3. 与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。
+
4. 在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。
+
5. 具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。
+
6. 我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。
+
7. 通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。
-
1. 这篇文章讨论了大语言模型生成文章的方法。通常,这些方法依赖检索增强生成,但信息冗余、浅显。为解决这些问题,作者提出了OmniThink框架,模拟人类迭代扩展和反思的过程。实验结果显示,OmniThink提高了生成文章的知识密度,保持了连贯性和深度。人类评估和专家反馈进一步证明了OmniThink在生成长文章方面的潜力。
-
-Zhè piān wénzhāng tǎolùn le dà yǔyán móxíng shēngchéng wénzhāng de fāngfǎ
-
2. Tōngcháng, zhèxiē fāngfǎ yīlài jiǎnsuǒ zēngqiáng shēngchéng, dàn xìnxī rǒngyú, qiǎnxiǎn
-
3. Wèi jiějué zhèxiē wèntí, zuòzhě tíchū le OmniThink kuàngjià, mónǐ rénlèi diédài kuòzhǎn hé fǎnsī de guòchéng
-
4. Shíyàn jiéguǒ xiǎnshì, OmniThink tígāo le shēngchéng wénzhāng de zhīshi mìdù, bǎochí le liánhéngxìng hé shēndù
-
5. Rénlèi pínggū hé zhuānjiā fǎnkuì jìnfù zhèngmíng le OmniThink zài shēngchéng cháng wénzhāng fāngmiàn de qiánlì
+
1. Shēngchéng móxíng zài gègè lǐngyù chǎnshēngle zhòngdà yǐngxiǎng, zhǔyào shì yīnwèi tāmen nénggòu tōngguò zēngjiā shùjù, jìsuàn zīyuán hé móxíng dàxiǎo lái kuòzhǎn xùnliàn, zhè zhǒng xiànxiàng bèi chēngwéi kuòzhǎn guīlǜ
+
2. Zuìjìn de yánjiū kāishǐ tànsuǒ dàxíng yǔyán móxíng (LLMs) zài tuīlǐ shí de kuòzhǎn xíngwéi, jiēshìle rúhé tōngguò tuīlǐ guòchéng zhōng de éwài jìsuàn lái jìn yībù tígāo xìngnéng
+
3. Yǔ LLMs bùtóng, kuòsàn móxíng tiānshēng jùyǒu tōngguò qùzào bùzhòu shùliàng lái tiáozhěng tuīlǐ shí jìsuàn de línghuóxìng, jǐnguǎn xìngnéng zēngyì tōngcháng zài jǐshí bù hòu qūyú píngwěn
+
4. Zài zhè xiàng gōngzuò zhōng, wǒmen tànsuǒle kuòsàn móxíng zài zēngjiā qùzào bùzhòu zhīwài de tuīlǐ shí kuòzhǎn xíngwéi, yánjiūle rúhé tōngguò zēngjiā jìsuàn lái jìn yībù tígāo shēngchéng xìngnéng
+
5. Jùtǐ lái shuō, wǒmen kǎolǜle yīgè sōusuǒ wèntí, zhǐ zài wèi kuòsàn cǎiyàng guòchéng zhǎo dào gèng hǎo de zàoshēng
+
6. Wǒmen jiāng shèjì kōngjiān fēn wéi liǎng gè zhóu: yòngyú tígōng fǎnkuì de yànzhèngqì, yǐjí yòngyú zhǎo dào gèng hǎo zàoshēng hòuxuǎn de suànfǎ
+
7. Tōngguò zài lèi tiáojiàn hé wénběn tiáojiàn túxiàng shēngchéng jīzhǔn shàng de dàliàng shíyàn, wǒmen fāxiàn zēngjiā tuīlǐ shí de jìsuànliàng xiǎnzhù tígāole kuòsàn móxíng shēngchéng yàngběn de zhìliàng, bìngqiě zài túxiàng de fùzáxìng fāngmiàn, kuàngjià zhōng de zǔjiàn zǔhé kěyǐ gēnjù bùtóng de yìngyòng chǎngjǐng jìnxíng xuǎnzé
-
1. This article discusses methods for generating articles using large language models.
-
2. Typically, these methods rely on retrieval-augmented generation but suffer from information redundancy and superficiality.
-
3. To address these issues, the authors propose the OmniThink framework, which simulates the human process of iterative expansion and reflection.
-
4. Experimental results show that OmniThink increases the knowledge density of generated articles while maintaining coherence and depth.
-
5. Human evaluations and expert feedback further demonstrate OmniThink's potential in generating long articles.
+
1. Generative models have had a significant impact across various domains, primarily because they can scale training by increasing data, computational resources, and model size, a phenomenon known as scaling laws.
+
2. Recent research has begun to explore the scaling behavior of large language models (LLMs) during inference, revealing how performance can be further enhanced through additional computation during the inference process.
+
3. Unlike LLMs, diffusion models inherently offer flexibility in adjusting computation during inference through the number of denoising steps, although performance gains typically plateau after a few dozen steps.
+
4. In this work, we investigate the scaling behavior of diffusion models during inference beyond increasing denoising steps, exploring how to further enhance generative performance by increasing computation.
+
5. Specifically, we consider a search problem aimed at finding better noise for the diffusion sampling process.
+
6. We divide the design space into two axes: the verifiers used to provide feedback and the algorithms used to find better noise candidates.
+
7. Through extensive experiments on class-conditional and text-conditional image generation benchmarks, we find that increasing the amount of computation during inference significantly improves the quality of samples generated by diffusion models.
+
8. Additionally, given the complex nature of images, the combinations of components in the framework can be chosen to suit different application scenarios.
Vocabulary
@@ -107,189 +112,171 @@ Vocabulary
- 讨论 | tǎo lùn | discuss
- 大语言模型 | dà yǔ yán mó xíng | large language model
- 生成 | shēng chéng | generate
+ 生成模型 | shēng chéng mó xíng | generative model
- 方法 | fāng fǎ | method
+ 重大 | zhòng dà | significant
- 依赖 | yī lài | rely on
+ 影响 | yǐng xiǎng | impact
- 检索 | jiǎn suǒ | retrieve
+ 领域 | lǐng yù | field
- 增强 | zēng qiáng | enhance
- 冗余 | rǒng yú | redundancy
+ 扩展 | kuò zhǎn | expand
- 浅显 | qiǎn xiǎn | superficial
+ 规律 | guī lǜ | law
- 解决 | jiě jué | solve
+ 推理 | tuī lǐ | inference
- 提出 | tí chū | propose
+ 行为 | xíng wéi | behavior
- 框架 | kuàng jià | framework
+ 揭示 | jiē shì | reveal
- 模拟 | mó nǐ | simulate
+ 灵活性 | líng huó xìng | flexibility
- 迭代 | dié dài | iterate
+ 趋于 | qū yú | tend towards
- 扩展 | kuò zhǎn | expand
+ 平稳 | píng wěn | stable
- 反思 | fǎn sī | reflect
+ 探讨 | tàn tǎo | discuss
- 过程 | guò chéng | process
+ 去噪 | qù zào | denoise
- 实验 | shí yàn | experiment
+ 步骤 | bù zhòu | step
- 结果 | jié guǒ | result
+ 搜索 | sōu suǒ | search
- 显示 | xiǎn shì | show
+ 采样 | cǎi yàng | sampling
- 提高 | tí gāo | improve
+ 验证器 | yàn zhèng qì | verifier
- 知识 | zhī shi | knowledge
+ 反馈 | fǎn kuì | feedback
- 密度 | mì dù | density
+ 算法 | suàn fǎ | algorithm
- 保持 | bǎo chí | maintain
+ 候选 | hòu xuǎn | candidate
- 连贯性 | lián guàn xìng | coherence
+ 基准 | jī zhǔn | benchmark
- 深度 | shēn dù | depth
+ 复杂性 | fù zá xìng | complexity
- 评估 | píng gū | evaluate
+ 框架 | kuàng jià | framework
- 反馈 | fǎn kuì | feedback
+ 组件 | zǔ jiàn | component
- 进一步 | jìn yī bù | further
+ 组合 | zǔ hé | combination
- 证明 | zhèng míng | prove
+ 应用 | yìng yòng | application
- 潜力 | qián lì | potential
+ 场景 | chǎng jǐng | scenario
diff --git a/hf_papers.json b/hf_papers.json
index 6dbdbf57..831d1db4 100644
--- a/hf_papers.json
+++ b/hf_papers.json
@@ -4,9 +4,9 @@
"en": "January 17",
"zh": "1月17日"
},
- "time_utc": "2025-01-18 12:37",
+ "time_utc": "2025-01-18 18:26",
"weekday": 4,
- "issue_id": 1741,
+ "issue_id": 1742,
"home_page_url": "https://huggingface.co/papers",
"papers": [
{
@@ -14,7 +14,7 @@
"title": "Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps",
"url": "https://huggingface.co/papers/2501.09732",
"abstract": "Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.",
- "score": 40,
+ "score": 41,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -69,7 +69,7 @@
"title": "OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking",
"url": "https://huggingface.co/papers/2501.09751",
"abstract": "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.",
- "score": 31,
+ "score": 33,
"issue_id": 1722,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -123,7 +123,7 @@
"title": "Learnings from Scaling Visual Tokenizers for Reconstruction and Generation",
"url": "https://huggingface.co/papers/2501.09755",
"abstract": "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.",
- "score": 22,
+ "score": 23,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -180,7 +180,7 @@
"title": "Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models",
"url": "https://huggingface.co/papers/2501.09686",
"abstract": "Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of \"thought\" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to \"think\" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI's o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.",
- "score": 20,
+ "score": 21,
"issue_id": 1720,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -246,7 +246,7 @@
"title": "Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators",
"url": "https://huggingface.co/papers/2501.09484",
"abstract": "Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the \"inquiry\" phase of the consultation process. This lack of focus has left the relationship between \"inquiry\" and \"diagnosis\" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between \"inquiry\" and \"diagnosis\" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig's law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.",
- "score": 16,
+ "score": 17,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -300,7 +300,7 @@
"title": "SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces",
"url": "https://huggingface.co/papers/2501.09756",
"abstract": "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/",
- "score": 14,
+ "score": 15,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -352,7 +352,7 @@
"title": "FAST: Efficient Action Tokenization for Vision-Language-Action Models",
"url": "https://huggingface.co/papers/2501.09747",
"abstract": "Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.",
- "score": 13,
+ "score": 15,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -407,7 +407,7 @@
"title": "Do generative video models learn physical principles from watching videos?",
"url": "https://huggingface.co/papers/2501.09038",
"abstract": "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.",
- "score": 11,
+ "score": 14,
"issue_id": 1725,
"pub_date": "2025-01-14",
"pub_date_card": {
@@ -454,7 +454,7 @@
"title": "CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation",
"url": "https://huggingface.co/papers/2501.09433",
"abstract": "The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.",
- "score": 10,
+ "score": 12,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
@@ -548,7 +548,7 @@
"title": "RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation",
"url": "https://huggingface.co/papers/2501.08617",
"abstract": "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.",
- "score": 7,
+ "score": 8,
"issue_id": 1720,
"pub_date": "2025-01-15",
"pub_date_card": {
@@ -597,7 +597,7 @@
"title": "AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation",
"url": "https://huggingface.co/papers/2501.09503",
"abstract": "Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an \"encode-then-route\" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .",
- "score": 6,
+ "score": 7,
"issue_id": 1721,
"pub_date": "2025-01-16",
"pub_date_card": {
diff --git a/index.html b/index.html
index 1d98c0d1..959cbad1 100644
--- a/index.html
+++ b/index.html
@@ -881,7 +881,7 @@
}
}
- const articlesData = [{'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 40, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. 
Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 31, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. 
Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 22, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. 
ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. 
We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 20, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. 
By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. 
Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 14, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. 
However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 13, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. 
The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 11, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. 
This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 10, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. 
CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 9, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. 
While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 7, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. 
Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 6, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. 
The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}];
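Both the removed and the added array use the same entry schema (id, title, url, abstract, score, pub_date_card, authors, affiliations, data.categories, and localized data.{ru,en,zh} cards); only the scores and the set of papers differ between the two sides of this hunk. As a minimal sketch of how entries with this shape could be consumed client-side, the helper below filters by category tag and sorts by score; the function name and the rendering choices are hypothetical and are not taken from the page's actual code, only the field names visible in the entries above are assumed.

// Illustrative only: consume articlesData-style entries.
// Field names (score, url, pub_date_card, data.categories, data.<lang>.title/desc)
// are taken from the entries in this hunk; `topArticlesByCategory` itself is hypothetical.
function topArticlesByCategory(articles, category, lang = 'en', limit = 5) {
  return articles
    .filter(a => a.data.categories.includes(category)) // keep entries tagged with the category
    .sort((a, b) => b.score - a.score)                 // highest score first
    .slice(0, limit)
    .map(a => ({
      title: a.data[lang].title,        // localized card title
      summary: a.data[lang].desc,       // localized one-paragraph summary
      link: a.url,
      published: a.pub_date_card[lang], // localized publication date label
    }));
}

// Example: the five highest-scoring '#diffusion' papers with English card text.
// const cards = topArticlesByCategory(articlesData, '#diffusion', 'en', 5);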
+ const articlesData = [{'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 41, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. 
Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 33, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. 
Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 23, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. 
ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. 
We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 21, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. 
By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 17, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. 
Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 15, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. 
However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 15, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. 
The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 14, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. 
This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 12, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. 
CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 9, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. 
While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 8, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. 
Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 7, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. 
The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}];
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
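The context lines above show how the generated page wires its data to the DOM: `articlesData` holds one object per paper (id, title, abstract, score, authors, plus hashtag `categories` and per-language summaries under `data.ru`/`data.en`/`data.zh`), which the page then renders into `articles-container` using the sort dropdown and the category filters. As a minimal illustrative sketch only, assuming a hypothetical `renderArticle` helper (the page's real rendering code lies outside this hunk), category filtering over that structure could look like:

// Sketch only, not the page's actual code: render the papers whose
// data.categories contain the selected hashtag into #articles-container.
// renderArticle is a hypothetical helper returning a DOM node per paper.
function showCategory(tag) {
  const matches = articlesData.filter(a => a.data.categories.includes(tag));
  articlesContainer.innerHTML = '';
  for (const article of matches) {
    articlesContainer.appendChild(renderArticle(article));
  }
}
// e.g. showCategory('#diffusion');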
@@ -1184,7 +1184,7 @@
function updateTimeDiffs() {
const timeDiff = document.getElementById('timeDiff');
- timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 12:37',lang=currentLang);
+ timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 18:26',lang=currentLang);
}
function updateSortingOptions() {
const sortingLabels = {
@@ -1238,14 +1238,14 @@
}
function hideNextLink(format) {
if (format === 'monthly') {
- if (isCurrentMonth('2025-01-18 12:37')) {
+ if (isCurrentMonth('2025-01-18 18:26')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';
}
}
} else {
- if (isToday('2025-01-18 12:37')) {
+ if (isToday('2025-01-18 18:26')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';
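This hunk only bumps the embedded build timestamp from `2025-01-18 12:37` to `2025-01-18 18:26`; the surrounding logic is untouched. `hideNextLink` hides the `nav-next` link whenever the page being viewed is already the latest one, using `isCurrentMonth` for monthly pages and `isToday` for daily pages. Those helpers are defined elsewhere in the file; a minimal sketch of the behavior they are assumed to have (not the file's actual implementation) is:

// Assumed behavior of the date helpers called by hideNextLink above.
// Illustrative sketch only; the shipped implementation is not in this hunk.
function isToday(dateString) {
  const page = new Date(dateString.replace(' ', 'T'));
  const now = new Date();
  return page.getFullYear() === now.getFullYear() &&
         page.getMonth() === now.getMonth() &&
         page.getDate() === now.getDate();
}
function isCurrentMonth(dateString) {
  const page = new Date(dateString.replace(' ', 'T'));
  const now = new Date();
  return page.getFullYear() === now.getFullYear() &&
         page.getMonth() === now.getMonth();
}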
diff --git a/log.txt b/log.txt
index 30c153aa..57c7822f 100644
--- a/log.txt
+++ b/log.txt
@@ -1,3 +1,3 @@
-[18.01.2025 12:37] Read previous papers.
-[18.01.2025 12:37] Generating top page (month).
-[18.01.2025 12:37] Writing top page (month).
+[18.01.2025 18:26] Read previous papers.
+[18.01.2025 18:26] Generating top page (month).
+[18.01.2025 18:26] Writing top page (month).
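The two log diffs reflect the generator's log rotation: `log.txt` keeps only the entries of the most recent run (hence the wholesale 12:37 to 18:26 timestamp change above), while the full log of the previous run is archived as `logs/<date>_last_log.txt`, as the removed entry "Renaming previous data. log.txt to ./logs/2025-01-18_last_log.txt" further down records. A minimal Node-style sketch of that rotation step, illustrative only since the pipeline's real implementation (and its language) is not part of this diff:

// Illustrative sketch of the log rotation visible in these diffs;
// not the pipeline's actual code. Paths mirror those seen in the logs.
const fs = require('fs');
function rotateLog(dateStr) {              // e.g. '2025-01-18'
  const archived = `./logs/${dateStr}_last_log.txt`;
  if (fs.existsSync('log.txt')) {
    fs.renameSync('log.txt', archived);    // "Renaming previous data. log.txt to ./logs/..."
  }
  fs.writeFileSync('log.txt', '');         // the new run then appends its own entries
}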
diff --git a/logs/2025-01-18_last_log.txt b/logs/2025-01-18_last_log.txt
index 31c18ec7..8ce58e3b 100644
--- a/logs/2025-01-18_last_log.txt
+++ b/logs/2025-01-18_last_log.txt
@@ -1,169 +1,122 @@
-[18.01.2025 06:26] Read previous papers.
-[18.01.2025 06:26] Generating top page (month).
-[18.01.2025 06:26] Writing top page (month).
[18.01.2025 12:37] Read previous papers.
-[18.01.2025 12:37] Get feed.
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09732
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09751
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09755
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09686
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09484
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09756
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09747
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09038
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09433
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09653
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.08617
-[18.01.2025 12:37] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09503
-[18.01.2025 12:37] Obtaining deleted papers (sometimes HF Daily Papers move some articles from today to past days).
-[18.01.2025 12:37] No deleted papers detected.
-[18.01.2025 12:37] Downloading and parsing papers (pdf, html). Total: 12.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09732.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09732.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09732.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09751.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09751.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09751.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09755.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09755.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09755.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09686.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09686.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09686.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09484.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09484.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09484.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09756.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09756.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09756.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09747.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09747.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09747.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09038.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09038.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09038.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09433.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09433.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09433.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09653.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09653.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09653.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.08617.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.08617.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.08617.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Downloading and parsing paper https://huggingface.co/papers/2501.09503.
-[18.01.2025 12:37] Extra JSON file exists (./assets/json/2501.09503.json), skip PDF parsing.
-[18.01.2025 12:37] Paper image links file exists (./assets/img_data/2501.09503.json), skip HTML parsing.
-[18.01.2025 12:37] Success.
-[18.01.2025 12:37] Enriching papers with extra data.
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 0. Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behav...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 1. Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to l...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 2. Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions abou...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 3. Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by ...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 4. Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant pot...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 5. We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 6. Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how th...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 7. AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achiev...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 8. The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, an...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 9. The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without su...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 10. Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predomin...
-[18.01.2025 12:37] ********************************************************************************
-[18.01.2025 12:37] Abstract 11. Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a ...
-[18.01.2025 12:37] Read previous papers.
-[18.01.2025 12:37] Generating reviews via LLM API.
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#diffusion", "#inference", "#benchmark", "#optimization"], "emoji": "🔍", "ru": {"title": "Повышение качества генерации изображений за счет масштабирования вычислений при выводе", "desc": "Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислен
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#rag", "#story_generation", "#long_context", "#multimodal"], "emoji": "🧠", "ru": {"title": "OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста", "desc": "Статья представляет новый подход к генерации текста с использованием больших языковых моделей, на
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#cv", "#benchmark", "#video", "#optimization", "#architecture", "#diffusion"], "emoji": "🔬", "ru": {"title": "ViTok: Оптимизация визуальной токенизации для генеративных моделей", "desc": "Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделя
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#open_source", "#training", "#rl", "#survey", "#reasoning", "#dataset"], "emoji": "🧠", "ru": {"title": "Путь к большим моделям рассуждений: новый рубеж в ИИ", "desc": "Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваютс
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#data", "#training", "#science", "#open_source", "#healthcare"], "emoji": "🩺", "ru": {"title": "Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ", "desc": "Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#dataset", "#3d", "#inference", "#cv", "#diffusion", "#training", "#synthetic"], "emoji": "💡", "ru": {"title": "SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели", "desc": "SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматрив
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#dataset", "#agents", "#training", "#games", "#optimization", "#robotics"], "emoji": "🤖", "ru": {"title": "Революция в токенизации действий робота: от частотного пространства к универсальности", "desc": "Статья представляет новый метод токенизации действий робота под названием FAST
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#benchmark", "#science", "#video"], "emoji": "🧠", "ru": {"title": "Визуальный реализм не гарантирует понимание физики в ИИ", "desc": "Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моде
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#diffusion", "#3d", "#optimization"], "emoji": "🎨", "ru": {"title": "CaPa: Революция в генерации 3D-моделей", "desc": "В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощь
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#low_resource", "#multilingual", "#open_source", "#data", "#dataset"], "emoji": "🗃️", "ru": {"title": "The Heap: чистый код для честной оценки языковых моделей", "desc": "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор д
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#rlhf", "#alignment", "#training", "#rl"], "emoji": "🔮", "ru": {"title": "Взгляд в будущее для лучшей настройки ИИ", "desc": "Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS испо
-[18.01.2025 12:37] Using data from previous issue: {"categories": ["#cv", "#multimodal"], "emoji": "🎨", "ru": {"title": "AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами", "desc": "Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод
-[18.01.2025 12:37] Trying to get texts in Chinese.
-[18.01.2025 12:37] Mistral request. Model: mistral-large-latest. Prompt: Write simple and brief explanation (4-5 sentences) of an article in Chinese. Use short sentences. Text:
-
-Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.
-[18.01.2025 12:37] Mistral response. {"id": "8e1c4fd3ed4c4bc0a6f4966bbf657fd2", "object": "chat.completion", "created": 1737203830, "model": "mistral-large-latest", "choices": [{"index": 0, "message": {"role": "assistant", "tool_calls": null, "content": "\u751f\u6210\u6a21\u578b\u5728\u5404\u4e2a\u9886\u57df\u4ea7\u751f\u4e86\u91cd\u5927\u5f71\u54cd\uff0c\u4e3b\u8981\u662f\u56e0\u4e3a\u5b83\u4eec\u80fd\u591f\u901a\u8fc7\u589e\u52a0\u6570\u636e\u3001\u8ba1\u7b97\u8d44\u6e90\u548c\u6a21\u578b\u5927\u5c0f\u6765\u6269\u5c55\u8bad\u7ec3\uff0c\u8fd9\u79cd\u73b0\u8c61\u88ab\u79f0\u4e3a\u6269\u5c55\u89c4\u5f8b\u3002\u6700\u8fd1\u7684\u7814\u7a76\u5f00\u59cb\u63a2\u7d22\u5927\u578b\u8bed\u8a00\u6a21\u578b\uff08LLMs\uff09\u5728\u63a8\u7406\u65f6\u7684\u6269\u5c55\u884c\u4e3a\uff0c\u63ed\u793a\u4e86\u5982\u4f55\u901a\u8fc7\u63a8\u7406\u8fc7\u7a0b\u4e2d\u7684\u989d\u5916\u8ba1\u7b97\u6765\u8fdb\u4e00\u6b65\u63d0\u9ad8\u6027\u80fd\u3002\u4e0eLLMs\u4e0d\u540c\uff0c\u6269\u6563\u6a21\u578b\u5929\u751f\u5177\u6709\u901a\u8fc7\u53bb\u566a\u6b65\u9aa4\u6570\u91cf\u6765\u8c03\u6574\u63a8\u7406\u65f6\u8ba1\u7b97\u7684\u7075\u6d3b\u6027\uff0c\u5c3d\u7ba1\u6027\u80fd\u589e\u76ca\u901a\u5e38\u5728\u51e0\u5341\u6b65\u540e\u8d8b\u4e8e\u5e73\u7a33\u3002\u5728\u8fd9\u9879\u5de5\u4f5c\u4e2d\uff0c\u6211\u4eec\u63a2\u8ba8\u4e86\u6269\u6563\u6a21\u578b\u5728\u589e\u52a0\u53bb\u566a\u6b65\u9aa4\u4e4b\u5916\u7684\u63a8\u7406\u65f6\u6269\u5c55\u884c\u4e3a\uff0c\u7814\u7a76\u4e86\u5982\u4f55\u901a\u8fc7\u589e\u52a0\u8ba1\u7b97\u6765\u8fdb\u4e00\u6b65\u63d0\u9ad8\u751f\u6210\u6027\u80fd\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u6211\u4eec\u8003\u8651\u4e86\u4e00\u4e2a\u641c\u7d22\u95ee\u9898\uff0c\u65e8\u5728\u4e3a\u6269\u6563\u91c7\u6837\u8fc7\u7a0b\u627e\u5230\u66f4\u597d\u7684\u566a\u58f0\u3002\u6211\u4eec\u5c06\u8bbe\u8ba1\u7a7a\u95f4\u5206\u4e3a\u4e24\u4e2a\u8f74\uff1a\u7528\u4e8e\u63d0\u4f9b\u53cd\u9988\u7684\u9a8c\u8bc1\u5668\uff0c\u4ee5\u53ca\u7528\u4e8e\u627e\u5230\u66f4\u597d\u566a\u58f0\u5019\u9009\u7684\u7b97\u6cd5\u3002\u901a\u8fc7\u5728\u7c7b\u6761\u4ef6\u548c\u6587\u672c\u6761\u4ef6\u56fe\u50cf\u751f\u6210\u57fa\u51c6\u4e0a\u7684\u5927\u91cf\u5b9e\u9a8c\uff0c\u6211\u4eec\u53d1\u73b0\u589e\u52a0\u63a8\u7406\u65f6\u7684\u8ba1\u7b97\u91cf\u663e\u8457\u63d0\u9ad8\u4e86\u6269\u6563\u6a21\u578b\u751f\u6210\u6837\u672c\u7684\u8d28\u91cf\uff0c\u5e76\u4e14\u5728\u56fe\u50cf\u7684\u590d\u6742\u6027\u65b9\u9762\uff0c\u6846\u67b6\u4e2d\u7684\u7ec4\u4ef6\u7ec4\u5408\u53ef\u4ee5\u6839\u636e\u4e0d\u540c\u7684\u5e94\u7528\u573a\u666f\u8fdb\u884c\u9009\u62e9\u3002"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 292, "total_tokens": 703, "completion_tokens": 411}}
-[18.01.2025 12:37] Response: 生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。
-[18.01.2025 12:37] Mistral request. Model: mistral-large-latest. Prompt: Write pinyin transcription for text. Text:
-
-生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。
-[18.01.2025 12:37] Mistral response. {"id": "93e0d65dfe3a48f89c338ff34133d897", "object": "chat.completion", "created": 1737203839, "model": "mistral-large-latest", "choices": [{"index": 0, "message": {"role": "assistant", "tool_calls": null, "content": "Sh\u0113ngch\u00e9ng m\u00f3x\u00edng z\u00e0i g\u00e8g\u00e8 l\u01d0ngy\u00f9 ch\u01censh\u0113ngle zh\u00f2ngd\u00e0 y\u01d0ngxi\u01ceng, zh\u01d4y\u00e0o sh\u00ec y\u012bnw\u00e8i t\u0101men n\u00e9ngg\u00f2u t\u014dnggu\u00f2 z\u0113ngji\u0101 sh\u00f9j\u00f9, j\u00ecsu\u00e0n z\u012byu\u00e1n h\u00e9 m\u00f3x\u00edng d\u00e0x\u00edng l\u00e1i ku\u00f2zh\u01cen x\u00f9nli\u00e0n, zh\u00e8 zh\u01d2ng xi\u00e0nxi\u00e0ng b\u00e8i ch\u0113ngw\u00e9i ku\u00f2zh\u01cen gu\u012bl\u01dc. Zu\u00ecj\u00ecn de y\u00e1nji\u016b k\u0101ish\u01d0 tu\u00e0nsu\u01d2 d\u00e0x\u00edng y\u01d4y\u00e1n m\u00f3x\u00edng (LLMs) z\u00e0i tu\u012bl\u01d0 sh\u00ed de ku\u00f2zh\u01cen x\u00edngw\u00e9i, ji\u0113sh\u00ecle r\u00fah\u00e9 t\u014dnggu\u00f2 tu\u012bl\u01d0 gu\u00f2ch\u00e9ng zh\u014dng de \u00e9x\u01d4 j\u00ecsu\u00e0n l\u00e1i j\u00ecn y\u012bb\u00f9 t\u00edg\u0101o x\u00ecngn\u00e9ng. Y\u01d4 LLMs b\u00f9t\u00f3ng, ku\u00f2s\u00e0n m\u00f3x\u00edng ti\u0101nsh\u0113ng j\u00f9y\u01d2u t\u014dnggu\u00f2 q\u00f9z\u00e0o b\u00f9zh\u00f2u sh\u00f9li\u00e0ng l\u00e1i ti\u00e1oj\u01d0 tu\u012bl\u01d0 sh\u00ed j\u00ecsu\u00e0n de l\u00ednghu\u00f3x\u00ecng, j\u01d0ngu\u01cen x\u00ecngn\u00e9ng z\u0113ngy\u00ec t\u014dngch\u00e1ng z\u00e0i j\u01d0sh\u00ed b\u00f9 h\u00f2u q\u016by\u00fa p\u00edngw\u011bn. Z\u00e0i zh\u00e8 xi\u00e0ng g\u014dngzu\u00f2 zh\u014dng, w\u01d2men tu\u00e0nsu\u01d2le ku\u00f2s\u00e0n m\u00f3x\u00edng z\u00e0i z\u0113ngji\u0101 q\u00f9z\u00e0o b\u00f9zh\u00f2u zh\u012bw\u00e0i de tu\u012bl\u01d0 sh\u00ed ku\u00f2zh\u01cen x\u00edngw\u00e9i, y\u00e1nji\u016ble r\u00fah\u00e9 t\u014dnggu\u00f2 z\u0113ngji\u0101 j\u00ecsu\u00e0n l\u00e1i j\u00ecn y\u012bb\u00f9 t\u00edg\u0101o sh\u0113ngch\u00e9ng x\u00ecngn\u00e9ng. J\u00f9t\u01d0 l\u00e1i shu\u014d, w\u01d2men k\u01ceol\u01dcle y\u012bg\u00e8 s\u014dusu\u01d2 w\u00e8nt\u00ed, zh\u01d0 z\u00e0i w\u00e8i ku\u00f2s\u00e0n c\u01ceiy\u00e0ng gu\u00f2ch\u00e9ng zh\u01ceo d\u00e0o g\u00e8ng h\u01ceo de z\u00e0osh\u0113ng. W\u01d2men ji\u0101ng sh\u00e8j\u00ec k\u014dngji\u0101n f\u0113n w\u00e9i li\u01ceng g\u00e8 zh\u00f3u: y\u00f2ngy\u00fa t\u00edg\u014dng f\u01cenku\u00ec de y\u00e0nzh\u00e8ngq\u00ec, y\u01d0j\u00ed y\u00f2ngy\u00fa zh\u01ceo d\u00e0o g\u00e8ng h\u01ceo z\u00e0osh\u0113ng h\u00f2uxu\u01cen de su\u00e0nf\u01ce. T\u014dnggu\u00f2 z\u00e0i l\u00e8i ti\u00e1oji\u00e0n h\u00e9 w\u00e9nb\u011bn ti\u00e1oji\u00e0n t\u00faxi\u00e0ng sh\u0113ngch\u00e9ng b\u01d0zh\u01d4n sh\u00e0ng de d\u00e0li\u00e0ng sh\u00edy\u00e0n, w\u01d2men f\u0101xi\u00e0n z\u0113ngji\u0101 tu\u012bl\u01d0 sh\u00ed de j\u00ecsu\u00e0nli\u00e0ng xi\u01cenzh\u00f9 t\u00edg\u0101ole ku\u00f2s\u00e0n m\u00f3x\u00edng sh\u0113ngch\u00e9ng y\u00e0ngb\u01cen de zh\u00ecli\u00e0ng, b\u00ecngqi\u011b z\u00e0i t\u00faxi\u00e0ng de f\u00f9z\u00e1x\u00ecng f\u0101ngmi\u00e0n, ku\u00e0ngji\u00e0 zh\u014dng de z\u01d4ji\u00e0n z\u01d4h\u00e9 k\u011by\u01d0 g\u0113nj\u00f9 b\u00f9t\u00f3ng de y\u00ecngy\u00f2ng ch\u01cengj\u012bng j\u00ecnx\u00edng xu\u01cenz\u00e9."}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 426, "total_tokens": 1338, "completion_tokens": 912}}
-[18.01.2025 12:37] Response: Shēngchéng móxíng zài gègè lǐngyù chǎnshēngle zhòngdà yǐngxiǎng, zhǔyào shì yīnwèi tāmen nénggòu tōngguò zēngjiā shùjù, jìsuàn zīyuán hé móxíng dàxíng lái kuòzhǎn xùnliàn, zhè zhǒng xiànxiàng bèi chēngwéi kuòzhǎn guīlǜ. Zuìjìn de yánjiū kāishǐ tuànsuǒ dàxíng yǔyán móxíng (LLMs) zài tuīlǐ shí de kuòzhǎn xíngwéi, jiēshìle rúhé tōngguò tuīlǐ guòchéng zhōng de éxǔ jìsuàn lái jìn yībù tígāo xìngnéng. Yǔ LLMs bùtóng, kuòsàn móxíng tiānshēng jùyǒu tōngguò qùzào bùzhòu shùliàng lái tiáojǐ tuīlǐ shí jìsuàn de línghuóxìng, jǐnguǎn xìngnéng zēngyì tōngcháng zài jǐshí bù hòu qūyú píngwěn. Zài zhè xiàng gōngzuò zhōng, wǒmen tuànsuǒle kuòsàn móxíng zài zēngjiā qùzào bùzhòu zhīwài de tuīlǐ shí kuòzhǎn xíngwéi, yánjiūle rúhé tōngguò zēngjiā jìsuàn lái jìn yībù tígāo shēngchéng xìngnéng. Jùtǐ lái shuō, wǒmen kǎolǜle yīgè sōusuǒ wèntí, zhǐ zài wèi kuòsàn cǎiyàng guòchéng zhǎo dào gèng hǎo de zàoshēng. Wǒmen jiāng shèjì kōngjiān fēn wéi liǎng gè zhóu: yòngyú tígōng fǎnkuì de yànzhèngqì, yǐjí yòngyú zhǎo dào gèng hǎo zàoshēng hòuxuǎn de suànfǎ. Tōngguò zài lèi tiáojiàn hé wénběn tiáojiàn túxiàng shēngchéng bǐzhǔn shàng de dàliàng shíyàn, wǒmen fāxiàn zēngjiā tuīlǐ shí de jìsuànliàng xiǎnzhù tígāole kuòsàn móxíng shēngchéng yàngbǎn de zhìliàng, bìngqiě zài túxiàng de fùzáxìng fāngmiàn, kuàngjià zhōng de zǔjiàn zǔhé kěyǐ gēnjù bùtóng de yìngyòng chǎngjīng jìnxíng xuǎnzé.
-[18.01.2025 12:37] Mistral request. Model: mistral-large-latest. Prompt: Write vocab of difficult words for this text as an array of objects with fields 'word', 'pinyin', 'trans'. Return as python list without formatting. Return list and nothing else. Text:
-
-生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。
-[18.01.2025 12:37] Mistral response. {"id": "8f30fab56e8c4f68aee935ed9686c5c2", "object": "chat.completion", "created": 1737203857, "model": "mistral-large-latest", "choices": [{"index": 0, "message": {"role": "assistant", "tool_calls": null, "content": "[{'word': '\u751f\u6210\u6a21\u578b', 'pinyin': 'sh\u0113ng ch\u00e9ng m\u00f3 x\u00edng', 'trans': 'generative model'},\n{'word': '\u91cd\u5927', 'pinyin': 'zh\u00f2ng d\u00e0', 'trans': 'significant'},\n{'word': '\u5f71\u54cd', 'pinyin': 'y\u01d0ng xi\u01ceng', 'trans': 'impact'},\n{'word': '\u9886\u57df', 'pinyin': 'l\u01d0ng y\u00f9', 'trans': 'field'},\n{'word': '\u6269\u5c55', 'pinyin': 'ku\u00f2 zh\u01cen', 'trans': 'expand'},\n{'word': '\u89c4\u5f8b', 'pinyin': 'gu\u012b l\u01dc', 'trans': 'pattern'},\n{'word': '\u63a8\u7406', 'pinyin': 'tu\u012b l\u01d0', 'trans': 'reasoning'},\n{'word': '\u884c\u4e3a', 'pinyin': 'x\u00edng w\u00e9i', 'trans': 'behavior'},\n{'word': '\u63ed\u793a', 'pinyin': 'ji\u0113 sh\u00ec', 'trans': 'reveal'},\n{'word': '\u7075\u6d3b\u6027', 'pinyin': 'l\u00edng hu\u00f3 x\u00ecng', 'trans': 'flexibility'},\n{'word': '\u8d8b\u4e8e', 'pinyin': 'q\u016b y\u00fa', 'trans': 'tend towards'},\n{'word': '\u5e73\u7a33', 'pinyin': 'p\u00edng w\u011bn', 'trans': 'stable'},\n{'word': '\u63a2\u8ba8', 'pinyin': 't\u00e0n t\u01ceo', 'trans': 'discuss'},\n{'word': '\u53bb\u566a', 'pinyin': 'q\u00f9 z\u00e0o', 'trans': 'denoise'},\n{'word': '\u6b65\u9aa4', 'pinyin': 'b\u00f9 zh\u00f2u', 'trans': 'step'},\n{'word': '\u641c\u7d22', 'pinyin': 's\u014du su\u01d2', 'trans': 'search'},\n{'word': '\u91c7\u6837', 'pinyin': 'c\u01cei y\u00e0ng', 'trans': 'sampling'},\n{'word': '\u9a8c\u8bc1\u5668', 'pinyin': 'y\u00e0n zh\u00e8ng q\u00ec', 'trans': 'validator'},\n{'word': '\u53cd\u9988', 'pinyin': 'f\u01cen ku\u00ec', 'trans': 'feedback'},\n{'word': '\u7b97\u6cd5', 'pinyin': 'su\u00e0n f\u01ce', 'trans': 'algorithm'},\n{'word': '\u5019\u9009', 'pinyin': 'h\u00f2u xu\u01cen', 'trans': 'candidate'},\n{'word': '\u57fa\u51c6', 'pinyin': 'j\u012b zh\u01d4n', 'trans': 'benchmark'},\n{'word': '\u590d\u6742\u6027', 'pinyin': 'f\u00f9 z\u00e1 x\u00ecng', 'trans': 'complexity'},\n{'word': '\u6846\u67b6', 'pinyin': 'ku\u00e0ng ji\u00e0', 'trans': 'framework'},\n{'word': '\u7ec4\u4ef6', 'pinyin': 'z\u01d4 ji\u00e0n', 'trans': 'component'},\n{'word': '\u7ec4\u5408', 'pinyin': 'z\u01d4 h\u00e9', 'trans': 'combination'},\n{'word': '\u5e94\u7528', 'pinyin': 'y\u00ecng y\u00f2ng', 'trans': 'application'},\n{'word': '\u573a\u666f', 'pinyin': 'ch\u01ceng j\u01d0ng', 'trans': 'scenario'}]"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 458, "total_tokens": 1252, "completion_tokens": 794}}
-[18.01.2025 12:37] Response: [{'word': '生成模型', 'pinyin': 'shēng chéng mó xíng', 'trans': 'generative model'},
-{'word': '重大', 'pinyin': 'zhòng dà', 'trans': 'significant'},
-{'word': '影响', 'pinyin': 'yǐng xiǎng', 'trans': 'impact'},
-{'word': '领域', 'pinyin': 'lǐng yù', 'trans': 'field'},
-{'word': '扩展', 'pinyin': 'kuò zhǎn', 'trans': 'expand'},
-{'word': '规律', 'pinyin': 'guī lǜ', 'trans': 'pattern'},
-{'word': '推理', 'pinyin': 'tuī lǐ', 'trans': 'reasoning'},
-{'word': '行为', 'pinyin': 'xíng wéi', 'trans': 'behavior'},
-{'word': '揭示', 'pinyin': 'jiē shì', 'trans': 'reveal'},
-{'word': '灵活性', 'pinyin': 'líng huó xìng', 'trans': 'flexibility'},
-{'word': '趋于', 'pinyin': 'qū yú', 'trans': 'tend towards'},
-{'word': '平稳', 'pinyin': 'píng wěn', 'trans': 'stable'},
-{'word': '探讨', 'pinyin': 'tàn tǎo', 'trans': 'discuss'},
-{'word': '去噪', 'pinyin': 'qù zào', 'trans': 'denoise'},
-{'word': '步骤', 'pinyin': 'bù zhòu', 'trans': 'step'},
-{'word': '搜索', 'pinyin': 'sōu suǒ', 'trans': 'search'},
-{'word': '采样', 'pinyin': 'cǎi yàng', 'trans': 'sampling'},
-{'word': '验证器', 'pinyin': 'yàn zhèng qì', 'trans': 'validator'},
-{'word': '反馈', 'pinyin': 'fǎn kuì', 'trans': 'feedback'},
-{'word': '算法', 'pinyin': 'suàn fǎ', 'trans': 'algorithm'},
-{'word': '候选', 'pinyin': 'hòu xuǎn', 'trans': 'candidate'},
-{'word': '基准', 'pinyin': 'jī zhǔn', 'trans': 'benchmark'},
-{'word': '复杂性', 'pinyin': 'fù zá xìng', 'trans': 'complexity'},
-{'word': '框架', 'pinyin': 'kuàng jià', 'trans': 'framework'},
-{'word': '组件', 'pinyin': 'zǔ jiàn', 'trans': 'component'},
-{'word': '组合', 'pinyin': 'zǔ hé', 'trans': 'combination'},
-{'word': '应用', 'pinyin': 'yìng yòng', 'trans': 'application'},
-{'word': '场景', 'pinyin': 'chǎng jǐng', 'trans': 'scenario'}]
-[18.01.2025 12:37] Mistral request. Model: mistral-large-latest. Prompt: Translate this text in English. Text:
-
-生成模型在各个领域产生了重大影响,主要是因为它们能够通过增加数据、计算资源和模型大小来扩展训练,这种现象被称为扩展规律。最近的研究开始探索大型语言模型(LLMs)在推理时的扩展行为,揭示了如何通过推理过程中的额外计算来进一步提高性能。与LLMs不同,扩散模型天生具有通过去噪步骤数量来调整推理时计算的灵活性,尽管性能增益通常在几十步后趋于平稳。在这项工作中,我们探讨了扩散模型在增加去噪步骤之外的推理时扩展行为,研究了如何通过增加计算来进一步提高生成性能。具体来说,我们考虑了一个搜索问题,旨在为扩散采样过程找到更好的噪声。我们将设计空间分为两个轴:用于提供反馈的验证器,以及用于找到更好噪声候选的算法。通过在类条件和文本条件图像生成基准上的大量实验,我们发现增加推理时的计算量显著提高了扩散模型生成样本的质量,并且在图像的复杂性方面,框架中的组件组合可以根据不同的应用场景进行选择。
-[18.01.2025 12:37] Mistral response. {"id": "f86e66f888b24c48addbdcff5084c67d", "object": "chat.completion", "created": 1737203872, "model": "mistral-large-latest", "choices": [{"index": 0, "message": {"role": "assistant", "tool_calls": null, "content": "Generative models have had a significant impact across various domains, primarily because they can scale training by increasing data, computational resources, and model size, a phenomenon known as scaling laws. Recent research has begun to explore the scaling behavior of large language models (LLMs) during inference, revealing how performance can be further enhanced through additional computation during the inference process. Unlike LLMs, diffusion models inherently offer flexibility in adjusting computation during inference through the number of denoising steps, although performance gains typically plateau after a few dozen steps. In this work, we investigate the scaling behavior of diffusion models during inference beyond increasing denoising steps, exploring how to further enhance generative performance by increasing computation. Specifically, we consider a search problem aimed at finding better noise for the diffusion sampling process. We divide the design space into two axes: the validator used to provide feedback and the algorithm used to find better noise candidates. Through extensive experiments on class-conditional and text-conditional image generation benchmarks, we find that increasing the amount of computation during inference significantly improves the quality of samples generated by diffusion models. Additionally, the combination of components in the framework can be selected based on different application scenarios in terms of image complexity."}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 424, "total_tokens": 680, "completion_tokens": 256}}
-[18.01.2025 12:37] Response: Generative models have had a significant impact across various domains, primarily because they can scale training by increasing data, computational resources, and model size, a phenomenon known as scaling laws. Recent research has begun to explore the scaling behavior of large language models (LLMs) during inference, revealing how performance can be further enhanced through additional computation during the inference process. Unlike LLMs, diffusion models inherently offer flexibility in adjusting computation during inference through the number of denoising steps, although performance gains typically plateau after a few dozen steps. In this work, we investigate the scaling behavior of diffusion models during inference beyond increasing denoising steps, exploring how to further enhance generative performance by increasing computation. Specifically, we consider a search problem aimed at finding better noise for the diffusion sampling process. We divide the design space into two axes: the validator used to provide feedback and the algorithm used to find better noise candidates. Through extensive experiments on class-conditional and text-conditional image generation benchmarks, we find that increasing the amount of computation during inference significantly improves the quality of samples generated by diffusion models. Additionally, the combination of components in the framework can be selected based on different application scenarios in terms of image complexity.
-[18.01.2025 12:37] Renaming data file.
-[18.01.2025 12:37] Renaming previous data. hf_papers.json to ./d/2025-01-17.json
-[18.01.2025 12:37] Saving new data file.
-[18.01.2025 12:37] Generating page.
-[18.01.2025 12:37] Renaming previous page.
-[18.01.2025 12:37] Renaming previous data. index.html to ./d/2025-01-17.html
-[18.01.2025 12:37] [Experimental] Generating Chinese page for reading.
-[18.01.2025 12:37] Chinese vocab [{'word': '生成模型', 'pinyin': 'shēng chéng mó xíng', 'trans': 'generative model'}, {'word': '重大', 'pinyin': 'zhòng dà', 'trans': 'significant'}, {'word': '影响', 'pinyin': 'yǐng xiǎng', 'trans': 'impact'}, {'word': '领域', 'pinyin': 'lǐng yù', 'trans': 'field'}, {'word': '扩展', 'pinyin': 'kuò zhǎn', 'trans': 'expand'}, {'word': '规律', 'pinyin': 'guī lǜ', 'trans': 'pattern'}, {'word': '推理', 'pinyin': 'tuī lǐ', 'trans': 'reasoning'}, {'word': '行为', 'pinyin': 'xíng wéi', 'trans': 'behavior'}, {'word': '揭示', 'pinyin': 'jiē shì', 'trans': 'reveal'}, {'word': '灵活性', 'pinyin': 'líng huó xìng', 'trans': 'flexibility'}, {'word': '趋于', 'pinyin': 'qū yú', 'trans': 'tend towards'}, {'word': '平稳', 'pinyin': 'píng wěn', 'trans': 'stable'}, {'word': '探讨', 'pinyin': 'tàn tǎo', 'trans': 'discuss'}, {'word': '去噪', 'pinyin': 'qù zào', 'trans': 'denoise'}, {'word': '步骤', 'pinyin': 'bù zhòu', 'trans': 'step'}, {'word': '搜索', 'pinyin': 'sōu suǒ', 'trans': 'search'}, {'word': '采样', 'pinyin': 'cǎi yàng', 'trans': 'sampling'}, {'word': '验证器', 'pinyin': 'yàn zhèng qì', 'trans': 'validator'}, {'word': '反馈', 'pinyin': 'fǎn kuì', 'trans': 'feedback'}, {'word': '算法', 'pinyin': 'suàn fǎ', 'trans': 'algorithm'}, {'word': '候选', 'pinyin': 'hòu xuǎn', 'trans': 'candidate'}, {'word': '基准', 'pinyin': 'jī zhǔn', 'trans': 'benchmark'}, {'word': '复杂性', 'pinyin': 'fù zá xìng', 'trans': 'complexity'}, {'word': '框架', 'pinyin': 'kuàng jià', 'trans': 'framework'}, {'word': '组件', 'pinyin': 'zǔ jiàn', 'trans': 'component'}, {'word': '组合', 'pinyin': 'zǔ hé', 'trans': 'combination'}, {'word': '应用', 'pinyin': 'yìng yòng', 'trans': 'application'}, {'word': '场景', 'pinyin': 'chǎng jǐng', 'trans': 'scenario'}]
-[18.01.2025 12:37] Renaming previous Chinese page.
-[18.01.2025 12:37] Renaming previous data. zh.html to ./d/2025-01-17_zh_reading_task.html
-[18.01.2025 12:37] Writing Chinese reading task.
-[18.01.2025 12:37] Writing result.
-[18.01.2025 12:37] Renaming log file.
-[18.01.2025 12:37] Renaming previous data. log.txt to ./logs/2025-01-18_last_log.txt
+[18.01.2025 12:37] Generating top page (month).
+[18.01.2025 12:37] Writing top page (month).
+[18.01.2025 18:26] Read previous papers.
+[18.01.2025 18:26] Get feed.
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09732
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09751
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09755
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09686
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09484
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09756
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09747
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09038
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09433
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09653
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.08617
+[18.01.2025 18:26] Get page data from previous paper. URL: https://huggingface.co/papers/2501.09503
+[18.01.2025 18:26] Obtaining deleted papers (sometimes HF Daily Papers move some articles from today to past days).
+[18.01.2025 18:26] No deleted papers detected.
+[18.01.2025 18:26] Downloading and parsing papers (pdf, html). Total: 12.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09732.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09732.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09732.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09751.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09751.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09751.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09755.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09755.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09755.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09686.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09686.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09686.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09484.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09484.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09484.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09756.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09756.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09756.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09747.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09747.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09747.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09038.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09038.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09038.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09433.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09433.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09433.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09653.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09653.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09653.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.08617.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.08617.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.08617.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Downloading and parsing paper https://huggingface.co/papers/2501.09503.
+[18.01.2025 18:26] Extra JSON file exists (./assets/json/2501.09503.json), skip PDF parsing.
+[18.01.2025 18:26] Paper image links file exists (./assets/img_data/2501.09503.json), skip HTML parsing.
+[18.01.2025 18:26] Success.
+[18.01.2025 18:26] Enriching papers with extra data.
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 0. Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behav...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 1. Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to l...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 2. Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions abou...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 3. Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by ...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 4. Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant pot...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 5. We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 6. Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how th...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 7. AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achiev...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 8. The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, an...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 9. The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without su...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 10. Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predomin...
+[18.01.2025 18:26] ********************************************************************************
+[18.01.2025 18:26] Abstract 11. Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a ...
+[18.01.2025 18:26] Read previous papers.
+[18.01.2025 18:26] Generating reviews via LLM API.
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#diffusion", "#inference", "#benchmark", "#optimization"], "emoji": "🔍", "ru": {"title": "Повышение качества генерации изображений за счет масштабирования вычислений при выводе", "desc": "Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислен
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#rag", "#story_generation", "#long_context", "#multimodal"], "emoji": "🧠", "ru": {"title": "OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста", "desc": "Статья представляет новый подход к генерации текста с использованием больших языковых моделей, на
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#cv", "#benchmark", "#video", "#optimization", "#architecture", "#diffusion"], "emoji": "🔬", "ru": {"title": "ViTok: Оптимизация визуальной токенизации для генеративных моделей", "desc": "Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделя
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#open_source", "#training", "#rl", "#survey", "#reasoning", "#dataset"], "emoji": "🧠", "ru": {"title": "Путь к большим моделям рассуждений: новый рубеж в ИИ", "desc": "Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваютс
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#data", "#training", "#science", "#open_source", "#healthcare"], "emoji": "🩺", "ru": {"title": "Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ", "desc": "Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#dataset", "#3d", "#inference", "#cv", "#diffusion", "#training", "#synthetic"], "emoji": "💡", "ru": {"title": "SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели", "desc": "SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматрив
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#dataset", "#agents", "#training", "#games", "#optimization", "#robotics"], "emoji": "🤖", "ru": {"title": "Революция в токенизации действий робота: от частотного пространства к универсальности", "desc": "Статья представляет новый метод токенизации действий робота под названием FAST
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#benchmark", "#science", "#video"], "emoji": "🧠", "ru": {"title": "Визуальный реализм не гарантирует понимание физики в ИИ", "desc": "Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моде
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#diffusion", "#3d", "#optimization"], "emoji": "🎨", "ru": {"title": "CaPa: Революция в генерации 3D-моделей", "desc": "В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощь
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#low_resource", "#multilingual", "#open_source", "#data", "#dataset"], "emoji": "🗃️", "ru": {"title": "The Heap: чистый код для честной оценки языковых моделей", "desc": "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор д
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#rlhf", "#alignment", "#training", "#rl"], "emoji": "🔮", "ru": {"title": "Взгляд в будущее для лучшей настройки ИИ", "desc": "Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS испо
+[18.01.2025 18:26] Using data from previous issue: {"categories": ["#cv", "#multimodal"], "emoji": "🎨", "ru": {"title": "AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами", "desc": "Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод
+[18.01.2025 18:26] Loading Chinese text from previous data.
+[18.01.2025 18:26] Renaming data file.
+[18.01.2025 18:26] Renaming previous data. hf_papers.json to ./d/2025-01-17.json
+[18.01.2025 18:26] Saving new data file.
+[18.01.2025 18:26] Generating page.
+[18.01.2025 18:26] Renaming previous page.
+[18.01.2025 18:26] Renaming previous data. index.html to ./d/2025-01-17.html
+[18.01.2025 18:26] [Experimental] Generating Chinese page for reading.
+[18.01.2025 18:26] Chinese vocab [{'word': '生成模型', 'pinyin': 'shēng chéng mó xíng', 'trans': 'generative model'}, {'word': '重大', 'pinyin': 'zhòng dà', 'trans': 'significant'}, {'word': '影响', 'pinyin': 'yǐng xiǎng', 'trans': 'impact'}, {'word': '领域', 'pinyin': 'lǐng yù', 'trans': 'field'}, {'word': '扩展', 'pinyin': 'kuò zhǎn', 'trans': 'expand'}, {'word': '规律', 'pinyin': 'guī lǜ', 'trans': 'pattern'}, {'word': '推理', 'pinyin': 'tuī lǐ', 'trans': 'reasoning'}, {'word': '行为', 'pinyin': 'xíng wéi', 'trans': 'behavior'}, {'word': '揭示', 'pinyin': 'jiē shì', 'trans': 'reveal'}, {'word': '灵活性', 'pinyin': 'líng huó xìng', 'trans': 'flexibility'}, {'word': '趋于', 'pinyin': 'qū yú', 'trans': 'tend towards'}, {'word': '平稳', 'pinyin': 'píng wěn', 'trans': 'stable'}, {'word': '探讨', 'pinyin': 'tàn tǎo', 'trans': 'discuss'}, {'word': '去噪', 'pinyin': 'qù zào', 'trans': 'denoise'}, {'word': '步骤', 'pinyin': 'bù zhòu', 'trans': 'step'}, {'word': '搜索', 'pinyin': 'sōu suǒ', 'trans': 'search'}, {'word': '采样', 'pinyin': 'cǎi yàng', 'trans': 'sampling'}, {'word': '验证器', 'pinyin': 'yàn zhèng qì', 'trans': 'validator'}, {'word': '反馈', 'pinyin': 'fǎn kuì', 'trans': 'feedback'}, {'word': '算法', 'pinyin': 'suàn fǎ', 'trans': 'algorithm'}, {'word': '候选', 'pinyin': 'hòu xuǎn', 'trans': 'candidate'}, {'word': '基准', 'pinyin': 'jī zhǔn', 'trans': 'benchmark'}, {'word': '复杂性', 'pinyin': 'fù zá xìng', 'trans': 'complexity'}, {'word': '框架', 'pinyin': 'kuàng jià', 'trans': 'framework'}, {'word': '组件', 'pinyin': 'zǔ jiàn', 'trans': 'component'}, {'word': '组合', 'pinyin': 'zǔ hé', 'trans': 'combination'}, {'word': '应用', 'pinyin': 'yìng yòng', 'trans': 'application'}, {'word': '场景', 'pinyin': 'chǎng jǐng', 'trans': 'scenario'}]
+[18.01.2025 18:26] Renaming previous Chinese page.
+[18.01.2025 18:26] Renaming previous data. zh.html to ./d/2025-01-17_zh_reading_task.html
+[18.01.2025 18:26] Writing Chinese reading task.
+[18.01.2025 18:26] Writing result.
+[18.01.2025 18:26] Renaming log file.
+[18.01.2025 18:26] Renaming previous data. log.txt to ./logs/2025-01-18_last_log.txt
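The four "Renaming previous data" lines above show the archival step of the pipeline: before a new issue is written, the previous day's data file, page, Chinese reading page, and log are moved into dated locations under ./d/ and ./logs/. The following is a minimal sketch of that rotation, not the repository's actual code; it assumes a Python pipeline, and the archive() helper plus the ISSUE_DATE/RUN_DATE constants are illustrative, with target paths copied from the log.

import os
import shutil

ISSUE_DATE = "2025-01-17"   # date of the issue being archived (data and pages)
RUN_DATE = "2025-01-18"     # date of the current run (log file)

def archive(src: str, dst: str) -> None:
    """Move a previous artifact out of the way before the new one is written."""
    if os.path.exists(src):
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.move(src, dst)
        print(f"Renaming previous data. {src} to {dst}")

# File rotation in the order reported by the log above:
archive("hf_papers.json", f"./d/{ISSUE_DATE}.json")
archive("index.html", f"./d/{ISSUE_DATE}.html")
archive("zh.html", f"./d/{ISSUE_DATE}_zh_reading_task.html")
archive("log.txt", f"./logs/{RUN_DATE}_last_log.txt")

Note that the data and page files are filed under the issue date, while the log is filed under the run date with a "_last_log" suffix, matching the paths recorded in the log lines above.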
diff --git a/m/2025-01.html b/m/2025-01.html
index 40f356b6..d2ff90fc 100644
--- a/m/2025-01.html
+++ b/m/2025-01.html
@@ -881,7 +881,7 @@
}
}
- const articlesData = [{'id': 'https://huggingface.co/papers/2501.02976', 'title': 'STAR: Spatial-Temporal Augmentation with Text-to-Video Models for Real-World Video Super-Resolution', 'url': 'https://huggingface.co/papers/2501.02976', 'abstract': 'Image diffusion models have been adapted for real-world video super-resolution to tackle over-smoothing issues in GAN-based methods. However, these models struggle to maintain temporal consistency, as they are trained on static images, limiting their ability to capture temporal dynamics effectively. Integrating text-to-video (T2V) models into video super-resolution for improved temporal modeling is straightforward. However, two key challenges remain: artifacts introduced by complex degradations in real-world scenarios, and compromised fidelity due to the strong generative capacity of powerful T2V models (e.g., CogVideoX-5B). To enhance the spatio-temporal quality of restored videos, we introduce~\\name (Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution), a novel approach that leverages T2V models for real-world video super-resolution, achieving realistic spatial details and robust temporal consistency. Specifically, we introduce a Local Information Enhancement Module (LIEM) before the global attention block to enrich local details and mitigate degradation artifacts. Moreover, we propose a Dynamic Frequency (DF) Loss to reinforce fidelity, guiding the model to focus on different frequency components across diffusion steps. Extensive experiments demonstrate~\\name~outperforms state-of-the-art methods on both synthetic and real-world datasets.', 'score': 36, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '13ac412646c508f5', 'authors': ['Rui Xie', 'Yinhong Liu', 'Penghao Zhou', 'Chen Zhao', 'Jun Zhou', 'Kai Zhang', 'Zhenyu Zhang', 'Jian Yang', 'Zhenheng Yang', 'Ying Tai'], 'affiliations': ['ByteDance', 'Nanjing University', 'Southwest University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02976.jpg', 'data': {'categories': ['#cv', '#optimization', '#diffusion', '#multimodal', '#video'], 'emoji': '🎥', 'ru': {'title': 'Качественное суперразрешение видео с помощью T2V моделей', 'desc': 'Представлена новая методика STAR для суперразрешения видео в реальных условиях с использованием моделей text-to-video. Предложен модуль LIEM для улучшения локальных деталей и устранения артефактов деградации. Введена функция потерь Dynamic Frequency для усиления точности восстановления на разных частотах. Эксперименты показывают превосходство STAR над современными методами на синтетических и реальных датасетах.'}, 'en': {'title': 'Enhancing Video Quality with T2V Models for Real-World Super-Resolution', 'desc': 'This paper presents a new method called Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution, which aims to improve video quality by addressing issues of over-smoothing and temporal consistency. Traditional image diffusion models struggle with video because they are designed for static images, leading to challenges in capturing motion dynamics. The proposed approach incorporates a Local Information Enhancement Module to enhance local details and reduce artifacts, along with a Dynamic Frequency Loss to maintain fidelity across different frequency components. 
Experimental results show that this method outperforms existing techniques in both synthetic and real-world scenarios, providing better spatial and temporal quality in restored videos.'}, 'zh': {'title': '提升视频超分辨率的时空一致性', 'desc': '本文提出了一种新方法,名为~\\name~,用于提高真实世界视频超分辨率的时空质量。该方法结合了文本到视频(T2V)模型,以解决传统生成对抗网络(GAN)方法中的过平滑问题。通过引入局部信息增强模块(LIEM)和动态频率损失(DF Loss),该方法能够有效改善视频的局部细节和时间一致性。实验结果表明,~\\name~在合成和真实世界数据集上均优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.03226', 'title': 'BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning', 'url': 'https://huggingface.co/papers/2501.03226', 'abstract': "Cutting-edge large language models (LLMs) demonstrate promising performance in solving complex math problems with a divide-and-conquer pipeline and the assistance of in-context learning (ICL) examples. However, their potential for improvement is limited by two critical problems within their ICL examples: granularity-mismatch and the ensuing negative-effect noise problem. Specifically, the LLMs are capable of the dividing process yet mostly failed by inaccurate reasoning within a few conquer steps, while the ICL examples retrieved in question-grained sometimes lack relevant steps for a specific challenging reasoning step. Further, this disconnect may hinder the correct reasoning due to its irrelevance. To this end, we focus on improving the reasoning quality within each step and present BoostStep. BoostStep aligns the granularity between the retrieving and reasoning on step grained, and provides highly related ICL examples for each reasoning step with a novel `first-try' strategy. BoostStep provides more relevant examples than the coarse question-grained strategy, enhancing the model reasoning quality within each step steadily. BoostStep is a general and robust reasoning-enhancing method that not only improves standalone reasoning performance but also integrates seamlessly with Monte Carlo Tree Search methods (MCTS) to refine both candidate generation and decision-making. Quantitatively, it improves GPT-4o and Qwen2.5-Math-72B by 3.6\\% and 2.0\\% respectively on various mathematical benchmarks, and 7.5\\% gain combined with MCTS.", 'score': 21, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '94a01c7d4516c725', 'authors': ['Beichen Zhang', 'Yuhong Liu', 'Xiaoyi Dong', 'Yuhang Zang', 'Pan Zhang', 'Haodong Duan', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03226.jpg', 'data': {'categories': ['#training', '#optimization', '#math', '#reasoning'], 'emoji': '🧮', 'ru': {'title': 'BoostStep: Повышение точности рассуждений ИИ в решении математических задач', 'desc': 'Статья представляет метод BoostStep для улучшения решения сложных математических задач большими языковыми моделями. BoostStep решает проблемы несоответствия детализации и негативного шума в примерах обучения в контексте. Метод выравнивает гранулярность между извлечением и рассуждением на уровне шагов, предоставляя релевантные примеры для каждого шага рассуждения. 
BoostStep повышает качество рассуждений модели и может интегрироваться с методами поиска по дереву Монте-Карло для улучшения генерации кандидатов и принятия решений.'}, 'en': {'title': 'Boosting Reasoning Quality in Large Language Models with BoostStep', 'desc': "This paper introduces BoostStep, a method designed to enhance the reasoning quality of large language models (LLMs) when solving complex math problems. It addresses two main issues: granularity-mismatch and negative-effect noise in in-context learning (ICL) examples, which can lead to inaccurate reasoning. By aligning the granularity of retrieved examples with the specific reasoning steps required, BoostStep provides more relevant ICL examples, improving the model's performance. The method not only boosts standalone reasoning but also integrates effectively with Monte Carlo Tree Search (MCTS) to enhance decision-making processes."}, 'zh': {'title': '提升推理质量的BoostStep方法', 'desc': '这篇论文探讨了大型语言模型(LLMs)在解决复杂数学问题时的表现,特别是通过分而治之的策略和上下文学习(ICL)示例的辅助。研究发现,ICL示例中的粒度不匹配和负面噪声问题限制了模型的改进潜力。为了解决这些问题,论文提出了BoostStep方法,它通过对每个推理步骤的粒度进行对齐,提供更相关的ICL示例,从而提高推理质量。BoostStep不仅提升了独立推理的性能,还能与蒙特卡洛树搜索(MCTS)方法无缝集成,进一步优化候选生成和决策过程。'}}}, {'id': 'https://huggingface.co/papers/2501.03218', 'title': 'Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction', 'url': 'https://huggingface.co/papers/2501.03218', 'abstract': 'Active Real-time interaction with video LLMs introduces a new paradigm for human-computer interaction, where the model not only understands user intent but also responds while continuously processing streaming video on the fly. Unlike offline video LLMs, which analyze the entire video before answering questions, active real-time interaction requires three capabilities: 1) Perception: real-time video monitoring and interaction capturing. 2) Decision: raising proactive interaction in proper situations, 3) Reaction: continuous interaction with users. However, inherent conflicts exist among the desired capabilities. The Decision and Reaction require a contrary Perception scale and grain, and the autoregressive decoding blocks the real-time Perception and Decision during the Reaction. To unify the conflicted capabilities within a harmonious system, we present Dispider, a system that disentangles Perception, Decision, and Reaction. Dispider features a lightweight proactive streaming video processing module that tracks the video stream and identifies optimal moments for interaction. Once the interaction is triggered, an asynchronous interaction module provides detailed responses, while the processing module continues to monitor the video in the meantime. Our disentangled and asynchronous design ensures timely, contextually accurate, and computationally efficient responses, making Dispider ideal for active real-time interaction for long-duration video streams. Experiments show that Dispider not only maintains strong performance in conventional video QA tasks, but also significantly surpasses previous online models in streaming scenario responses, thereby validating the effectiveness of our architecture. 
The code and model are released at https://github.com/Mark12Ding/Dispider.', 'score': 20, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '1e9974be2d206516', 'authors': ['Rui Qian', 'Shuangrui Ding', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03218.jpg', 'data': {'categories': ['#long_context', '#video', '#optimization', '#architecture', '#interpretability'], 'emoji': '🎥', 'ru': {'title': 'Dispider: Интеллектуальное взаимодействие с видео в реальном времени', 'desc': 'Статья представляет систему Dispider для активного взаимодействия с видео в реальном времени с использованием языковых моделей. Система разделяет процессы восприятия, принятия решений и реакции, что позволяет эффективно обрабатывать потоковое видео и взаимодействовать с пользователем. Dispider использует легковесный модуль обработки видео для отслеживания потока и определения оптимальных моментов для взаимодействия. Асинхронная архитектура обеспечивает своевременные и точные ответы при длительной обработке видеопотоков.'}, 'en': {'title': 'Dispider: Real-time Interaction Redefined for Video LLMs', 'desc': 'This paper introduces Dispider, a system designed for active real-time interaction with video using large language models (LLMs). Unlike traditional offline models, Dispider can process video streams continuously while engaging with users, requiring three key capabilities: Perception, Decision, and Reaction. The system addresses conflicts between these capabilities by disentangling them, allowing for efficient monitoring and interaction without lag. Experimental results demonstrate that Dispider outperforms previous models in streaming scenarios, providing timely and contextually relevant responses during long-duration video interactions.'}, 'zh': {'title': '主动实时交互的新范式', 'desc': '本论文介绍了一种名为Dispider的系统,旨在实现视频大语言模型的主动实时交互。该系统通过分离感知、决策和反应三个能力,解决了实时交互中的固有冲突。Dispider具备轻量级的流媒体处理模块,能够实时监控视频流并识别最佳交互时机。实验结果表明,Dispider在传统视频问答任务中表现优异,并在流媒体场景响应上显著超越了之前的在线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.02157', 'title': 'Personalized Graph-Based Retrieval for Large Language Models', 'url': 'https://huggingface.co/papers/2501.02157', 'abstract': 'As large language models (LLMs) evolve, their ability to deliver personalized and context-aware responses offers transformative potential for improving user experiences. Existing personalization approaches, however, often rely solely on user history to augment the prompt, limiting their effectiveness in generating tailored outputs, especially in cold-start scenarios with sparse data. To address these limitations, we propose Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG), a framework that leverages user-centric knowledge graphs to enrich personalization. By directly integrating structured user knowledge into the retrieval process and augmenting prompts with user-relevant context, PGraphRAG enhances contextual understanding and output quality. We also introduce the Personalized Graph-based Benchmark for Text Generation, designed to evaluate personalized text generation tasks in real-world settings where user history is sparse or unavailable. 
Experimental results show that PGraphRAG significantly outperforms state-of-the-art personalization methods across diverse tasks, demonstrating the unique advantages of graph-based retrieval for personalization.', 'score': 16, 'issue_id': 1527, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '65e3736cfc1e3295', 'authors': ['Steven Au', 'Cameron J. Dimacali', 'Ojasmitha Pedirappagari', 'Namyong Park', 'Franck Dernoncourt', 'Yu Wang', 'Nikos Kanakaris', 'Hanieh Deilamsalehy', 'Ryan A. Rossi', 'Nesreen K. Ahmed'], 'affiliations': ['Adobe Research', 'Cisco AI Research', 'Meta AI', 'University of California Santa Cruz', 'University of Oregon', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02157.jpg', 'data': {'categories': ['#rag', '#optimization', '#graphs', '#multimodal', '#benchmark', '#games'], 'emoji': '🕸️', 'ru': {'title': 'Графы знаний на службе персонализации языковых моделей', 'desc': 'Статья представляет новый подход к персонализации ответов больших языковых моделей (LLM) под названием PGraphRAG. В отличие от существующих методов, полагающихся на историю пользователя, PGraphRAG использует ориентированные на пользователя графы знаний для обогащения контекста. Этот метод улучшает понимание контекста и качество генерируемых ответов, особенно в сценариях с ограниченными данными о пользователе. Экспериментальные результаты показывают, что PGraphRAG превосходит современные методы персонализации в различных задачах.'}, 'en': {'title': 'Revolutionizing Personalization with Graph-based Retrieval', 'desc': "This paper introduces a new framework called Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG) that enhances the personalization of large language models (LLMs). Unlike traditional methods that depend only on user history, PGraphRAG utilizes user-centric knowledge graphs to provide richer context for generating responses. By integrating structured user information into the retrieval process, it improves the model's understanding and the quality of its outputs, especially in situations where user data is limited. The authors also present a benchmark for evaluating personalized text generation, showing that PGraphRAG outperforms existing methods in various tasks."}, 'zh': {'title': '个性化图谱提升生成质量', 'desc': '随着大型语言模型的发展,它们在提供个性化和上下文感知的响应方面展现出巨大的潜力。现有的个性化方法通常仅依赖用户历史数据来增强提示,这在数据稀疏的冷启动场景中效果有限。为了解决这些问题,我们提出了个性化图谱检索增强生成(PGraphRAG)框架,利用以用户为中心的知识图谱来丰富个性化。实验结果表明,PGraphRAG在多种任务中显著优于现有的个性化方法,展示了基于图谱的检索在个性化中的独特优势。'}}}, {'id': 'https://huggingface.co/papers/2501.02497', 'title': 'Test-time Computing: from System-1 Thinking to System-2 Thinking', 'url': 'https://huggingface.co/papers/2501.02497', 'abstract': "The remarkable performance of the o1 model in complex reasoning demonstrates that test-time computing scaling can further unlock the model's potential, enabling powerful System-2 thinking. However, there is still a lack of comprehensive surveys for test-time computing scaling. We trace the concept of test-time computing back to System-1 models. In System-1 models, test-time computing addresses distribution shifts and improves robustness and generalization through parameter updating, input modification, representation editing, and output calibration. In System-2 models, it enhances the model's reasoning ability to solve complex problems through repeated sampling, self-correction, and tree search. 
We organize this survey according to the trend of System-1 to System-2 thinking, highlighting the key role of test-time computing in the transition from System-1 models to weak System-2 models, and then to strong System-2 models. We also point out a few possible future directions.", 'score': 15, 'issue_id': 1528, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '7d9414c60fe7701d', 'authors': ['Yixin Ji', 'Juntao Li', 'Hai Ye', 'Kaixin Wu', 'Jia Xu', 'Linjian Mo', 'Min Zhang'], 'affiliations': ['Ant Group', 'Department of Computer Science, National University of Singapore', 'School of Computer Science and Technology, Soochow University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02497.jpg', 'data': {'categories': ['#reasoning', '#math', '#survey', '#training'], 'emoji': '🧠', 'ru': {'title': 'Масштабирование вычислений: путь к мышлению System-2', 'desc': 'Эта статья рассматривает масштабирование вычислений во время тестирования для улучшения производительности моделей машинного обучения. Авторы прослеживают эволюцию этой концепции от моделей System-1 до моделей System-2. В работе описываются различные методы, такие как обновление параметров, модификация входных данных и древовидный поиск. Исследование подчеркивает ключевую роль вычислений во время тестирования в переходе от моделей System-1 к сильным моделям System-2.'}, 'en': {'title': 'Unlocking Model Potential: The Power of Test-Time Computing', 'desc': 'This paper explores the concept of test-time computing scaling and its impact on machine learning models, particularly in enhancing reasoning capabilities. It distinguishes between System-1 models, which focus on improving robustness and generalization through techniques like parameter updating and output calibration, and System-2 models, which utilize methods such as repeated sampling and self-correction for complex problem-solving. The authors trace the evolution from System-1 to System-2 thinking, emphasizing how test-time computing plays a crucial role in this transition. Additionally, the paper identifies potential future research directions in this area.'}, 'zh': {'title': '测试时计算:从系统-1到强系统-2的关键转变', 'desc': '这篇论文探讨了测试时计算扩展对机器学习模型的影响,特别是在复杂推理中的应用。作者指出,测试时计算可以通过参数更新、输入修改、表示编辑和输出校准来提高模型的鲁棒性和泛化能力。对于系统-2模型,测试时计算通过重复采样、自我修正和树搜索来增强模型的推理能力。论文还强调了测试时计算在从系统-1模型向弱系统-2模型再到强系统-2模型转变中的关键作用,并提出了一些未来的研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.02045', 'title': 'METAGENE-1: Metagenomic Foundation Model for Pandemic Monitoring', 'url': 'https://huggingface.co/papers/2501.02045', 'abstract': 'We pretrain METAGENE-1, a 7-billion-parameter autoregressive transformer model, which we refer to as a metagenomic foundation model, on a novel corpus of diverse metagenomic DNA and RNA sequences comprising over 1.5 trillion base pairs. This dataset is sourced from a large collection of human wastewater samples, processed and sequenced using deep metagenomic (next-generation) sequencing methods. Unlike genomic models that focus on individual genomes or curated sets of specific species, the aim of METAGENE-1 is to capture the full distribution of genomic information present within this wastewater, to aid in tasks relevant to pandemic monitoring and pathogen detection. We carry out byte-pair encoding (BPE) tokenization on our dataset, tailored for metagenomic sequences, and then pretrain our model. 
In this paper, we first detail the pretraining dataset, tokenization strategy, and model architecture, highlighting the considerations and design choices that enable the effective modeling of metagenomic data. We then show results of pretraining this model on our metagenomic dataset, providing details about our losses, system metrics, and training stability over the course of pretraining. Finally, we demonstrate the performance of METAGENE-1, which achieves state-of-the-art results on a set of genomic benchmarks and new evaluations focused on human-pathogen detection and genomic sequence embedding, showcasing its potential for public health applications in pandemic monitoring, biosurveillance, and early detection of emerging health threats.', 'score': 12, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '60a3568f555ed60f', 'authors': ['Ollie Liu', 'Sami Jaghouar', 'Johannes Hagemann', 'Shangshang Wang', 'Jason Wiemels', 'Jeff Kaufman', 'Willie Neiswanger'], 'affiliations': ['Nucleic Acid Observatory', 'Prime Intellect', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02045.jpg', 'data': {'categories': ['#benchmark', '#data', '#training', '#architecture', '#science', '#dataset', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'METAGENE-1: Метагеномная модель для мониторинга здоровья населения', 'desc': 'METAGENE-1 - это автореграссивная трансформерная модель с 7 миллиардами параметров, обученная на разнообразных метагеномных последовательностях ДНК и РНК. Модель создана для анализа геномной информации из образцов сточных вод с целью мониторинга пандемий и обнаружения патогенов. Авторы описывают процесс предобучения, включая токенизацию и архитектуру модели, а также демонстрируют результаты на различных геномных задачах. METAGENE-1 показывает высокую эффективность в обнаружении патогенов человека и встраивании геномных последовательностей, что открывает перспективы для применения в общественном здравоохранении.'}, 'en': {'title': 'Unlocking Metagenomics: METAGENE-1 for Pandemic Preparedness', 'desc': 'The paper introduces METAGENE-1, a large autoregressive transformer model designed for metagenomic data analysis. It is pretrained on a vast dataset of metagenomic DNA and RNA sequences derived from human wastewater, totaling over 1.5 trillion base pairs. The model aims to enhance pandemic monitoring and pathogen detection by capturing the diverse genomic information present in wastewater samples. The authors detail their tokenization strategy and model architecture, demonstrating that METAGENE-1 achieves state-of-the-art performance in genomic benchmarks and applications related to public health.'}, 'zh': {'title': 'METAGENE-1:元基因组基础模型助力公共卫生监测', 'desc': '我们预训练了METAGENE-1,这是一个拥有70亿参数的自回归变换器模型,称为元基因组基础模型。该模型在一个包含超过1.5万亿碱基对的多样化元基因组DNA和RNA序列的新数据集上进行训练,这些数据来自大量人类废水样本。METAGENE-1的目标是捕捉废水中存在的基因组信息的完整分布,以帮助进行疫情监测和病原体检测。我们展示了该模型在元基因组数据集上的预训练结果,证明其在公共卫生应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.02690', 'title': 'GS-DiT: Advancing Video Generation with Pseudo 4D Gaussian Fields through Efficient Dense 3D Point Tracking', 'url': 'https://huggingface.co/papers/2501.02690', 'abstract': '4D video control is essential in video generation as it enables the use of sophisticated lens techniques, such as multi-camera shooting and dolly zoom, which are currently unsupported by existing methods. 
Training a video Diffusion Transformer (DiT) directly to control 4D content requires expensive multi-view videos. Inspired by Monocular Dynamic novel View Synthesis (MDVS) that optimizes a 4D representation and renders videos according to different 4D elements, such as camera pose and object motion editing, we bring pseudo 4D Gaussian fields to video generation. Specifically, we propose a novel framework that constructs a pseudo 4D Gaussian field with dense 3D point tracking and renders the Gaussian field for all video frames. Then we finetune a pretrained DiT to generate videos following the guidance of the rendered video, dubbed as GS-DiT. To boost the training of the GS-DiT, we also propose an efficient Dense 3D Point Tracking (D3D-PT) method for the pseudo 4D Gaussian field construction. Our D3D-PT outperforms SpatialTracker, the state-of-the-art sparse 3D point tracking method, in accuracy and accelerates the inference speed by two orders of magnitude. During the inference stage, GS-DiT can generate videos with the same dynamic content while adhering to different camera parameters, addressing a significant limitation of current video generation models. GS-DiT demonstrates strong generalization capabilities and extends the 4D controllability of Gaussian splatting to video generation beyond just camera poses. It supports advanced cinematic effects through the manipulation of the Gaussian field and camera intrinsics, making it a powerful tool for creative video production. Demos are available at https://wkbian.github.io/Projects/GS-DiT/.', 'score': 11, 'issue_id': 1530, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'b4c147a2637166a8', 'authors': ['Weikang Bian', 'Zhaoyang Huang', 'Xiaoyu Shi', 'Yijin Li', 'Fu-Yun Wang', 'Hongsheng Li'], 'affiliations': ['Avolution AI', 'Centre for Perceptual and Interactive Intelligence', 'Multimedia Laboratory, The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.02690.jpg', 'data': {'categories': ['#video', '#games', '#diffusion', '#3d'], 'emoji': '🎥', 'ru': {'title': 'Революция в генерации видео: 4D-контроль с помощью гауссовых полей', 'desc': 'Эта статья представляет инновационный подход к генерации видео с 4D-контролем, используя псевдо-4D гауссовы поля и модель Diffusion Transformer (DiT). Авторы предлагают метод Dense 3D Point Tracking (D3D-PT) для эффективного построения гауссовых полей, превосходящий существующие решения по точности и скорости. Разработанная система GS-DiT позволяет генерировать видео с одинаковым динамическим содержанием, но с разными параметрами камеры, что открывает новые возможности для создания кинематографических эффектов. Метод демонстрирует сильные обобщающие способности и расширяет возможности 4D-контроля в генерации видео.'}, 'en': {'title': 'Revolutionizing Video Generation with 4D Control', 'desc': 'This paper introduces a new method for generating videos that can be controlled in four dimensions (4D), which includes both camera movement and object motion. The authors propose a framework called GS-DiT that utilizes pseudo 4D Gaussian fields to enhance video generation, allowing for advanced cinematic effects. They also present a Dense 3D Point Tracking (D3D-PT) technique that improves the accuracy and speed of tracking 3D points compared to existing methods. 
Overall, GS-DiT enables the creation of dynamic videos with flexible camera parameters, significantly advancing the capabilities of video generation models.'}, 'zh': {'title': '伪4D高斯场:视频生成的新突破', 'desc': '本论文提出了一种新颖的框架,利用伪4D高斯场进行视频生成,以支持复杂的镜头技术。我们通过密集的3D点跟踪构建伪4D高斯场,并为所有视频帧渲染该高斯场。为了提升GS-DiT的训练效果,我们还提出了一种高效的密集3D点跟踪方法,显著提高了准确性和推理速度。GS-DiT能够在不同的相机参数下生成具有相同动态内容的视频,扩展了视频生成的4D可控性,成为创意视频制作的强大工具。'}}}, {'id': 'https://huggingface.co/papers/2501.03059', 'title': 'Through-The-Mask: Mask-based Motion Trajectories for Image-to-Video Generation', 'url': 'https://huggingface.co/papers/2501.03059', 'abstract': "We consider the task of Image-to-Video (I2V) generation, which involves transforming static images into realistic video sequences based on a textual description. While recent advancements produce photorealistic outputs, they frequently struggle to create videos with accurate and consistent object motion, especially in multi-object scenarios. To address these limitations, we propose a two-stage compositional framework that decomposes I2V generation into: (i) An explicit intermediate representation generation stage, followed by (ii) A video generation stage that is conditioned on this representation. Our key innovation is the introduction of a mask-based motion trajectory as an intermediate representation, that captures both semantic object information and motion, enabling an expressive but compact representation of motion and semantics. To incorporate the learned representation in the second stage, we utilize object-level attention objectives. Specifically, we consider a spatial, per-object, masked-cross attention objective, integrating object-specific prompts into corresponding latent space regions and a masked spatio-temporal self-attention objective, ensuring frame-to-frame consistency for each object. We evaluate our method on challenging benchmarks with multi-object and high-motion scenarios and empirically demonstrate that the proposed method achieves state-of-the-art results in temporal coherence, motion realism, and text-prompt faithfulness. Additionally, we introduce \\benchmark, a new challenging benchmark for single-object and multi-object I2V generation, and demonstrate our method's superiority on this benchmark. Project page is available at https://guyyariv.github.io/TTM/.", 'score': 10, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '4f24667b663efb7d', 'authors': ['Guy Yariv', 'Yuval Kirstain', 'Amit Zohar', 'Shelly Sheynin', 'Yaniv Taigman', 'Yossi Adi', 'Sagie Benaim', 'Adam Polyak'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'The Hebrew University of Jerusalem'], 'pdf_title_img': 'assets/pdf/title_img/2501.03059.jpg', 'data': {'categories': ['#video', '#multimodal', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'Генерация реалистичных видео из статичных изображений с помощью масок траекторий движения', 'desc': 'Статья представляет новый подход к генерации видео из изображений (I2V) на основе текстового описания. Авторы предлагают двухэтапную композиционную модель, которая сначала генерирует промежуточное представление в виде маски траектории движения объектов. Затем это представление используется для генерации видео с применением объектно-ориентированных целевых функций внимания. 
Эксперименты показывают, что предложенный метод достигает лучших результатов по временной согласованности, реалистичности движения и соответствию текстовому описанию.'}, 'en': {'title': 'Transforming Images into Realistic Videos with Motion Precision', 'desc': 'This paper addresses the challenge of generating videos from static images using textual descriptions, known as Image-to-Video (I2V) generation. The authors propose a two-stage framework that first creates an intermediate representation to capture object semantics and motion, followed by a video generation stage that utilizes this representation. A key innovation is the use of a mask-based motion trajectory, which helps maintain accurate object motion and consistency across frames. The method is evaluated against challenging benchmarks and shows superior performance in terms of motion realism and coherence, while also introducing a new benchmark for I2V generation.'}, 'zh': {'title': '图像到视频生成的新突破', 'desc': '本文探讨了图像到视频(I2V)生成的任务,即根据文本描述将静态图像转换为逼真的视频序列。尽管近期的进展能够生成照片级真实感的输出,但在多物体场景中,视频的物体运动准确性和一致性仍然存在挑战。为了解决这些问题,我们提出了一种两阶段的组合框架,首先生成明确的中间表示,然后基于该表示生成视频。我们的创新在于引入了一种基于掩码的运动轨迹作为中间表示,能够捕捉语义物体信息和运动,从而实现运动和语义的紧凑而富有表现力的表示。'}}}, {'id': 'https://huggingface.co/papers/2501.03006', 'title': 'TransPixar: Advancing Text-to-Video Generation with Transparency', 'url': 'https://huggingface.co/papers/2501.03006', 'abstract': 'Text-to-video generative models have made significant strides, enabling diverse applications in entertainment, advertising, and education. However, generating RGBA video, which includes alpha channels for transparency, remains a challenge due to limited datasets and the difficulty of adapting existing models. Alpha channels are crucial for visual effects (VFX), allowing transparent elements like smoke and reflections to blend seamlessly into scenes. We introduce TransPixar, a method to extend pretrained video models for RGBA generation while retaining the original RGB capabilities. TransPixar leverages a diffusion transformer (DiT) architecture, incorporating alpha-specific tokens and using LoRA-based fine-tuning to jointly generate RGB and alpha channels with high consistency. By optimizing attention mechanisms, TransPixar preserves the strengths of the original RGB model and achieves strong alignment between RGB and alpha channels despite limited training data. Our approach effectively generates diverse and consistent RGBA videos, advancing the possibilities for VFX and interactive content creation.', 'score': 8, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'e85e5fa9a03d5d04', 'authors': ['Luozhou Wang', 'Yijun Li', 'Zhifei Chen', 'Jui-Hsien Wang', 'Zhifei Zhang', 'He Zhang', 'Zhe Lin', 'Yingcong Chen'], 'affiliations': ['Adobe Research', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.03006.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training', '#diffusion', '#video'], 'emoji': '🎬', 'ru': {'title': 'TransPixar: Прорыв в генерации RGBA-видео для визуальных эффектов', 'desc': 'TransPixar - это новый метод генерации RGBA-видео, расширяющий возможности предобученных видеомоделей. Он использует архитектуру диффузионного трансформера (DiT) и токены, специфичные для альфа-канала, для совместной генерации RGB и альфа-каналов с высокой согласованностью. Метод применяет тонкую настройку на основе LoRA и оптимизирует механизмы внимания для сохранения сильных сторон исходной RGB-модели. 
TransPixar эффективно генерирует разнообразные и согласованные RGBA-видео, открывая новые возможности для создания визуальных эффектов и интерактивного контента.'}, 'en': {'title': 'TransPixar: Bridging RGB and Alpha for Enhanced Video Generation', 'desc': 'This paper presents TransPixar, a novel method for generating RGBA videos, which include transparency information crucial for visual effects. The challenge lies in the limited datasets and the need to adapt existing models to handle alpha channels effectively. TransPixar utilizes a diffusion transformer architecture and incorporates alpha-specific tokens, allowing it to generate both RGB and alpha channels simultaneously. By optimizing attention mechanisms and employing LoRA-based fine-tuning, TransPixar achieves high consistency between RGB and alpha outputs, enhancing the quality of video generation for applications in VFX and interactive media.'}, 'zh': {'title': 'TransPixar:生成高质量RGBA视频的新方法', 'desc': '本文介绍了一种名为TransPixar的方法,旨在生成包含透明通道的RGBA视频。传统的视频生成模型在处理透明效果时面临挑战,TransPixar通过扩展预训练模型来解决这一问题。该方法利用扩散变换器架构,结合特定的透明通道标记,并通过LoRA微调实现RGB和透明通道的高一致性生成。最终,TransPixar在有限的数据集上优化了注意力机制,成功生成多样且一致的RGBA视频,推动了视觉特效和互动内容创作的可能性。'}}}, {'id': 'https://huggingface.co/papers/2501.01790', 'title': 'Ingredients: Blending Custom Photos with Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.01790', 'abstract': 'This paper presents a powerful framework to customize video creations by incorporating multiple specific identity (ID) photos, with video diffusion Transformers, referred to as Ingredients. Generally, our method consists of three primary modules: (i) a facial extractor that captures versatile and precise facial features for each human ID from both global and local perspectives; (ii) a multi-scale projector that maps face embeddings into the contextual space of image query in video diffusion transformers; (iii) an ID router that dynamically combines and allocates multiple ID embedding to the corresponding space-time regions. Leveraging a meticulously curated text-video dataset and a multi-stage training protocol, Ingredients demonstrates superior performance in turning custom photos into dynamic and personalized video content. Qualitative evaluations highlight the advantages of proposed method, positioning it as a significant advancement toward more effective generative video control tools in Transformer-based architecture, compared to existing methods. The data, code, and model weights are publicly available at: https://github.com/feizc/Ingredients.', 'score': 6, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'dd1ccebdd2fcf276', 'authors': ['Zhengcong Fei', 'Debang Li', 'Di Qiu', 'Changqian Yu', 'Mingyuan Fan'], 'affiliations': ['Kunlun Inc. Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01790.jpg', 'data': {'categories': ['#open_source', '#training', '#architecture', '#video', '#dataset', '#diffusion', '#multimodal'], 'emoji': '🎬', 'ru': {'title': 'Персонализированное видео из фотографий: новый уровень контроля в генеративных моделях', 'desc': 'Статья представляет новый метод под названием Ingredients для создания персонализированных видео с использованием нескольких фотографий конкретных людей. Метод состоит из трех основных модулей: экстрактора лицевых признаков, многомасштабного проектора и маршрутизатора идентификаторов. 
Ingredients использует тщательно подобранный набор данных текст-видео и многоэтапный протокол обучения для достижения превосходных результатов. Качественная оценка показывает преимущества предложенного метода по сравнению с существующими подходами в области генеративного контроля видео на основе архитектуры Transformer.'}, 'en': {'title': 'Transforming Photos into Personalized Videos with Ingredients', 'desc': 'This paper introduces a novel framework called Ingredients for creating personalized videos using multiple identity photos. It employs a facial extractor to accurately capture facial features, a multi-scale projector to integrate these features into video diffusion transformers, and an ID router to manage the allocation of identity embeddings across different time and space regions in the video. The framework is trained on a carefully selected text-video dataset, enhancing its ability to generate dynamic video content from custom images. The results show that Ingredients outperforms existing methods, marking a significant step forward in generative video control using Transformer architectures.'}, 'zh': {'title': '个性化视频创作的新突破', 'desc': '本文提出了一种强大的框架,通过结合多个特定身份照片,定制视频创作,称为Ingredients。该方法主要由三个模块组成:面部提取器、多个尺度投影器和身份路由器,分别用于提取面部特征、映射面部嵌入和动态分配身份嵌入。通过精心策划的文本-视频数据集和多阶段训练协议,Ingredients在将自定义照片转化为动态个性化视频内容方面表现出色。定性评估显示,该方法在基于Transformer的架构中,相较于现有方法,显著提升了生成视频控制工具的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.02576', 'title': 'DepthMaster: Taming Diffusion Models for Monocular Depth Estimation', 'url': 'https://huggingface.co/papers/2501.02576', 'abstract': "Monocular depth estimation within the diffusion-denoising paradigm demonstrates impressive generalization ability but suffers from low inference speed. Recent methods adopt a single-step deterministic paradigm to improve inference efficiency while maintaining comparable performance. However, they overlook the gap between generative and discriminative features, leading to suboptimal results. In this work, we propose DepthMaster, a single-step diffusion model designed to adapt generative features for the discriminative depth estimation task. First, to mitigate overfitting to texture details introduced by generative features, we propose a Feature Alignment module, which incorporates high-quality semantic features to enhance the denoising network's representation capability. Second, to address the lack of fine-grained details in the single-step deterministic framework, we propose a Fourier Enhancement module to adaptively balance low-frequency structure and high-frequency details. We adopt a two-stage training strategy to fully leverage the potential of the two modules. In the first stage, we focus on learning the global scene structure with the Feature Alignment module, while in the second stage, we exploit the Fourier Enhancement module to improve the visual quality. Through these efforts, our model achieves state-of-the-art performance in terms of generalization and detail preservation, outperforming other diffusion-based methods across various datasets. 
Our project page can be found at https://indu1ge.github.io/DepthMaster_page.", 'score': 5, 'issue_id': 1536, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'a8429b95ef4eb7b7', 'authors': ['Ziyang Song', 'Zerong Wang', 'Bo Li', 'Hao Zhang', 'Ruijie Zhu', 'Li Liu', 'Peng-Tao Jiang', 'Tianzhu Zhang'], 'affiliations': ['School of Information Science and Technology, University of Science and Technology of China (USTC), Hefei 230026, P.R.China', 'vivo Mobile Communication Co., Ltd., Hangzhou 310030, P.R.China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02576.jpg', 'data': {'categories': ['#optimization', '#training', '#diffusion', '#cv', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'DepthMaster: Однопроходная диффузионная модель для точной оценки глубины с улучшенной генерализацией', 'desc': 'DepthMaster - это однопроходная диффузионная модель для монокулярной оценки глубины. Она использует модуль выравнивания признаков для улучшения представления семантических особенностей и модуль улучшения Фурье для балансировки низкочастотной структуры и высокочастотных деталей. Модель обучается в два этапа: сначала фокусируется на глобальной структуре сцены, затем улучшает визуальное качество. DepthMaster превосходит другие диффузионные методы по обобщающей способности и сохранению деталей на различных наборах данных.'}, 'en': {'title': 'DepthMaster: Bridging Generative and Discriminative Depth Estimation', 'desc': 'This paper introduces DepthMaster, a single-step diffusion model aimed at improving monocular depth estimation. It addresses the inefficiencies of previous methods by integrating a Feature Alignment module to enhance the representation of semantic features and reduce overfitting to textures. Additionally, a Fourier Enhancement module is proposed to balance low-frequency structures with high-frequency details, ensuring finer depth estimation. The two-stage training strategy allows the model to first learn global scene structures and then refine visual quality, resulting in state-of-the-art performance across various datasets.'}, 'zh': {'title': 'DepthMaster:提升深度估计的单步扩散模型', 'desc': '本文提出了一种名为DepthMaster的单步扩散模型,用于单目深度估计。该模型通过特征对齐模块和傅里叶增强模块,优化生成特征以适应判别性深度估计任务。特征对齐模块增强了去噪网络的表示能力,而傅里叶增强模块则平衡了低频结构和高频细节。通过两阶段训练策略,DepthMaster在泛化能力和细节保留方面达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.01830', 'title': 'Auto-RT: Automatic Jailbreak Strategy Exploration for Red-Teaming Large Language Models', 'url': 'https://huggingface.co/papers/2501.01830', 'abstract': 'Automated red-teaming has become a crucial approach for uncovering vulnerabilities in large language models (LLMs). However, most existing methods focus on isolated safety flaws, limiting their ability to adapt to dynamic defenses and uncover complex vulnerabilities efficiently. To address this challenge, we propose Auto-RT, a reinforcement learning framework that automatically explores and optimizes complex attack strategies to effectively uncover security vulnerabilities through malicious queries. Specifically, we introduce two key mechanisms to reduce exploration complexity and improve strategy optimization: 1) Early-terminated Exploration, which accelerate exploration by focusing on high-potential attack strategies; and 2) Progressive Reward Tracking algorithm with intermediate downgrade models, which dynamically refine the search trajectory toward successful vulnerability exploitation. 
Extensive experiments across diverse LLMs demonstrate that, by significantly improving exploration efficiency and automatically optimizing attack strategies, Auto-RT detects a boarder range of vulnerabilities, achieving a faster detection speed and 16.63\\% higher success rates compared to existing methods.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '5b08b81c52ec8da8', 'authors': ['Yanjiang Liu', 'Shuhen Zhou', 'Yaojie Lu', 'Huijia Zhu', 'Weiqiang Wang', 'Hongyu Lin', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Ant Group', 'Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences, Beijing, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01830.jpg', 'data': {'categories': ['#security', '#rl', '#rlhf'], 'emoji': '🛡️', 'ru': {'title': 'Auto-RT: Умная защита больших языковых моделей', 'desc': 'Авторы представляют Auto-RT - фреймворк на основе обучения с подкреплением для автоматизированного поиска уязвимостей в больших языковых моделях (LLM). Система использует механизмы раннего прекращения исследования и прогрессивного отслеживания наград для оптимизации стратегий атак. Auto-RT превосходит существующие методы, обнаруживая более широкий спектр уязвимостей с большей скоростью и на 16.63% более высоким уровнем успеха. Этот подход позволяет эффективно выявлять сложные уязвимости в LLM через вредоносные запросы.'}, 'en': {'title': 'Auto-RT: Revolutionizing Vulnerability Detection in LLMs', 'desc': 'This paper presents Auto-RT, a reinforcement learning framework designed to enhance automated red-teaming for large language models (LLMs). Unlike traditional methods that target isolated safety flaws, Auto-RT efficiently uncovers complex vulnerabilities by optimizing attack strategies through malicious queries. It introduces two innovative mechanisms: Early-terminated Exploration to prioritize promising attack strategies, and Progressive Reward Tracking to refine the search process dynamically. Experimental results show that Auto-RT significantly improves exploration efficiency and detection success rates, outperforming existing approaches.'}, 'zh': {'title': '自动化红队:高效发现语言模型漏洞的利器', 'desc': '自动化红队技术在发现大型语言模型(LLMs)中的漏洞方面变得至关重要。现有方法大多集中于孤立的安全缺陷,限制了其适应动态防御和高效发现复杂漏洞的能力。为了解决这个问题,我们提出了Auto-RT,一个强化学习框架,能够自动探索和优化复杂的攻击策略,通过恶意查询有效发现安全漏洞。我们的实验表明,Auto-RT显著提高了探索效率和攻击策略的自动优化,检测到更广泛的漏洞,检测速度更快,成功率提高了16.63%。'}}}, {'id': 'https://huggingface.co/papers/2501.02506', 'title': 'ToolHop: A Query-Driven Benchmark for Evaluating Large Language Models in Multi-Hop Tool Use', 'url': 'https://huggingface.co/papers/2501.02506', 'abstract': 'Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of multi-hop tool use. ToolHop ensures diverse queries, meaningful interdependencies, locally executable tools, detailed feedback, and verifiable answers through a novel query-driven data construction approach that includes tool creation, document refinement, and code generation. 
We evaluate 14 LLMs across five model families (i.e., LLaMA3.1, Qwen2.5, Gemini1.5, Claude3.5, and GPT), uncovering significant challenges in handling multi-hop tool-use scenarios. The leading model, GPT-4o, achieves an accuracy of 49.04%, underscoring substantial room for improvement. Further analysis reveals variations in tool-use strategies for various families, offering actionable insights to guide the development of more effective approaches. Code and data can be found in https://huggingface.co/bytedance-research/ToolHop.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'f785173226e5f9fc', 'authors': ['Junjie Ye', 'Zhengyin Du', 'Xuesong Yao', 'Weijian Lin', 'Yufei Xu', 'Zehui Chen', 'Zaiyuan Wang', 'Sining Zhu', 'Zhiheng Xi', 'Siyu Yuan', 'Tao Gui', 'Qi Zhang', 'Xuanjing Huang', 'Jiechao Chen'], 'affiliations': ['ByteDance', 'Institute of Modern Languages and Linguistics, Fudan University', 'School of Computer Science, Fudan University', 'School of Data Science, Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02506.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#dataset', '#optimization'], 'emoji': '🛠️', 'ru': {'title': 'ToolHop: новый стандарт для оценки многоэтапного использования инструментов в LLM', 'desc': 'Статья представляет новый набор данных ToolHop для оценки многоэтапного использования инструментов большими языковыми моделями (LLM). ToolHop содержит 995 пользовательских запросов и 3912 связанных инструментов, обеспечивая разнообразие запросов, взаимозависимости и возможность локального выполнения. Авторы оценили 14 LLM из пяти семейств моделей, выявив значительные трудности в обработке сценариев многоэтапного использования инструментов. Лучшая модель, GPT-4o, достигла точности 49.04%, что указывает на большой потенциал для улучшения.'}, 'en': {'title': 'ToolHop: Advancing Multi-Hop Tool Use Evaluation for LLMs', 'desc': 'This paper introduces ToolHop, a new dataset designed to evaluate how well large language models (LLMs) can use multiple tools in a single task. It includes 995 user queries and 3,912 tools, focusing on diverse and interdependent queries that can be executed locally. The authors tested 14 different LLMs, revealing that even the best-performing model, GPT-4o, only achieved 49.04% accuracy, indicating significant challenges in multi-hop tool use. The findings highlight different strategies employed by various model families, providing insights for future improvements in LLM capabilities.'}, 'zh': {'title': 'ToolHop:多跳工具使用的有效评估数据集', 'desc': '本文介绍了ToolHop数据集,该数据集包含995个用户查询和3912个相关工具,旨在有效评估大型语言模型(LLMs)在多跳工具使用中的理解、推理和功能调用能力。通过新颖的查询驱动数据构建方法,ToolHop确保了查询的多样性、工具的局部可执行性和可验证的答案。我们对14个不同模型(如LLaMA3.1、Qwen2.5等)进行了评估,发现它们在处理多跳工具使用场景时面临显著挑战。尽管GPT-4o模型的准确率为49.04%,但仍有很大的改进空间,分析还揭示了不同模型家族在工具使用策略上的差异,为未来的研究提供了有价值的见解。'}}}, {'id': 'https://huggingface.co/papers/2501.02423', 'title': 'Scaling Laws for Floating Point Quantization Training', 'url': 'https://huggingface.co/papers/2501.02423', 'abstract': 'Low-precision training is considered an effective strategy for reducing both training and downstream inference costs. Previous scaling laws for precision mainly focus on integer quantization, which pay less attention to the constituents in floating-point quantization and thus cannot well fit the LLM losses in this scenario. 
In contrast, while floating-point quantization training is more commonly implemented in production, the research on it has been relatively superficial. In this paper, we thoroughly explore the effects of floating-point quantization targets, exponent bits, mantissa bits, and the calculation granularity of the scaling factor on the floating-point quantization training performance of LLMs. While presenting an accurate unified scaling law for floating-point quantization, we also provide valuable suggestions for the community: (1) Exponent bits contribute slightly more to model performance than mantissa bits. We provide the optimal exponent-mantissa bit ratio for different bit numbers, which is available for future reference by hardware manufacturers; (2) We discover the formation of a critical data size in low-precision LLM training. Training data in excess of the critical data size will instead degrade LLM performance; (3) The optimal floating-point quantization precision is directly proportional to the computational power, but within a wide computational power range, we estimate that the best cost-performance precision lies between 4 and 8 bits.', 'score': 4, 'issue_id': 1537, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'be6872257cb9a129', 'authors': ['Xingwu Sun', 'Shuaipeng Li', 'Ruobing Xie', 'Weidong Han', 'Kan Wu', 'Zhen Yang', 'Yixing Li', 'An Wang', 'Shuai Li', 'Jinbao Xue', 'Yu Cheng', 'Yangyu Tao', 'Zhanhui Kang', 'Chengzhong Xu', 'Di Wang', 'Jie Jiang'], 'affiliations': ['Tencent Hunyuan', 'The Chinese University of Hong Kong', 'Tokyo Institute of Technology', 'University of Macau'], 'pdf_title_img': 'assets/pdf/title_img/2501.02423.jpg', 'data': {'categories': ['#training', '#optimization', '#inference'], 'emoji': '🧮', 'ru': {'title': 'Оптимизация точности вычислений в обучении языковых моделей', 'desc': 'Статья исследует влияние квантования с плавающей запятой на обучение больших языковых моделей (LLM). Авторы анализируют роль экспоненциальных и мантиссных битов, а также размера обучающих данных в производительности моделей. Они представляют унифицированный закон масштабирования для квантования с плавающей запятой и дают рекомендации по оптимальному соотношению битов и размеру данных. Исследование показывает, что оптимальная точность квантования находится в диапазоне 4-8 бит для широкого спектра вычислительных мощностей.'}, 'en': {'title': 'Optimizing Floating-Point Quantization for Better LLM Performance', 'desc': 'This paper investigates the impact of floating-point quantization on the training performance of large language models (LLMs). It highlights that previous research primarily focused on integer quantization, neglecting the nuances of floating-point quantization. The authors establish a unified scaling law for floating-point quantization and provide insights on the optimal ratio of exponent to mantissa bits, emphasizing that exponent bits have a greater influence on model performance. 
Additionally, they identify a critical data size threshold, beyond which performance may degrade, and suggest that the best precision for cost-performance lies between 4-8 bits, depending on computational power.'}, 'zh': {'title': '低精度训练:优化浮点量化的关键', 'desc': '低精度训练被认为是降低训练和推理成本的有效策略。以往的研究主要集中在整数量化上,而对浮点量化的研究相对较少,导致无法很好地适应大语言模型的损失情况。本文深入探讨了浮点量化训练中目标、指数位、尾数位和缩放因子的计算粒度对大语言模型性能的影响,并提出了统一的浮点量化缩放法则。研究结果表明,指数位对模型性能的贡献略高于尾数位,并发现了低精度训练中的关键数据大小。'}}}, {'id': 'https://huggingface.co/papers/2501.02832', 'title': 'Samba-asr state-of-the-art speech recognition leveraging structured state-space models', 'url': 'https://huggingface.co/papers/2501.02832', 'abstract': 'We propose Samba ASR, the first state-of-the-art Automatic Speech Recognition (ASR) model leveraging the novel Mamba architecture as both encoder and decoder, built on the foundation of state-space models (SSMs). Unlike transformer-based ASR models, which rely on self-attention mechanisms to capture dependencies, Samba ASR effectively models both local and global temporal dependencies using efficient state-space dynamics, achieving remarkable performance gains. By addressing the limitations of transformers, such as quadratic scaling with input length and difficulty in handling long-range dependencies, Samba ASR achieves superior accuracy and efficiency. Experimental results demonstrate that Samba ASR surpasses existing open-source transformer-based ASR models across various standard benchmarks, establishing it as the new state of the art in ASR. Extensive evaluations on benchmark datasets show significant improvements in Word Error Rate (WER), with competitive performance even in low-resource scenarios. Furthermore, the computational efficiency and parameter optimization of the Mamba architecture make Samba ASR a scalable and robust solution for diverse ASR tasks. Our contributions include: A new Samba ASR architecture demonstrating the superiority of SSMs over transformer-based models for speech sequence processing. A comprehensive evaluation on public benchmarks showcasing state-of-the-art performance. An analysis of computational efficiency, robustness to noise, and sequence generalization. This work highlights the viability of Mamba SSMs as a transformer-free alternative for efficient and accurate ASR. By leveraging state-space modeling advancements, Samba ASR sets a new benchmark for ASR performance and future research.', 'score': 4, 'issue_id': 1530, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'ed3c4a6192d0c5f9', 'authors': ['Syed Abdul Gaffar Shakhadri', 'Kruthika KR', 'Kartik Basavaraj Angadi'], 'affiliations': ['SandLogic Technologies Pvt Ltd'], 'pdf_title_img': 'assets/pdf/title_img/2501.02832.jpg', 'data': {'categories': ['#audio', '#architecture', '#benchmark', '#low_resource', '#open_source'], 'emoji': '🎙️', 'ru': {'title': 'Samba ASR: революция в распознавании речи с помощью моделей пространства состояний', 'desc': 'Представлена модель Samba ASR - первая современная система автоматического распознавания речи, использующая архитектуру Mamba в качестве энкодера и декодера на основе моделей пространства состояний (SSM). В отличие от трансформерных моделей, Samba ASR эффективно моделирует локальные и глобальные временные зависимости, достигая значительных улучшений производительности. Экспериментальные результаты показывают, что Samba ASR превосходит существующие модели с открытым исходным кодом на основе трансформеров по различным стандартным показателям. 
Модель демонстрирует значительное снижение показателя Word Error Rate (WER) и высокую эффективность даже при ограниченных ресурсах.'}, 'en': {'title': 'Samba ASR: Redefining Speech Recognition with State-Space Models', 'desc': 'Samba ASR is a groundbreaking Automatic Speech Recognition model that utilizes the innovative Mamba architecture, which functions as both the encoder and decoder. This model departs from traditional transformer-based approaches by employing state-space models (SSMs) to effectively capture both local and global temporal dependencies, leading to enhanced performance. By overcoming the challenges associated with transformers, such as their inefficiency with long input sequences, Samba ASR achieves superior accuracy and efficiency in recognizing speech. Extensive testing shows that Samba ASR not only outperforms existing transformer-based models but also excels in low-resource environments, making it a robust solution for various ASR applications.'}, 'zh': {'title': 'Samba ASR:超越变换器的语音识别新标杆', 'desc': '我们提出了Samba ASR,这是第一个利用新型Mamba架构作为编码器和解码器的最先进自动语音识别(ASR)模型。与基于变换器的ASR模型不同,Samba ASR通过高效的状态空间动态建模局部和全局时间依赖关系,从而实现显著的性能提升。该模型克服了变换器在处理长距离依赖和输入长度的平方扩展等方面的局限性,展现出更高的准确性和效率。实验结果表明,Samba ASR在多个标准基准测试中超越了现有的开源变换器ASR模型,确立了其在ASR领域的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2501.00912', 'title': 'AutoPresent: Designing Structured Visuals from Scratch', 'url': 'https://huggingface.co/papers/2501.00912', 'abstract': "Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation with 7k training and 585 testing examples derived from 310 slide decks across 10 domains. SlidesBench supports evaluations that are (i)reference-based to measure similarity to a target slide, and (ii)reference-free to measure the design quality of generated slides alone. We benchmark end-to-end image generation and program generation methods with a variety of models, and find that programmatic methods produce higher-quality slides in user-interactable formats. Built on the success of program generation, we create AutoPresent, an 8B Llama-based model trained on 7k pairs of instructions paired with code for slide generation, and achieve results comparable to the closed-source model GPT-4o. We further explore iterative design refinement where the model is tasked to self-refine its own output, and we found that this process improves the slide's quality. 
We hope that our work will provide a basis for future work on generating structured visuals.", 'score': 3, 'issue_id': 1539, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'ea7b88fcc0a2025b', 'authors': ['Jiaxin Ge', 'Zora Zhiruo Wang', 'Xuhui Zhou', 'Yi-Hao Peng', 'Sanjay Subramanian', 'Qinyue Tan', 'Maarten Sap', 'Alane Suhr', 'Daniel Fried', 'Graham Neubig', 'Trevor Darrell'], 'affiliations': ['Carnegie Mellon University', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.00912.jpg', 'data': {'categories': ['#dataset', '#story_generation', '#training', '#benchmark', '#multimodal'], 'emoji': '🎯', 'ru': {'title': 'Автоматизация создания презентаций: от текста к структурированным визуальным материалам', 'desc': 'Эта статья представляет новый бенчмарк SlidesBench для автоматической генерации слайдов презентаций на основе текстовых инструкций. Авторы сравнивают методы генерации изображений и программного кода, обнаружив преимущество последнего подхода. Они создают модель AutoPresent на базе Llama для генерации кода слайдов, достигающую результатов, сопоставимых с GPT-4. Исследователи также изучают итеративное улучшение дизайна слайдов с помощью самооптимизации модели.'}, 'en': {'title': 'Automating Slide Generation with Advanced Models', 'desc': 'This paper addresses the challenge of creating automated slide presentations from natural language instructions. It introduces the SlidesBench benchmark, which includes a large dataset for training and testing slide generation models. The authors evaluate various methods, finding that programmatic approaches yield higher-quality slides. They also present AutoPresent, a model that competes with advanced models like GPT-4o, and demonstrate that iterative design refinement enhances the quality of generated slides.'}, 'zh': {'title': '自动生成高质量演示幻灯片的未来', 'desc': '本研究旨在自动生成演示幻灯片,解决内容创作和视觉规划的挑战。我们首次引入SlidesBench基准,包含7000个训练样本和585个测试样本,涵盖10个领域的310个幻灯片集。通过对比不同模型的图像生成和程序生成方法,我们发现程序生成方法在用户交互格式中生成的幻灯片质量更高。基于程序生成的成功,我们开发了AutoPresent模型,并通过自我优化过程进一步提升幻灯片的质量。'}}}, {'id': 'https://huggingface.co/papers/2501.03225', 'title': 'Automated Generation of Challenging Multiple-Choice Questions for Vision Language Model Evaluation', 'url': 'https://huggingface.co/papers/2501.03225', 'abstract': 'The rapid development of vision language models (VLMs) demands rigorous and reliable evaluation. However, current visual question answering (VQA) benchmarks often depend on open-ended questions, making accurate evaluation difficult due to the variability in natural language responses. To address this, we introduce AutoConverter, an agentic framework that automatically converts these open-ended questions into multiple-choice format, enabling objective evaluation while reducing the costly question creation process. Our experiments demonstrate that AutoConverter can generate correct and challenging multiple-choice questions, with VLMs demonstrating consistently similar or lower accuracy on these questions compared to human-created ones. Using AutoConverter, we construct VMCBench, a benchmark created by transforming 20 existing VQA datasets into a unified multiple-choice format, totaling 9,018 questions. 
We comprehensively evaluate 33 state-of-the-art VLMs on VMCBench, setting a new standard for scalable, consistent, and reproducible VLM evaluation.', 'score': 1, 'issue_id': 1542, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'aa212f5e596ed0e6', 'authors': ['Yuhui Zhang', 'Yuchang Su', 'Yiming Liu', 'Xiaohan Wang', 'James Burgess', 'Elaine Sui', 'Chenyu Wang', 'Josiah Aklilu', 'Alejandro Lozano', 'Anjiang Wei', 'Ludwig Schmidt', 'Serena Yeung-Levy'], 'affiliations': ['MIT', 'Stanford University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03225.jpg', 'data': {'categories': ['#interpretability', '#agents', '#benchmark', '#cv', '#survey', '#games', '#optimization'], 'emoji': '🔄', 'ru': {'title': 'Автоматизация оценки моделей машинного зрения и языка', 'desc': 'Исследователи представили AutoConverter - агентную систему для автоматического преобразования открытых вопросов в вопросы с множественным выбором для оценки моделей машинного зрения и языка (VLM). Эта система позволяет объективно оценивать VLM, избегая сложностей, связанных с вариативностью естественно-языковых ответов. На основе AutoConverter был создан бенчмарк VMCBench, включающий 9018 вопросов из 20 существующих наборов данных для визуальных вопросов и ответов (VQA). VMCBench был использован для всесторонней оценки 33 современных VLM, устанавливая новый стандарт масштабируемой и воспроизводимой оценки таких моделей.'}, 'en': {'title': 'Transforming VQA for Objective Evaluation with AutoConverter', 'desc': 'This paper presents AutoConverter, a framework designed to improve the evaluation of vision language models (VLMs) by converting open-ended visual question answering (VQA) questions into a multiple-choice format. This transformation allows for more objective assessments of VLM performance, addressing the challenges posed by the variability of natural language responses. The authors demonstrate that VLMs perform similarly or worse on these newly generated questions compared to those created by humans, indicating the rigor of the new benchmark. Additionally, they introduce VMCBench, a comprehensive dataset that standardizes 20 existing VQA datasets into a unified multiple-choice format, facilitating scalable and reproducible evaluations of VLMs.'}, 'zh': {'title': '自动化评估视觉语言模型的新标准', 'desc': '随着视觉语言模型(VLMs)的快速发展,评估这些模型的准确性变得尤为重要。现有的视觉问答(VQA)基准往往依赖开放式问题,这使得评估变得困难,因为自然语言回答的多样性很大。为了解决这个问题,我们提出了AutoConverter,这是一种自动将开放式问题转换为多项选择格式的框架,从而实现客观评估并减少问题创建的成本。通过使用AutoConverter,我们构建了VMCBench,这是一个将20个现有VQA数据集转化为统一多项选择格式的基准,包含9,018个问题,全面评估了33个最先进的VLMs,设定了可扩展、一致和可重复的VLM评估新标准。'}}}, {'id': 'https://huggingface.co/papers/2412.18525', 'title': 'Explanatory Instructions: Towards Unified Vision Tasks Understanding and Zero-shot Generalization', 'url': 'https://huggingface.co/papers/2412.18525', 'abstract': "Computer Vision (CV) has yet to fully achieve the zero-shot task generalization observed in Natural Language Processing (NLP), despite following many of the milestones established in NLP, such as large transformer models, extensive pre-training, and the auto-regression paradigm, among others. In this paper, we explore the idea that CV adopts discrete and terminological task definitions (\\eg, ``image segmentation''), which may be a key barrier to zero-shot task generalization. Our hypothesis is that without truly understanding previously-seen tasks--due to these terminological definitions--deep models struggle to generalize to novel tasks. 
To verify this, we introduce Explanatory Instructions, which provide an intuitive way to define CV task objectives through detailed linguistic transformations from input images to outputs. We create a large-scale dataset comprising 12 million ``image input to explanatory instruction to output'' triplets, and train an auto-regressive-based vision-language model (AR-based VLM) that takes both images and explanatory instructions as input. By learning to follow these instructions, the AR-based VLM achieves instruction-level zero-shot capabilities for previously-seen tasks and demonstrates strong zero-shot generalization for unseen CV tasks. Code and dataset will be openly available on our GitHub repository.", 'score': 48, 'issue_id': 1406, 'pub_date': '2024-12-24', 'pub_date_card': {'ru': '24 декабря', 'en': 'December 24', 'zh': '12月24日'}, 'hash': '23f11aceae00534d', 'authors': ['Yang Shen', 'Xiu-Shen Wei', 'Yifan Sun', 'Yuxin Song', 'Tao Yuan', 'Jian Jin', 'Heyang Xu', 'Yazhou Yao', 'Errui Ding'], 'affiliations': ['Baidu', 'Nanjing University of Science and Technology', 'Southeast University'], 'pdf_title_img': 'assets/pdf/title_img/2412.18525.jpg', 'data': {'categories': ['#dataset', '#open_source', '#cv', '#multimodal', '#transfer_learning'], 'emoji': '🔬', 'ru': {'title': 'Лингвистические инструкции - ключ к обобщению в компьютерном зрении', 'desc': "В статье исследуется проблема недостаточной способности моделей компьютерного зрения к обобщению на новые задачи без предварительного обучения. Авторы предлагают использовать подробные лингвистические инструкции для определения задач вместо дискретных терминологических определений. Они создали большой датасет из 12 миллионов примеров 'изображение-инструкция-результат' и обучили авторегрессионную мультимодальную модель следовать этим инструкциям. Эксперименты показали, что такой подход позволяет модели лучше обобщаться на новые задачи компьютерного зрения без дополнительного обучения."}, 'en': {'title': 'Unlocking Zero-Shot Generalization in Computer Vision with Explanatory Instructions', 'desc': "This paper addresses the challenge of zero-shot task generalization in Computer Vision (CV), which has not reached the levels seen in Natural Language Processing (NLP). The authors argue that the use of specific terminological definitions for tasks in CV, like 'image segmentation', limits the models' ability to generalize to new tasks. To overcome this, they propose 'Explanatory Instructions' that transform image inputs into detailed linguistic outputs, helping models understand tasks better. They introduce a large dataset of 12 million triplets and train an auto-regressive vision-language model that successfully demonstrates zero-shot capabilities for both seen and unseen tasks."}, 'zh': {'title': '突破计算机视觉的零样本任务泛化', 'desc': '本文探讨了计算机视觉(CV)在零样本任务泛化方面的挑战,尤其是与自然语言处理(NLP)的对比。我们认为,CV使用的术语性任务定义(如“图像分割”)可能是阻碍零样本任务泛化的关键因素。为了解决这个问题,我们引入了“解释性指令”,通过详细的语言转换来直观地定义CV任务目标。我们创建了一个包含1200万对“图像输入、解释性指令和输出”的大规模数据集,并训练了一个基于自回归的视觉语言模型,实现了对已见任务的指令级零样本能力,并在未见的CV任务上展示了强大的零样本泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2412.20070', 'title': 'On the Compositional Generalization of Multimodal LLMs for Medical Imaging', 'url': 'https://huggingface.co/papers/2412.20070', 'abstract': 'Multimodal large language models (MLLMs) hold significant potential in the medical field, but their capabilities are often limited by insufficient data in certain medical domains, highlighting the need for understanding what kinds of images can be used by MLLMs for generalization. 
Current research suggests that multi-task training outperforms single-task training, as different tasks can benefit each other, but such studies often overlook the internal relationships within these tasks and thus provide limited guidance on selecting datasets to enhance specific tasks. To analyze this phenomenon, we attempted to employ compositional generalization (CG), the ability of models to understand novel combinations by recombining learned elements, as a guiding framework. Since medical images can be precisely defined by Modality, Anatomical area, and Task, they naturally provide an environment for exploring CG. We therefore assembled 106 medical datasets to create Med-MAT for comprehensive experiments. The experiments confirmed that MLLMs can use CG to understand unseen medical images and identified CG as one of the main drivers of the generalization observed in multi-task training. Additionally, further studies demonstrated that CG effectively supports datasets with limited data and delivers consistent performance across different backbones, highlighting its versatility and broad applicability. Med-MAT is publicly available at https://github.com/FreedomIntelligence/Med-MAT.', 'score': 36, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': '34f9c6ec4611d6ec', 'authors': ['Zhenyang Cai', 'Junying Chen', 'Rongsheng Wang', 'Weihong Wang', 'Yonglin Deng', 'Dingjie Song', 'Yize Chen', 'Zixu Zhang', 'Benyou Wang'], 'affiliations': ['The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2412.20070.jpg', 'data': {'categories': ['#dataset', '#healthcare', '#open_source', '#multimodal', '#transfer_learning'], 'emoji': '🩺', 'ru': {'title': 'Композиционная генерализация - ключ к пониманию медицинских изображений для MLLM', 'desc': 'Статья исследует возможности мультимодальных больших языковых моделей (MLLM) в медицинской сфере, фокусируясь на композиционной генерализации (CG). Авторы создали набор данных Med-MAT из 106 медицинских датасетов для изучения способности моделей понимать новые комбинации изображений. Эксперименты показали, что MLLM могут использовать CG для интерпретации ранее невиданных медицинских изображений. Исследование также выявило эффективность CG для датасетов с ограниченными данными и стабильность результатов на разных архитектурах моделей.'}, 'en': {'title': 'Unlocking Medical Insights with Compositional Generalization', 'desc': "This paper explores the use of multimodal large language models (MLLMs) in the medical field, focusing on how they can generalize from limited data. It highlights the advantages of multi-task training over single-task training, emphasizing the importance of understanding the relationships between different tasks. The authors introduce compositional generalization (CG) as a framework to enhance the model's ability to interpret new combinations of medical images. 
They created a dataset called Med-MAT, which consists of 106 medical datasets, and found that CG significantly improves the performance of MLLMs, especially in scenarios with scarce data."}, 'zh': {'title': '组合泛化助力医学图像理解', 'desc': '多模态大型语言模型(MLLMs)在医学领域具有重要潜力,但在某些医学领域的数据不足限制了其能力。当前研究表明,多任务训练优于单任务训练,因为不同任务可以相互促进,但往往忽视了这些任务之间的内部关系。我们采用组合泛化(CG)作为指导框架,分析模型如何理解新组合的能力,并组建了106个医学数据集以创建Med-MAT进行全面实验。实验结果确认,MLLMs能够利用CG理解未见过的医学图像,并且CG是多任务训练中观察到的泛化的主要驱动因素之一。'}}}, {'id': 'https://huggingface.co/papers/2412.20422', 'title': 'Bringing Objects to Life: 4D generation from 3D objects', 'url': 'https://huggingface.co/papers/2412.20422', 'abstract': 'Recent advancements in generative modeling now enable the creation of 4D content (moving 3D objects) controlled with text prompts. 4D generation has large potential in applications like virtual worlds, media, and gaming, but existing methods provide limited control over the appearance and geometry of generated content. In this work, we introduce a method for animating user-provided 3D objects by conditioning on textual prompts to guide 4D generation, enabling custom animations while maintaining the identity of the original object. We first convert a 3D mesh into a ``static" 4D Neural Radiance Field (NeRF) that preserves the visual attributes of the input object. Then, we animate the object using an Image-to-Video diffusion model driven by text. To improve motion realism, we introduce an incremental viewpoint selection protocol for sampling perspectives to promote lifelike movement and a masked Score Distillation Sampling (SDS) loss, which leverages attention maps to focus optimization on relevant regions. We evaluate our model in terms of temporal coherence, prompt adherence, and visual fidelity and find that our method outperforms baselines that are based on other approaches, achieving up to threefold improvements in identity preservation measured using LPIPS scores, and effectively balancing visual quality with dynamic content.', 'score': 29, 'issue_id': 1408, 'pub_date': '2024-12-29', 'pub_date_card': {'ru': '29 декабря', 'en': 'December 29', 'zh': '12月29日'}, 'hash': 'de742e56a5ec379f', 'authors': ['Ohad Rahamim', 'Ori Malca', 'Dvir Samuel', 'Gal Chechik'], 'affiliations': ['Bar-Ilan University', 'NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2412.20422.jpg', 'data': {'categories': ['#optimization', '#multimodal', '#games', '#diffusion', '#video', '#3d'], 'emoji': '🎭', 'ru': {'title': 'Оживление 3D-объектов с помощью текста: новый рубеж в генеративном моделировании', 'desc': 'Статья представляет новый метод анимации 3D-объектов с помощью текстовых подсказок. Авторы используют генеративную модель для создания 4D-контента (движущихся 3D-объектов), сохраняя при этом исходный вид объекта. Метод включает преобразование 3D-меша в статическое 4D нейронное радиальное поле (NeRF) и последующую анимацию с помощью диффузионной модели Image-to-Video. Для улучшения реалистичности движения введены протокол выбора ракурсов и маскированная функция потерь Score Distillation Sampling.'}, 'en': {'title': 'Animating 3D Objects with Text Prompts for Realistic 4D Generation', 'desc': "This paper presents a novel approach to generating 4D content by animating 3D objects based on text prompts. The method involves converting a 3D mesh into a static 4D Neural Radiance Field (NeRF) to retain the object's visual characteristics. It then utilizes an Image-to-Video diffusion model to create animations while ensuring the original object's identity is preserved. 
The authors enhance motion realism through a viewpoint selection protocol and a masked Score Distillation Sampling loss, leading to significant improvements in visual quality and dynamic content generation."}, 'zh': {'title': '文本驱动的4D动画生成新方法', 'desc': '本研究提出了一种新方法,可以通过文本提示来控制4D内容的生成,特别是动画用户提供的3D对象。我们首先将3D网格转换为静态的4D神经辐射场(NeRF),以保留输入对象的视觉特征。然后,利用图像到视频的扩散模型进行动画制作,确保生成的动画与文本提示相符。通过引入增量视角选择协议和掩码评分蒸馏损失,我们提高了运动的真实感,并在多个评估指标上超越了现有方法。'}}}, {'id': 'https://huggingface.co/papers/2412.20993', 'title': 'Efficiently Serving LLM Reasoning Programs with Certaindex', 'url': 'https://huggingface.co/papers/2412.20993', 'abstract': 'The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems fail to adapt to the scaling behaviors of these algorithms or the varying difficulty of queries, leading to inefficient resource use and unmet latency targets. We present Dynasor, a system that optimizes inference-time compute for LLM reasoning queries. Unlike traditional engines, Dynasor tracks and schedules requests within reasoning queries and uses Certaindex, a proxy that measures statistical reasoning progress based on model certainty, to guide compute allocation dynamically. Dynasor co-adapts scheduling with reasoning progress: it allocates more compute to hard queries, reduces compute for simpler ones, and terminates unpromising queries early, balancing accuracy, latency, and cost. On diverse datasets and algorithms, Dynasor reduces compute by up to 50% in batch processing and sustaining 3.3x higher query rates or 4.7x tighter latency SLOs in online serving.', 'score': 24, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '7fe76ed90463d977', 'authors': ['Yichao Fu', 'Junda Chen', 'Siqi Zhu', 'Zheyu Fu', 'Zhongdongming Dai', 'Aurick Qiao', 'Hao Zhang'], 'affiliations': ['Snowflake', 'Tsinghua University', 'UC San Diego'], 'pdf_title_img': 'assets/pdf/title_img/2412.20993.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Dynasor: умное распределение ресурсов для эффективных LLM-рассуждений', 'desc': 'Статья представляет систему Dynasor, оптимизирующую вычисления для задач рассуждения с использованием больших языковых моделей (LLM). Dynasor отслеживает и планирует запросы, используя прокси Certaindex для измерения прогресса рассуждений на основе уверенности модели. Система динамически распределяет вычислительные ресурсы, уделяя больше внимания сложным запросам и меньше простым, а также прекращая бесперспективные запросы. Dynasor показывает значительное снижение вычислительных затрат и улучшение производительности на различных наборах данных и алгоритмах.'}, 'en': {'title': 'Dynasor: Smart Compute Allocation for Efficient LLM Reasoning', 'desc': "This paper introduces Dynasor, a system designed to optimize the compute resources used during inference for large language models (LLMs) when handling reasoning queries. It addresses the inefficiencies of existing serving systems that do not adapt to the complexity of different queries or the scaling needs of inference-time reasoning algorithms. 
Dynasor employs a dynamic scheduling approach that allocates compute resources based on the difficulty of the query, using a proxy called Certaindex to measure the model's certainty in its reasoning. As a result, Dynasor can significantly reduce compute usage while improving query processing rates and meeting latency targets more effectively."}, 'zh': {'title': 'Dynasor:优化推理查询的计算效率', 'desc': '这篇论文介绍了Dynasor系统,它优化了大型语言模型(LLM)在推理查询时的计算效率。Dynasor通过跟踪和调度推理查询中的请求,动态分配计算资源,以应对不同难度的查询。该系统使用Certaindex代理,根据模型的确定性来衡量推理进展,从而指导计算分配。通过在多种数据集和算法上测试,Dynasor在批处理时减少了多达50%的计算需求,同时在在线服务中实现了3.3倍更高的查询速率或4.7倍更严格的延迟服务水平目标。'}}}, {'id': 'https://huggingface.co/papers/2412.21037', 'title': 'TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization', 'url': 'https://huggingface.co/papers/2412.21037', 'abstract': 'We introduce TangoFlux, an efficient Text-to-Audio (TTA) generative model with 515M parameters, capable of generating up to 30 seconds of 44.1kHz audio in just 3.7 seconds on a single A40 GPU. A key challenge in aligning TTA models lies in the difficulty of creating preference pairs, as TTA lacks structured mechanisms like verifiable rewards or gold-standard answers available for Large Language Models (LLMs). To address this, we propose CLAP-Ranked Preference Optimization (CRPO), a novel framework that iteratively generates and optimizes preference data to enhance TTA alignment. We demonstrate that the audio preference dataset generated using CRPO outperforms existing alternatives. With this framework, TangoFlux achieves state-of-the-art performance across both objective and subjective benchmarks. We open source all code and models to support further research in TTA generation.', 'score': 19, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'bb669623871df661', 'authors': ['Chia-Yu Hung', 'Navonil Majumder', 'Zhifeng Kong', 'Ambuj Mehrish', 'Rafael Valle', 'Bryan Catanzaro', 'Soujanya Poria'], 'affiliations': ['NVIDIA', 'Singapore University of Technology and Design (SUTD)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21037.jpg', 'data': {'categories': ['#dataset', '#audio', '#open_source', '#benchmark', '#alignment', '#rlhf', '#small_models'], 'emoji': '🎵', 'ru': {'title': 'TangoFlux: Революция в генерации аудио из текста', 'desc': 'TangoFlux - это эффективная генеративная модель для преобразования текста в аудио (Text-to-Audio, TTA) с 515 миллионами параметров. Модель способна генерировать до 30 секунд аудио с частотой 44,1 кГц всего за 3,7 секунды на одном GPU A40. Авторы представляют новую методику CLAP-Ranked Preference Optimization (CRPO) для улучшения согласованности TTA моделей путем итеративной генерации и оптимизации данных о предпочтениях. TangoFlux достигает передовых результатов в объективных и субъективных тестах, а код и модели открыты для дальнейших исследований.'}, 'en': {'title': 'TangoFlux: Revolutionizing Text-to-Audio Generation with CRPO', 'desc': "TangoFlux is a powerful Text-to-Audio generative model that can create high-quality audio quickly and efficiently. It addresses the challenge of aligning TTA models by introducing a new method called CLAP-Ranked Preference Optimization (CRPO), which helps generate and optimize preference data. This approach improves the model's ability to understand and produce audio that aligns with user preferences. 
The results show that TangoFlux not only meets but exceeds current standards in both objective and subjective evaluations, and the team has made their code and models available for further research."}, 'zh': {'title': 'TangoFlux:高效的文本到音频生成模型', 'desc': '我们介绍了TangoFlux,这是一种高效的文本到音频生成模型,拥有5.15亿个参数,能够在单个A40 GPU上以3.7秒的速度生成最长30秒的44.1kHz音频。TTA模型对齐的一个主要挑战是创建偏好对的困难,因为TTA缺乏像大型语言模型(LLMs)那样的可验证奖励或标准答案的结构化机制。为了解决这个问题,我们提出了CLAP-Ranked Preference Optimization(CRPO),这是一个新颖的框架,通过迭代生成和优化偏好数据来增强TTA的对齐。我们证明了使用CRPO生成的音频偏好数据集在现有替代方案中表现更优,TangoFlux在客观和主观基准测试中都达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.21079', 'title': 'Edicho: Consistent Image Editing in the Wild', 'url': 'https://huggingface.co/papers/2412.21079', 'abstract': 'As a verified need, consistent editing across in-the-wild images remains a technical challenge arising from various unmanageable factors, like object poses, lighting conditions, and photography environments. Edicho steps in with a training-free solution based on diffusion models, featuring a fundamental design principle of using explicit image correspondence to direct editing. Specifically, the key components include an attention manipulation module and a carefully refined classifier-free guidance (CFG) denoising strategy, both of which take into account the pre-estimated correspondence. Such an inference-time algorithm enjoys a plug-and-play nature and is compatible to most diffusion-based editing methods, such as ControlNet and BrushNet. Extensive results demonstrate the efficacy of Edicho in consistent cross-image editing under diverse settings. We will release the code to facilitate future studies.', 'score': 17, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '8068418a331b2086', 'authors': ['Qingyan Bai', 'Hao Ouyang', 'Yinghao Xu', 'Qiuyu Wang', 'Ceyuan Yang', 'Ka Leong Cheng', 'Yujun Shen', 'Qifeng Chen'], 'affiliations': ['Ant Group', 'CUHK', 'HKUST', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21079.jpg', 'data': {'categories': ['#cv', '#diffusion', '#open_source', '#inference'], 'emoji': '🖼️', 'ru': {'title': 'Edicho: согласованное редактирование изображений без обучения', 'desc': 'Статья представляет Edicho - решение для согласованного редактирования изображений без обучения, основанное на диффузионных моделях. Ключевые компоненты включают модуль манипуляции вниманием и стратегию шумоподавления без классификатора, использующие предварительно оцененное соответствие между изображениями. Этот алгоритм совместим с большинством методов редактирования на основе диффузии, таких как ControlNet и BrushNet. Результаты демонстрируют эффективность Edicho в согласованном редактировании изображений в различных условиях.'}, 'en': {'title': 'Edicho: Consistent Image Editing Made Easy with Diffusion Models', 'desc': 'This paper introduces Edicho, a novel approach for consistent editing of images that addresses challenges like varying object poses and lighting. It utilizes diffusion models without the need for prior training, focusing on explicit image correspondence to guide the editing process. Key innovations include an attention manipulation module and a refined classifier-free guidance denoising strategy, which enhance the editing quality by considering pre-estimated correspondences. 
The method is designed to be easily integrated with existing diffusion-based editing techniques, showing strong performance across different scenarios.'}, 'zh': {'title': 'Edicho:无训练一致性图像编辑的新方法', 'desc': 'Edicho 是一种基于扩散模型的无训练解决方案,旨在解决在不同环境下进行一致性图像编辑的挑战。它的设计原则是利用显式图像对应关系来指导编辑,确保在不同的拍摄条件下保持一致性。该方法包括一个注意力操作模块和经过精细调整的无分类器引导去噪策略,能够有效处理预估的对应关系。Edicho 具有即插即用的特性,兼容大多数基于扩散的编辑方法,实验结果显示其在多种设置下的有效性。'}}}, {'id': 'https://huggingface.co/papers/2412.21187', 'title': 'Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs', 'url': 'https://huggingface.co/papers/2412.21187', 'abstract': 'The remarkable performance of models like the OpenAI o1 can be attributed to their ability to emulate human-like long-time thinking during inference. These models employ extended chain-of-thought (CoT) processes, exploring multiple strategies to enhance problem-solving capabilities. However, a critical question remains: How to intelligently and efficiently scale computational resources during testing. This paper presents the first comprehensive study on the prevalent issue of overthinking in these models, where excessive computational resources are allocated for simple problems with minimal benefit. We introduce novel efficiency metrics from both outcome and process perspectives to evaluate the rational use of computational resources by o1-like models. Using a self-training paradigm, we propose strategies to mitigate overthinking, streamlining reasoning processes without compromising accuracy. Experimental results show that our approach successfully reduces computational overhead while preserving model performance across a range of testsets with varying difficulty levels, such as GSM8K, MATH500, GPQA, and AIME.', 'score': 11, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '78da22eae14fe26c', 'authors': ['Xingyu Chen', 'Jiahao Xu', 'Tian Liang', 'Zhiwei He', 'Jianhui Pang', 'Dian Yu', 'Linfeng Song', 'Qiuzhi Liu', 'Mengfei Zhou', 'Zhuosheng Zhang', 'Rui Wang', 'Zhaopeng Tu', 'Haitao Mi', 'Dong Yu'], 'affiliations': ['Shanghai Jiao Tong University', 'Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2412.21187.jpg', 'data': {'categories': ['#optimization', '#reasoning', '#training', '#math', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Эффективное мышление ИИ: борьба с избыточными вычислениями', 'desc': 'Статья исследует проблему избыточных вычислений (overthinking) в крупных языковых моделях типа OpenAI o1 при решении задач. Авторы вводят новые метрики эффективности для оценки рационального использования вычислительных ресурсов такими моделями. Предлагается стратегия на основе самообучения для оптимизации рассуждений модели без потери точности. Экспериментальные результаты показывают успешное снижение вычислительных затрат при сохранении производительности на различных наборах тестов.'}, 'en': {'title': 'Streamlining Reasoning: Tackling Overthinking in AI Models', 'desc': "This paper investigates the phenomenon of overthinking in advanced machine learning models, particularly those like OpenAI's o1, which excel at long-term reasoning. It highlights the inefficiencies that arise when these models allocate excessive computational resources to solve simple problems, leading to minimal gains in performance. The authors propose new efficiency metrics to assess how well these models utilize their computational power during inference. 
By implementing a self-training approach, they present strategies to reduce overthinking, achieving a balance between computational efficiency and model accuracy across various challenging test sets."}, 'zh': {'title': '优化计算资源,提升模型效率', 'desc': '本文探讨了像OpenAI o1这样的模型在推理过程中模拟人类长期思考的能力。研究指出,这些模型在解决问题时常常会过度思考,导致在简单问题上分配过多的计算资源。我们提出了新的效率指标,从结果和过程两个角度评估计算资源的合理使用,并提出了自我训练的策略来减少过度思考。实验结果表明,我们的方法在不同难度的测试集上成功降低了计算开销,同时保持了模型的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.20005', 'title': 'OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System', 'url': 'https://huggingface.co/papers/2412.20005', 'abstract': "We introduce OneKE, a dockerized schema-guided knowledge extraction system, which can extract knowledge from the Web and raw PDF Books, and support various domains (science, news, etc.). Specifically, we design OneKE with multiple agents and a configure knowledge base. Different agents perform their respective roles, enabling support for various extraction scenarios. The configure knowledge base facilitates schema configuration, error case debugging and correction, further improving the performance. Empirical evaluations on benchmark datasets demonstrate OneKE's efficacy, while case studies further elucidate its adaptability to diverse tasks across multiple domains, highlighting its potential for broad applications. We have open-sourced the Code at https://github.com/zjunlp/OneKE and released a Video at http://oneke.openkg.cn/demo.mp4.", 'score': 10, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': 'da8469c61421cefb', 'authors': ['Yujie Luo', 'Xiangyuan Ru', 'Kangwei Liu', 'Lin Yuan', 'Mengshu Sun', 'Ningyu Zhang', 'Lei Liang', 'Zhiqiang Zhang', 'Jun Zhou', 'Lanning Wei', 'Da Zheng', 'Haofen Wang', 'Huajun Chen'], 'affiliations': ['Ant Group', 'Tongji University', 'ZJU-Ant Group Joint Research Center for Knowledge Graphs', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2412.20005.jpg', 'data': {'categories': ['#dataset', '#agents', '#open_source', '#benchmark', '#multimodal', '#science'], 'emoji': '🧠', 'ru': {'title': 'OneKE: Универсальный инструмент для извлечения знаний из разнородных источников', 'desc': 'OneKE - это докеризованная система извлечения знаний, управляемая схемой. Она способна извлекать информацию из веб-ресурсов и PDF-книг, поддерживая различные домены, такие как наука и новости. Система использует множество агентов и настраиваемую базу знаний для выполнения различных сценариев извлечения. OneKE демонстрирует высокую эффективность на эталонных наборах данных и адаптируемость к разнообразным задачам в различных областях.'}, 'en': {'title': 'OneKE: Versatile Knowledge Extraction for Diverse Domains', 'desc': "OneKE is a knowledge extraction system designed to gather information from the Web and raw PDF books across various domains like science and news. It utilizes multiple agents, each responsible for specific tasks, which enhances its ability to handle different extraction scenarios effectively. The system includes a configurable knowledge base that aids in schema setup, debugging, and error correction, leading to improved performance. 
Empirical tests on benchmark datasets confirm OneKE's effectiveness, and case studies showcase its versatility in tackling diverse tasks."}, 'zh': {'title': 'OneKE:多领域知识提取的智能系统', 'desc': 'OneKE是一个基于Docker的知识提取系统,能够从网络和原始PDF书籍中提取知识,支持多个领域(如科学、新闻等)。该系统设计了多个智能代理,各自承担不同的角色,以适应各种提取场景。配置知识库的设计使得模式配置、错误调试和修正变得更加高效,从而提升了系统的性能。通过在基准数据集上的实证评估,OneKE展示了其有效性,并通过案例研究进一步说明了其在多个领域的适应性和广泛应用潜力。'}}}, {'id': 'https://huggingface.co/papers/2412.20631', 'title': "Slow Perception: Let's Perceive Geometric Figures Step-by-step", 'url': 'https://huggingface.co/papers/2412.20631', 'abstract': 'Recently, "visual o1" began to enter people\'s vision, with expectations that this slow-thinking design can solve visual reasoning tasks, especially geometric math problems. However, the reality is that current LVLMs (Large Vision Language Models) can hardly even accurately copy a geometric figure, let alone truly understand the complex inherent logic and spatial relationships within geometric shapes. We believe accurate copying (strong perception) is the first step to visual o1. Accordingly, we introduce the concept of "slow perception" (SP), which guides the model to gradually perceive basic point-line combinations, as our humans, reconstruct complex geometric structures progressively. There are two-fold stages in SP: a) perception decomposition. Perception is not instantaneous. In this stage, complex geometric figures are broken down into basic simple units to unify geometry representation. b) perception flow, which acknowledges that accurately tracing a line is not an easy task. This stage aims to avoid "long visual jumps" in regressing line segments by using a proposed "perceptual ruler" to trace each line stroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an inference time scaling law -- the slower, the better. Researchers strive to speed up the model\'s perception in the past, but we slow it down again, allowing the model to read the image step-by-step and carefully.', 'score': 9, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'f99c59b7ef92c667', 'authors': ['Haoran Wei', 'Youyang Yin', 'Yumeng Li', 'Jia Wang', 'Liang Zhao', 'Jianjian Sun', 'Zheng Ge', 'Xiangyu Zhang'], 'affiliations': ['Beihang University', 'Stepfun'], 'pdf_title_img': 'assets/pdf/title_img/2412.20631.jpg', 'data': {'categories': ['#cv', '#math', '#reasoning'], 'emoji': '🔍', 'ru': {'title': 'Медленнее значит лучше: новый подход к компьютерному зрению', 'desc': "Статья представляет концепцию 'медленного восприятия' (slow perception) для улучшения способности моделей компьютерного зрения копировать геометрические фигуры. Авторы предлагают двухэтапный подход: декомпозиция восприятия, разбивающая сложные фигуры на простые элементы, и поток восприятия, использующий 'перцептивную линейку' для точного отслеживания линий. Исследователи обнаружили, что более медленное восприятие приводит к лучшим результатам, что противоречит традиционному стремлению ускорить обработку изображений. Эта методика может стать первым шагом к решению задач визуального рассуждения и геометрических задач большими визуально-языковыми моделями."}, 'en': {'title': 'Slow Down to See Better: Enhancing Visual Reasoning with Slow Perception', 'desc': "This paper introduces the concept of 'slow perception' (SP) to enhance the capabilities of Large Vision Language Models (LVLMs) in visual reasoning tasks, particularly in understanding geometric shapes. 
SP consists of two stages: perception decomposition, where complex figures are simplified into basic components, and perception flow, which emphasizes careful tracing of lines to avoid errors. The authors argue that this method mimics human cognitive processes, allowing for a more accurate understanding of spatial relationships. Interestingly, they find that a slower, more deliberate approach to perception improves the model's performance, challenging the traditional focus on speed in machine learning."}, 'zh': {'title': '慢感知:逐步理解几何结构的关键', 'desc': '最近,"视觉o1"开始引起人们的关注,期望这种慢思维设计能够解决视觉推理任务,尤其是几何数学问题。然而,当前的大型视觉语言模型(LVLMs)在准确复制几何图形方面几乎无能为力,更不用说真正理解几何形状内在的复杂逻辑和空间关系。我们提出了"慢感知"(SP)的概念,指导模型逐步感知基本的点线组合,像人类一样逐步重建复杂的几何结构。SP包括两个阶段:感知分解和感知流,前者将复杂的几何图形分解为基本单元,后者通过使用"感知尺"逐步追踪每条线段,避免"长视觉跳跃"。'}}}, {'id': 'https://huggingface.co/papers/2412.21140', 'title': 'Facilitating large language model Russian adaptation with Learned Embedding Propagation', 'url': 'https://huggingface.co/papers/2412.21140', 'abstract': 'Rapid advancements in large language model (LLM) technologies have led to the introduction of powerful open-source instruction-tuned LLMs that have the same text generation quality as state-of-the-art counterparts such as GPT-4. While the emergence of such models accelerates the adoption of LLM technologies in sensitive-information environments, their authors do not disclose the training data necessary to replicate the results, making the achievements model-exclusive. Since those open-source models are also multilingual, this in turn reduces the benefits of training language-specific LLMs, as improved inference computation efficiency becomes the only guaranteed advantage of such a costly procedure. More cost-efficient options, such as vocabulary extension and subsequent continued pre-training, are also inhibited by the lack of access to high-quality instruction-tuning data, since such data is the major factor behind the resulting LLM task-solving capabilities. To address these limitations and cut the costs of the language adaptation pipeline, we propose Learned Embedding Propagation (LEP). Unlike existing approaches, our method has lower training data requirements due to its minimal impact on existing LLM knowledge, which we reinforce using a novel ad-hoc embedding propagation procedure that allows skipping the instruction-tuning step and instead implanting the new language knowledge directly into any existing instruct-tuned variant. We evaluated four Russian vocabulary adaptations for LLaMa-3-8B and Mistral-7B, showing that LEP is competitive with traditional instruction-tuning methods, achieving performance comparable to OpenChat 3.5 and LLaMa-3-8B-Instruct, with further improvements via self-calibration and continued tuning enhancing task-solving capabilities.', 'score': 9, 'issue_id': 1412, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '093f3929e323d180', 'authors': ['Mikhail Tikhomirov', 'Daniil Chernyshev'], 'affiliations': ['Lomonosov Moscow State University, Moscow, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2412.21140.jpg', 'data': {'categories': ['#data', '#training', '#low_resource', '#transfer_learning', '#dataset', '#open_source', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Эффективная адаптация языковых моделей без масштабного переобучения', 'desc': 'Статья представляет новый метод адаптации больших языковых моделей (LLM) к другим языкам, называемый Learned Embedding Propagation (LEP). 
Этот подход позволяет эффективно внедрять знания нового языка в существующие инструктированные LLM без необходимости повторного обучения на больших объемах данных. Авторы провели эксперименты с адаптацией моделей LLaMa-3-8B и Mistral-7B к русскому языку, показав, что LEP конкурентоспособен с традиционными методами инструктирования. Результаты демонстрируют, что LEP достигает производительности, сравнимой с OpenChat 3.5 и LLaMa-3-8B-Instruct, с возможностью дальнейшего улучшения через самокалибровку и дополнительную настройку.'}, 'en': {'title': 'Efficient Language Adaptation with Learned Embedding Propagation', 'desc': 'This paper introduces Learned Embedding Propagation (LEP), a novel method for adapting large language models (LLMs) to new languages without the need for extensive instruction-tuning data. LEP minimizes the training data requirements by directly embedding new language knowledge into existing instruct-tuned models, thus bypassing traditional instruction-tuning steps. The authors demonstrate that LEP can effectively adapt LLaMa-3-8B and Mistral-7B for Russian vocabulary, achieving performance on par with state-of-the-art models like OpenChat 3.5. This approach not only reduces costs but also enhances the efficiency of language adaptation in multilingual contexts.'}, 'zh': {'title': '学习嵌入传播:降低语言适应成本的新方法', 'desc': '这篇论文介绍了一种名为学习嵌入传播(LEP)的方法,旨在降低语言适应过程的成本。LEP方法通过最小化对现有大语言模型(LLM)知识的影响,减少了对训练数据的需求。与传统的指令调优方法相比,LEP能够直接将新的语言知识植入到现有的指令调优模型中,从而跳过指令调优步骤。实验结果表明,LEP在俄语词汇适应方面的表现与传统方法相当,且通过自我校准和持续调优进一步提升了任务解决能力。'}}}, {'id': 'https://huggingface.co/papers/2412.21139', 'title': 'Training Software Engineering Agents and Verifiers with SWE-Gym', 'url': 'https://huggingface.co/papers/2412.21139', 'abstract': 'We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents , achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories.', 'score': 9, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '800bb3f4c48e2cf9', 'authors': ['Jiayi Pan', 'Xingyao Wang', 'Graham Neubig', 'Navdeep Jaitly', 'Heng Ji', 'Alane Suhr', 'Yizhe Zhang'], 'affiliations': ['Apple', 'CMU', 'UC Berkeley', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2412.21139.jpg', 'data': {'categories': ['#dataset', '#open_source', '#agents', '#training'], 'emoji': '🤖', 'ru': {'title': 'SWE-Gym: революция в обучении ИИ-агентов для разработки ПО', 'desc': 'SWE-Gym - это новая среда для обучения агентов программной инженерии на реальных задачах. Она содержит 2438 экземпляров задач на Python с исполняемой средой, юнит-тестами и описанием на естественном языке. Авторы использовали SWE-Gym для обучения агентов на основе языковых моделей, достигнув улучшения на 19% в решении задач из наборов SWE-Bench. 
Комбинация обученных агентов и верификаторов позволила достичь нового рекорда производительности для открытых моделей в программной инженерии.'}, 'en': {'title': 'Revolutionizing Software Engineering with SWE-Gym', 'desc': 'SWE-Gym is a novel environment designed for training software engineering agents using real-world Python tasks. It includes 2,438 task instances, each with a codebase, executable environment, unit tests, and natural language task descriptions. The paper demonstrates that language model-based agents trained in SWE-Gym can significantly improve their performance, achieving up to 19% higher resolve rates on benchmark tests. Additionally, the authors explore scaling inference through verifiers, leading to state-of-the-art results for open-weight software engineering agents, and they provide resources for further research.'}, 'zh': {'title': 'SWE-Gym:软件工程代理的新起点', 'desc': '我们提出了SWE-Gym,这是第一个用于训练真实世界软件工程(SWE)代理的环境。SWE-Gym包含2438个真实的Python任务实例,每个实例都有可执行的运行环境、单元测试和用自然语言指定的任务。通过使用SWE-Gym,我们训练的基于语言模型的SWE代理在流行的SWE-Bench验证和Lite测试集上实现了高达19%的绝对解决率提升。我们还通过在SWE-Gym中采样的代理轨迹训练验证器,进行推理时的扩展,结合我们微调的SWE代理,在SWE-Bench验证和Lite上分别达到了32.0%和26.0%的新状态,成为开放权重SWE代理的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2412.21206', 'title': 'PERSE: Personalized 3D Generative Avatars from A Single Portrait', 'url': 'https://huggingface.co/papers/2412.21206', 'abstract': "We present PERSE, a method for building an animatable personalized generative avatar from a reference portrait. Our avatar model enables facial attribute editing in a continuous and disentangled latent space to control each facial attribute, while preserving the individual's identity. To achieve this, our method begins by synthesizing large-scale synthetic 2D video datasets, where each video contains consistent changes in the facial expression and viewpoint, combined with a variation in a specific facial attribute from the original input. We propose a novel pipeline to produce high-quality, photorealistic 2D videos with facial attribute editing. Leveraging this synthetic attribute dataset, we present a personalized avatar creation method based on the 3D Gaussian Splatting, learning a continuous and disentangled latent space for intuitive facial attribute manipulation. To enforce smooth transitions in this latent space, we introduce a latent space regularization technique by using interpolated 2D faces as supervision. Compared to previous approaches, we demonstrate that PERSE generates high-quality avatars with interpolated attributes while preserving identity of reference person.", 'score': 8, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '334a60a17f9a9477', 'authors': ['Hyunsoo Cha', 'Inhee Lee', 'Hanbyul Joo'], 'affiliations': ['Seoul National University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21206.jpg', 'data': {'categories': ['#3d', '#cv', '#dataset', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Персонализированные аватары с гибким редактированием черт лица', 'desc': 'PERSE - это метод создания анимируемого персонализированного генеративного аватара на основе портрета. Он позволяет редактировать лицевые атрибуты в непрерывном и разделенном латентном пространстве, сохраняя при этом индивидуальность человека. Метод использует синтетические наборы 2D-видео для обучения модели на основе 3D Gaussian Splatting. 
PERSE демонстрирует высокое качество генерации аватаров с интерполированными атрибутами, сохраняя идентичность исходного человека.'}, 'en': {'title': 'Create Your Unique Avatar with PERSE!', 'desc': "PERSE is a novel method for creating personalized generative avatars from a single reference portrait. It allows users to edit facial attributes in a smooth and controlled manner within a continuous latent space, ensuring that the individual's identity remains intact. The approach involves generating large-scale synthetic 2D video datasets that showcase variations in facial expressions and attributes, which are then used to train the avatar model. By employing 3D Gaussian Splatting and a latent space regularization technique, PERSE achieves high-quality, photorealistic avatars with seamless attribute transitions."}, 'zh': {'title': '个性化生成头像的新方法', 'desc': '本文介绍了一种名为PERSE的方法,用于从参考肖像构建可动画的个性化生成头像。该头像模型能够在连续且解耦的潜在空间中编辑面部属性,同时保持个体的身份。我们的方法首先合成大规模的合成2D视频数据集,每个视频包含面部表情和视角的一致变化,并结合原始输入中特定面部属性的变化。通过引入潜在空间正则化技术,我们实现了高质量、逼真的2D视频生成,并在此基础上提出了一种个性化头像创建方法。'}}}, {'id': 'https://huggingface.co/papers/2412.21199', 'title': 'HumanEval Pro and MBPP Pro: Evaluating Large Language Models on Self-invoking Code Generation', 'url': 'https://huggingface.co/papers/2412.21199', 'abstract': "We introduce self-invoking code generation, a new task designed to evaluate the progressive reasoning and problem-solving capabilities of LLMs. In this task, models are presented with a base problem and a related, more complex problem. They must solve the base problem and then utilize its solution to address the more complex one. This work features three key contributions. First, we propose a general recipe for generating more challenging versions of existing benchmarks, resulting in three new benchmarks: HumanEval Pro, MBPP Pro, and BigCodeBench-Lite Pro, specifically designed to assess LLMs on self-invoking code generation. Second, from the analysis of experimental results over twenty LLMs on our benchmarks, we have two important observations: (i) Most LLMs excel in traditional code generation benchmarks like HumanEval and MBPP, but their performance declines on self-invoking tasks. For example, o1-mini achieves 96.2% pass@1 on HumanEval but only 76.2% on HumanEval Pro. (ii) On self-invoking code generation task, the instruction-tuned models demonstrate only marginal improvements compared to the base models. Third, we disclose the types of failure modes that exist in our evaluation results. All these results underscore the need for further advancements in self-invoking code generation tasks and provide a new direction for future research on enhancing LLMs' code reasoning capabilities.", 'score': 6, 'issue_id': 1408, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '9d2cebc8f30f722c', 'authors': ['Zhaojian Yu', 'Yilun Zhao', 'Arman Cohan', 'Xiao-Ping Zhang'], 'affiliations': ['Tsinghua University', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21199.jpg', 'data': {'categories': ['#dataset', '#reasoning', '#training', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Самовызывающийся код: новый рубеж для языковых моделей', 'desc': 'Статья представляет новую задачу для оценки возможностей языковых моделей (LLM) - генерацию самовызывающегося кода. В рамках этой задачи модели должны решить базовую проблему, а затем использовать ее решение для более сложной задачи. Авторы создали три новых бенчмарка: HumanEval Pro, MBPP Pro и BigCodeBench-Lite Pro. 
Эксперименты показали, что большинство LLM хорошо справляются с традиционными задачами генерации кода, но их производительность снижается на самовызывающихся задачах. Результаты подчеркивают необходимость дальнейших исследований в области улучшения способностей LLM к рассуждению при работе с кодом.'}, 'en': {'title': 'Enhancing LLMs: The Challenge of Self-Invoking Code Generation', 'desc': 'This paper introduces a new task called self-invoking code generation, which tests the reasoning and problem-solving skills of large language models (LLMs). In this task, models first solve a simple problem and then use that solution to tackle a more complex one. The authors create three new benchmarks to evaluate LLMs on this task, revealing that while many models perform well on standard code generation tasks, their performance drops significantly on self-invoking tasks. The findings highlight the limitations of current models and suggest that more research is needed to improve their code reasoning abilities.'}, 'zh': {'title': '自调用代码生成:提升LLMs推理能力的新方向', 'desc': '本文介绍了一种新的任务——自调用代码生成,旨在评估大型语言模型(LLMs)的推理和问题解决能力。在这个任务中,模型需要先解决一个基础问题,然后利用其解决方案来处理一个更复杂的问题。研究提出了三项重要贡献,包括生成更具挑战性的基准测试的通用方法,并创建了三个新基准:HumanEval Pro、MBPP Pro和BigCodeBench-Lite Pro。实验结果显示,大多数LLMs在传统代码生成基准上表现良好,但在自调用任务上的表现却有所下降,表明在自调用代码生成任务上仍需进一步的研究和改进。'}}}, {'id': 'https://huggingface.co/papers/2501.07301', 'title': 'The Lessons of Developing Process Reward Models in Mathematical Reasoning', 'url': 'https://huggingface.co/papers/2501.07301', 'abstract': 'Process Reward Models (PRMs) emerge as a promising approach for process supervision in mathematical reasoning of Large Language Models (LLMs), which aim to identify and mitigate intermediate errors in the reasoning processes. However, the development of effective PRMs faces significant challenges, particularly in data annotation and evaluation methodologies. In this paper, through extensive experiments, we demonstrate that commonly used Monte Carlo (MC) estimation-based data synthesis for PRMs typically yields inferior performance and generalization compared to LLM-as-a-judge and human annotation methods. MC estimation relies on completion models to evaluate current-step correctness, leading to inaccurate step verification. Furthermore, we identify potential biases in conventional Best-of-N (BoN) evaluation strategies for PRMs: (1) The unreliable policy models generate responses with correct answers but flawed processes, leading to a misalignment between the evaluation criteria of BoN and the PRM objectives of process verification. (2) The tolerance of PRMs of such responses leads to inflated BoN scores. (3) Existing PRMs have a significant proportion of minimum scores concentrated on the final answer steps, revealing the shift from process to outcome-based assessment in BoN Optimized PRMs. To address these challenges, we develop a consensus filtering mechanism that effectively integrates MC estimation with LLM-as-a-judge and advocates a more comprehensive evaluation framework that combines response-level and step-level metrics. Based on the mechanisms, we significantly improve both model performance and data efficiency in the BoN evaluation and the step-wise error identification task. 
Finally, we release a new state-of-the-art PRM that outperforms existing open-source alternatives and provides practical guidelines for future research in building process supervision models.', 'score': 46, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '98f46bb1e2772efc', 'authors': ['Zhenru Zhang', 'Chujie Zheng', 'Yangzhen Wu', 'Beichen Zhang', 'Runji Lin', 'Bowen Yu', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07301.jpg', 'data': {'categories': ['#math', '#data', '#reasoning', '#benchmark', '#optimization', '#open_source', '#training'], 'emoji': '🧮', 'ru': {'title': 'Усовершенствование Process Reward Models для более точного контроля математических рассуждений', 'desc': 'Статья посвящена Process Reward Models (PRM) для контроля процесса математических рассуждений в больших языковых моделях. Авторы выявили проблемы в существующих методах синтеза данных и оценки PRMs, таких как Monte Carlo и Best-of-N. Они предложили новый механизм фильтрации на основе консенсуса, объединяющий MC-оценку с подходом LLM-as-a-judge. В результате исследователи создали улучшенную PRM, превосходящую существующие open-source альтернативы.'}, 'en': {'title': 'Enhancing Reasoning in LLMs with Process Reward Models', 'desc': 'This paper introduces Process Reward Models (PRMs) as a method to enhance the reasoning capabilities of Large Language Models (LLMs) by identifying and correcting errors in their reasoning processes. The authors highlight the limitations of traditional Monte Carlo estimation methods for data synthesis, which often lead to poor performance in evaluating reasoning steps. They also point out biases in the Best-of-N evaluation strategies that can misalign with the goals of PRMs, particularly in how they assess the correctness of reasoning processes versus final answers. To overcome these issues, the paper proposes a new consensus filtering mechanism that combines different evaluation methods, resulting in improved model performance and more accurate error identification.'}, 'zh': {'title': '提升过程监督模型的有效性', 'desc': '本文探讨了过程奖励模型(PRMs)在大型语言模型(LLMs)数学推理中的应用,旨在识别和减少推理过程中的中间错误。研究表明,传统的基于蒙特卡洛估计的数据合成方法在性能和泛化能力上不如使用LLM作为评判者和人工标注的方法。我们还发现,现有的最佳选择(BoN)评估策略存在偏差,导致评估标准与PRM的过程验证目标不一致。为了解决这些问题,本文提出了一种共识过滤机制,结合了蒙特卡洛估计和LLM评判者,显著提高了模型性能和数据效率。'}}}, {'id': 'https://huggingface.co/papers/2501.06425', 'title': 'Tensor Product Attention Is All You Need', 'url': 'https://huggingface.co/papers/2501.06425', 'abstract': 'Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. 
Through extensive empirical evaluation of language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPA's memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6.', 'score': 35, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'f723487eccf1ccfe', 'authors': ['Yifan Zhang', 'Yifeng Liu', 'Huizhuo Yuan', 'Zhen Qin', 'Yang Yuan', 'Quanquan Gu', 'Andrew Chi-Chih Yao'], 'affiliations': ['IIIS, Tsinghua University', 'Shanghai Qi Zhi Institute', 'TapTap', 'University of California, Los Angeles'], 'pdf_title_img': 'assets/pdf/title_img/2501.06425.jpg', 'data': {'categories': ['#benchmark', '#long_context', '#optimization', '#inference', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное внимание: компактные трансформеры для длинных последовательностей', 'desc': 'В статье представлен новый механизм внимания - Tensor Product Attention (TPA), использующий тензорные разложения для компактного представления запросов, ключей и значений. TPA значительно уменьшает размер кэша ключ-значение при выводе, что повышает эффективность использования памяти. На основе TPA авторы разработали новую архитектуру модели - Tensor ProducT ATTenTion Transformer (T6). Эмпирические исследования показали, что T6 превосходит стандартные базовые модели Transformer по различным метрикам. TPA позволяет обрабатывать значительно более длинные последовательности при фиксированных ресурсах, решая важную проблему масштабируемости современных языковых моделей.'}, 'en': {'title': 'Efficient Attention for Longer Sequences with TPA', 'desc': 'This paper introduces Tensor Product Attention (TPA), a new attention mechanism designed to reduce memory usage during inference in language models. TPA achieves this by using tensor decompositions to compactly represent queries, keys, and values, which allows for smaller key-value caches. The authors present the Tensor ProducT ATTenTion Transformer (T6), a model that integrates TPA and shows improved performance on language modeling tasks compared to traditional Transformer architectures. T6 not only enhances model quality but also enables the processing of longer input sequences efficiently, addressing a key limitation in current language models.'}, 'zh': {'title': '张量乘积注意力:高效处理长序列的创新方案', 'desc': '本文提出了一种新的注意力机制,称为张量乘积注意力(TPA),旨在解决长输入序列处理中的内存开销问题。TPA通过张量分解技术,紧凑地表示查询、键和值,从而显著减少推理时的KV缓存大小。该机制结合了上下文低秩分解和RoPE,提升了模型质量和内存效率。基于TPA,我们还引入了一种新的模型架构——张量乘积注意力变换器(T6),在语言建模任务中表现优于传统的Transformer基线。'}}}, {'id': 'https://huggingface.co/papers/2501.06252', 'title': '$\\text{Transformer}^2$: Self-adaptive LLMs', 'url': 'https://huggingface.co/papers/2501.06252', 'abstract': 'Self-adaptive large language models (LLMs) aim to solve the challenges posed by traditional fine-tuning methods, which are often computationally intensive and static in their ability to handle diverse tasks. We introduce Transformer^2, a novel self-adaptation framework that adapts LLMs for unseen tasks in real-time by selectively adjusting only the singular components of their weight matrices. 
During inference, Transformer^2 employs a two-pass mechanism: first, a dispatch system identifies the task properties, and then task-specific "expert" vectors, trained using reinforcement learning, are dynamically mixed to obtain targeted behavior for the incoming prompt. Our method outperforms ubiquitous approaches such as LoRA, with fewer parameters and greater efficiency. Transformer^2 demonstrates versatility across different LLM architectures and modalities, including vision-language tasks. Transformer^2 represents a significant leap forward, offering a scalable, efficient solution for enhancing the adaptability and task-specific performance of LLMs, paving the way for truly dynamic, self-organizing AI systems.', 'score': 19, 'issue_id': 1651, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '935c31e095aeeec8', 'authors': ['Qi Sun', 'Edoardo Cetin', 'Yujin Tang'], 'affiliations': ['Institute of Science Tokyo, Japan', 'Sakana AI, Japan'], 'pdf_title_img': 'assets/pdf/title_img/2501.06252.jpg', 'data': {'categories': ['#multimodal', '#agi', '#rl', '#optimization', '#training', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Самоадаптация языковых моделей в реальном времени', 'desc': 'Статья представляет новый фреймворк самоадаптации для больших языковых моделей (LLM), который позволяет адаптироваться к новым задачам в реальном времени. Метод использует двухэтапный механизм: сначала определяются свойства задачи, затем применяются специальные векторы экспертов для настройки поведения модели. Подход превосходит традиционные методы вроде LoRA, используя меньше параметров и работая эффективнее. Фреймворк демонстрирует универсальность для разных архитектур LLM и модальностей, включая задачи компьютерного зрения.'}, 'en': {'title': 'Dynamic Adaptation for Language Models', 'desc': "This paper presents a new framework called Transformer^2 that enhances large language models (LLMs) by allowing them to adapt to new tasks in real-time without the heavy computational costs of traditional fine-tuning. Instead of adjusting the entire model, Transformer^2 selectively modifies specific components of the model's weight matrices, making it more efficient. The framework uses a two-step process during inference: first, it identifies the task requirements, and then it combines specialized 'expert' vectors, which are optimized through reinforcement learning, to tailor the model's response. This approach not only improves performance compared to existing methods like LoRA but also works across various LLM architectures and tasks, including those involving both text and images."}, 'zh': {'title': '自适应LLMs:高效应对多样化任务的未来', 'desc': '自适应大型语言模型(LLMs)旨在解决传统微调方法的挑战,这些方法通常计算密集且在处理多样化任务时能力有限。我们介绍了一种新颖的自适应框架Transformer^2,它通过选择性调整权重矩阵的单个组件,实时适应LLMs以应对未见过的任务。在推理过程中,Transformer^2采用双重机制:首先,调度系统识别任务属性,然后动态混合经过强化学习训练的任务特定"专家"向量,以获得针对输入提示的目标行为。我们的研究方法在参数更少且效率更高的情况下,超越了广泛使用的方法,如LoRA,展示了在不同LLM架构和模态(包括视觉-语言任务)中的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.06173', 'title': 'VideoAuteur: Towards Long Narrative Video Generation', 'url': 'https://huggingface.co/papers/2501.06173', 'abstract': 'Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. 
In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation in the cooking domain. We validate the quality of our proposed dataset in terms of visual fidelity and textual caption accuracy using state-of-the-art Vision-Language Models (VLMs) and video generation models, respectively. We further introduce a Long Narrative Video Director to enhance both visual and semantic coherence in generated videos and emphasize the role of aligning visual embeddings to achieve improved overall video quality. Our method demonstrates substantial improvements in generating visually detailed and semantically aligned keyframes, supported by finetuning techniques that integrate text and image embeddings within the video generation process. Project page: https://videoauteur.github.io/', 'score': 18, 'issue_id': 1653, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'e110fbe840c50afa', 'authors': ['Junfei Xiao', 'Feng Cheng', 'Lu Qi', 'Liangke Gui', 'Jiepeng Cen', 'Zhibei Ma', 'Alan Yuille', 'Lu Jiang'], 'affiliations': ['ByteDance', 'ByteDance Seed', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06173.jpg', 'data': {'categories': ['#video', '#story_generation', '#dataset', '#long_context', '#training', '#multimodal', '#alignment'], 'emoji': '🍳', 'ru': {'title': 'Готовим длинные видео: новый подход к генерации нарративного контента', 'desc': 'Статья представляет новый датасет видеороликов о приготовлении пищи для улучшения генерации длинных нарративных видео. Авторы проверяют качество датасета с помощью современных моделей компьютерного зрения и генерации видео. Они также предлагают метод Long Narrative Video Director для повышения визуальной и семантической согласованности генерируемых видео. Результаты показывают значительное улучшение в генерации детализированных и семантически согласованных ключевых кадров.'}, 'en': {'title': 'Enhancing Long-Form Video Generation with Coherent Narratives', 'desc': 'This paper addresses the limitations of current video generation models in creating long, coherent videos, particularly in the cooking domain. It introduces a large-scale dataset specifically designed for generating long-form cooking videos, ensuring high visual quality and accurate textual descriptions. The authors propose a Long Narrative Video Director that improves both the visual and semantic coherence of the generated content by aligning visual embeddings. Their approach shows significant advancements in producing detailed keyframes and enhancing overall video quality through the integration of text and image embeddings.'}, 'zh': {'title': '推动烹饪视频的长篇叙事生成', 'desc': '最近的视频生成模型在生成持续几秒的高质量视频片段方面取得了良好效果。然而,这些模型在生成长序列时面临挑战,难以传达清晰且信息丰富的事件,限制了它们支持连贯叙述的能力。本文提出了一个大规模的烹饪视频数据集,旨在推动烹饪领域的长篇叙事生成。我们引入了一种长叙事视频导演,增强生成视频的视觉和语义一致性,并强调对齐视觉嵌入在提高整体视频质量中的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.07572', 'title': 'WebWalker: Benchmarking LLMs in Web Traversal', 'url': 'https://huggingface.co/papers/2501.07572', 'abstract': "Retrieval-augmented generation (RAG) demonstrates remarkable performance across tasks in open-domain question-answering. However, traditional search engines may retrieve shallow content, limiting the ability of LLMs to handle complex, multi-layered information. To address it, we introduce WebWalkerQA, a benchmark designed to assess the ability of LLMs to perform web traversal. 
It evaluates the capacity of LLMs to traverse a website's subpages to extract high-quality data systematically. We propose WebWalker, which is a multi-agent framework that mimics human-like web navigation through an explore-critic paradigm. Extensive experimental results show that WebWalkerQA is challenging and demonstrates the effectiveness of RAG combined with WebWalker, through the horizontal and vertical integration in real-world scenarios.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '1dd4e60432c1ca54', 'authors': ['Jialong Wu', 'Wenbiao Yin', 'Yong Jiang', 'Zhenglin Wang', 'Zekun Xi', 'Runnan Fang', 'Deyu Zhou', 'Pengjun Xie', 'Fei Huang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07572.jpg', 'data': {'categories': ['#rag', '#reasoning', '#benchmark', '#agi', '#optimization', '#games', '#interpretability', '#agents', '#survey'], 'emoji': '🕸️', 'ru': {'title': 'WebWalker: умная навигация по веб-страницам для улучшения вопросно-ответных систем', 'desc': 'В статье представлен новый подход к решению задач открытого вопросно-ответного поиска - WebWalkerQA. Эта система оценивает способность языковых моделей систематически исследовать подстраницы веб-сайтов для извлечения качественной информации. Авторы предлагают фреймворк WebWalker, использующий мультиагентный подход для имитации человеческой навигации по веб-страницам. Экспериментальные результаты демонстрируют эффективность комбинации RAG и WebWalker в реальных сценариях.'}, 'en': {'title': 'Enhancing LLMs with Human-like Web Navigation for Better Information Retrieval', 'desc': "This paper introduces WebWalkerQA, a benchmark for evaluating large language models (LLMs) in open-domain question-answering tasks. It addresses the limitations of traditional search engines that often retrieve superficial content, which hinders LLMs from accessing complex information. The proposed WebWalker framework uses a multi-agent system that simulates human-like web navigation, allowing LLMs to systematically traverse subpages of a website to gather high-quality data. Experimental results indicate that combining retrieval-augmented generation (RAG) with WebWalker enhances the models' performance in real-world scenarios by enabling deeper information extraction."}, 'zh': {'title': 'WebWalkerQA:提升问答系统的网页导航能力', 'desc': '检索增强生成(RAG)在开放领域问答任务中表现出色,但传统搜索引擎可能只检索到表面内容,限制了大型语言模型(LLMs)处理复杂信息的能力。为了解决这个问题,我们引入了WebWalkerQA,这是一个评估LLMs进行网页遍历能力的基准。它评估LLMs系统性地遍历网站子页面以提取高质量数据的能力。我们提出了WebWalker,这是一个多代理框架,通过探索-评估范式模拟人类的网页导航。'}}}, {'id': 'https://huggingface.co/papers/2501.06458', 'title': 'O1 Replication Journey -- Part 3: Inference-time Scaling for Medical Reasoning', 'url': 'https://huggingface.co/papers/2501.06458', 'abstract': "Building upon our previous investigations of O1 replication (Part 1: Journey Learning [Qin et al., 2024] and Part 2: Distillation [Huang et al., 2024]), this work explores the potential of inference-time scaling in large language models (LLMs) for medical reasoning tasks, ranging from diagnostic decision-making to treatment planning. Through extensive experiments on medical benchmarks of varying complexity (MedQA, Medbullets, and JAMA Clinical Challenges), our investigation reveals several key insights: (1) Increasing inference time does lead to improved performance. With a modest training set of 500 samples, our model yields substantial performance improvements of 6%-11%. 
(2) Task complexity directly correlates with the required length of reasoning chains, confirming the necessity of extended thought processes for challenging problems. (3) The differential diagnoses generated by our model adhere to the principles of the hypothetico-deductive method, producing a list of potential conditions that may explain a patient's symptoms and systematically narrowing these possibilities by evaluating the evidence. These findings demonstrate the promising synergy between inference-time scaling and journey learning in advancing LLMs' real-world clinical reasoning capabilities.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c95817afd181bd85', 'authors': ['Zhongzhen Huang', 'Gui Geng', 'Shengyi Hua', 'Zhen Huang', 'Haoyang Zou', 'Shaoting Zhang', 'Pengfei Liu', 'Xiaofan Zhang'], 'affiliations': ['Generative AI Research Lab (GAIR)', 'SII', 'SPIRAL Lab', 'Shanghai Jiao Tong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06458.jpg', 'data': {'categories': ['#science', '#inference', '#healthcare', '#reasoning'], 'emoji': '🩺', 'ru': {'title': 'Масштабирование времени вывода LLM улучшает медицинские рассуждения', 'desc': 'Данная работа исследует потенциал масштабирования времени вывода в больших языковых моделях (LLM) для задач медицинского рассуждения. Эксперименты на медицинских бенчмарках показали, что увеличение времени вывода приводит к улучшению производительности модели. Сложность задачи напрямую коррелирует с необходимой длиной цепочек рассуждений. Дифференциальные диагнозы, генерируемые моделью, соответствуют принципам гипотетико-дедуктивного метода.'}, 'en': {'title': 'Enhancing Medical Reasoning in LLMs through Inference-Time Scaling', 'desc': "This paper investigates how increasing inference time can enhance the performance of large language models (LLMs) in medical reasoning tasks. The authors conducted experiments on various medical benchmarks and found that longer inference times lead to significant performance improvements, even with a small training dataset. They also discovered that more complex tasks require longer reasoning chains, highlighting the importance of extended thought processes. Additionally, the model's differential diagnoses align with the hypothetico-deductive method, showcasing its ability to systematically evaluate potential conditions based on patient symptoms."}, 'zh': {'title': '推理时间扩展助力医学推理能力提升', 'desc': '本研究基于我们之前对O1复制的研究,探讨了在大型语言模型(LLMs)中推理时间扩展对医学推理任务的潜力。通过在不同复杂度的医学基准(如MedQA、Medbullets和JAMA临床挑战)上进行广泛实验,我们发现增加推理时间确实能提高模型性能,尤其是在仅有500个样本的训练集上,性能提升可达6%-11%。此外,任务的复杂性与所需推理链的长度直接相关,表明对于复杂问题需要更长的思考过程。最后,我们的模型生成的差异性诊断遵循假设演绎法的原则,系统地评估证据以缩小可能的病症范围。'}}}, {'id': 'https://huggingface.co/papers/2501.06282', 'title': 'MinMo: A Multimodal Large Language Model for Seamless Voice Interaction', 'url': 'https://huggingface.co/papers/2501.06282', 'abstract': 'Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. 
In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo supports controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, full-duplex latency is approximately 600ms in theory and 800ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.', 'score': 13, 'issue_id': 1651, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '2bd352453760208e', 'authors': ['Qian Chen', 'Yafeng Chen', 'Yanni Chen', 'Mengzhe Chen', 'Yingda Chen', 'Chong Deng', 'Zhihao Du', 'Ruize Gao', 'Changfeng Gao', 'Zhifu Gao', 'Yabin Li', 'Xiang Lv', 'Jiaqing Liu', 'Haoneng Luo', 'Bin Ma', 'Chongjia Ni', 'Xian Shi', 'Jialong Tang', 'Hui Wang', 'Hao Wang', 'Wen Wang', 'Yuxuan Wang', 'Yunlan Xu', 'Fan Yu', 'Zhijie Yan', 'Yexin Yang', 'Baosong Yang', 'Xian Yang', 'Guanrou Yang', 'Tianyu Zhao', 'Qinglin Zhang', 'Shiliang Zhang', 'Nan Zhao', 'Pei Zhang', 'Chong Zhang', 'Jinren Zhou'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.06282.jpg', 'data': {'categories': ['#audio', '#multimodal', '#training'], 'emoji': '🗣️', 'ru': {'title': 'MinMo: революция в голосовом ИИ-взаимодействии', 'desc': 'Статья представляет MinMo - мультимодальную большую языковую модель для беспрепятственного голосового взаимодействия. Модель обучена на 1,4 миллионах часов разнообразных речевых данных и широком спектре речевых задач через несколько этапов выравнивания речи и текста. MinMo достигает передовых результатов в понимании и генерации речи, сохраняя при этом возможности текстовых ЯБМ. Модель также поддерживает полнодуплексное общение и управляемую генерацию речи с различными нюансами, включая эмоции, диалекты и темп речи.'}, 'en': {'title': 'MinMo: Revolutionizing Voice Interactions with Multimodal Learning', 'desc': 'This paper presents MinMo, a Multimodal Large Language Model designed for seamless voice interactions, featuring around 8 billion parameters. It overcomes limitations of previous aligned models by employing a multi-stage training approach that includes speech-to-text, text-to-speech, and duplex interaction alignments, utilizing a vast dataset of 1.4 million hours of diverse speech. MinMo achieves state-of-the-art performance in voice comprehension and generation, enabling full-duplex conversations and enhanced instruction-following capabilities for nuanced speech generation. 
Additionally, it introduces a novel voice decoder that significantly improves voice generation quality compared to earlier models.'}, 'zh': {'title': 'MinMo:无缝语音交互的新突破', 'desc': '本文介绍了一种名为MinMo的多模态大型语言模型,旨在实现无缝的语音交互。MinMo具有约80亿个参数,通过多阶段的对齐训练,克服了以往模型在语音理解和生成方面的局限性。该模型能够支持全双工对话,允许用户与系统进行实时的双向交流。MinMo还具备根据用户指令生成语音的能力,能够调整情感、方言和语速等细节。'}}}, {'id': 'https://huggingface.co/papers/2501.06842', 'title': 'SPAM: Spike-Aware Adam with Momentum Reset for Stable LLM Training', 'url': 'https://huggingface.co/papers/2501.06842', 'abstract': 'Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and experiment restarts, further amplifying inefficiencies. This paper presents a comprehensive investigation into gradient spikes observed during LLM training, revealing their prevalence across multiple architectures and datasets. Our analysis shows that these spikes can be up to 1000 times larger than typical gradients, substantially deteriorating model performance. To address this issue, we propose Spike-Aware Adam with Momentum Reset (SPAM), a novel optimizer designed to counteract gradient spikes through momentum reset and spike-aware gradient clipping. Extensive experiments, including both pre-training and fine-tuning, demonstrate that SPAM consistently surpasses Adam and its variants across various tasks, including (1) LLM pre-training from 60M to 1B, (2) 4-bit LLM pre-training, (3) reinforcement learning, and (4) Time Series Forecasting. Additionally, SPAM facilitates memory-efficient training by enabling sparse momentum, where only a subset of momentum terms are maintained and updated. When operating under memory constraints, SPAM outperforms state-of-the-art memory-efficient optimizers such as GaLore and Adam-Mini. Our work underscores the importance of mitigating gradient spikes in LLM training and introduces an effective optimization strategy that enhances both training stability and resource efficiency at scale. Code is available at https://github.com/TianjinYellow/SPAM-Optimizer.git', 'score': 10, 'issue_id': 1658, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': 'd5fec659e34cf867', 'authors': ['Tianjin Huang', 'Ziquan Zhu', 'Gaojie Jin', 'Lu Liu', 'Zhangyang Wang', 'Shiwei Liu'], 'affiliations': ['Eindhoven University of Technology', 'University of Exeter', 'University of Leicester', 'University of Oxford', 'University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.06842.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '📈', 'ru': {'title': 'SPAM: Стабильное и эффективное обучение языковых моделей', 'desc': 'Исследователи представили новый оптимизатор SPAM (Spike-Aware Adam with Momentum Reset) для обучения больших языковых моделей (LLM). SPAM предназначен для решения проблемы резких скачков градиентов, которые могут быть в 1000 раз больше обычных и нарушают процесс обучения. Оптимизатор использует сброс импульса и адаптивное ограничение градиента для противодействия этим скачкам. 
Эксперименты показали, что SPAM превосходит Adam и его варианты в различных задачах, включая предобучение LLM, обучение с подкреплением и прогнозирование временных рядов.'}, 'en': {'title': 'Taming Gradient Spikes for Stable LLM Training with SPAM', 'desc': 'This paper investigates the issue of gradient spikes during the training of Large Language Models (LLMs), which can lead to instability and inefficiencies. These spikes can be significantly larger than normal gradients, negatively impacting model performance and requiring costly interventions. To combat this problem, the authors propose a new optimizer called Spike-Aware Adam with Momentum Reset (SPAM), which incorporates momentum reset and spike-aware gradient clipping. Experimental results show that SPAM outperforms traditional optimizers like Adam in various tasks while also being more memory-efficient.'}, 'zh': {'title': '应对梯度波动,提升训练稳定性!', 'desc': '大型语言模型(LLMs)在多种任务中表现出色,但其训练过程资源消耗大且容易出现不稳定性。研究发现,梯度和损失的剧烈波动是导致训练不稳定的主要原因,这会影响学习过程并增加干预成本。本文提出了一种新型优化器——Spike-Aware Adam with Momentum Reset(SPAM),旨在通过动量重置和梯度剪切来应对梯度波动。实验结果表明,SPAM在多种任务中均优于传统的Adam优化器,显著提高了训练的稳定性和资源效率。'}}}, {'id': 'https://huggingface.co/papers/2501.07574', 'title': 'UnCommon Objects in 3D', 'url': 'https://huggingface.co/papers/2501.07574', 'abstract': 'We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for 3D deep learning and 3D generative AI. uCO3D is the largest publicly-available collection of high-resolution videos of objects with 3D annotations that ensures full-360° coverage. uCO3D is significantly more diverse than MVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of higher quality, due to extensive quality checks of both the collected videos and the 3D annotations. Similar to analogous datasets, uCO3D contains annotations for 3D camera poses, depth maps and sparse point clouds. In addition, each object is equipped with a caption and a 3D Gaussian Splat reconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D and obtain superior results using the latter, showing that uCO3D is better for learning applications.', 'score': 7, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '79c40f6997052ddd', 'authors': ['Xingchen Liu', 'Piyush Tayal', 'Jianyuan Wang', 'Jesus Zarzar', 'Tom Monnier', 'Konstantinos Tertikas', 'Jiali Duan', 'Antoine Toisoul', 'Jason Y. Zhang', 'Natalia Neverova', 'Andrea Vedaldi', 'Roman Shapovalov', 'David Novotny'], 'affiliations': ['Carnegie Mellon University', 'KAUST', 'Meta AI', 'NKUA, Greece'], 'pdf_title_img': 'assets/pdf/title_img/2501.07574.jpg', 'data': {'categories': ['#dataset', '#open_source', '#synthetic', '#3d'], 'emoji': '🔍', 'ru': {'title': 'uCO3D: Новый стандарт для 3D-данных в машинном обучении', 'desc': 'Авторы представляют новый набор данных uCO3D для глубокого обучения и генеративного ИИ в 3D. Этот датасет содержит высококачественные видео объектов с полным 360-градусным охватом и 3D-аннотациями. uCO3D превосходит аналоги по разнообразию, охватывая более 1000 категорий объектов, и качеству благодаря тщательным проверкам. Помимо стандартных аннотаций, датасет включает подписи к объектам и 3D-реконструкции на основе гауссовых сплатов.'}, 'en': {'title': 'Unlocking 3D Learning with uCO3D: A New Era of Object-Centric Datasets', 'desc': 'The paper presents Uncommon Objects in 3D (uCO3D), a comprehensive dataset designed for advancing 3D deep learning and generative AI. 
This dataset features high-resolution videos with full 360-degree coverage and includes over 1,000 diverse object categories, making it larger and more varied than existing datasets like MVImgNet and CO3Dv2. uCO3D provides detailed annotations such as 3D camera poses, depth maps, and sparse point clouds, along with captions and 3D Gaussian Splat reconstructions for each object. Experiments demonstrate that training large 3D models on uCO3D yields superior performance compared to other datasets, highlighting its effectiveness for learning applications.'}, 'zh': {'title': 'uCO3D:提升3D学习的全新数据集', 'desc': '我们介绍了一个新的3D深度学习和生成AI数据集,名为Uncommon Objects in 3D(uCO3D)。uCO3D是一个公开可用的高分辨率视频集合,包含360度的3D注释,涵盖超过1000个物体类别,具有更高的多样性和质量。该数据集提供了3D相机姿态、深度图和稀疏点云的注释,并为每个物体配备了描述和3D高斯点云重建。通过在多个数据集上训练大型3D模型,我们发现uCO3D在学习应用中表现更优。'}}}, {'id': 'https://huggingface.co/papers/2501.07171', 'title': 'BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and Vision-Language Models Derived from Scientific Literature', 'url': 'https://huggingface.co/papers/2501.07171', 'abstract': 'The development of vision-language models (VLMs) is driven by large-scale and diverse multimodal datasets. However, progress toward generalist biomedical VLMs is limited by the lack of annotated, publicly accessible datasets across biology and medicine. Existing efforts are restricted to narrow domains, missing the full diversity of biomedical knowledge encoded in scientific literature. To address this gap, we introduce BIOMEDICA, a scalable, open-source framework to extract, annotate, and serialize the entirety of the PubMed Central Open Access subset into an easy-to-use, publicly accessible dataset. Our framework produces a comprehensive archive with over 24 million unique image-text pairs from over 6 million articles. Metadata and expert-guided annotations are also provided. We demonstrate the utility and accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style models continuously pre-trained on the BIOMEDICA dataset via streaming, eliminating the need to download 27 TB of data locally. On average, our models achieve state-of-the-art performance across 40 tasks - spanning pathology, radiology, ophthalmology, dermatology, surgery, molecular biology, parasitology, and cell biology - excelling in zero-shot classification with a 6.56% average improvement (as high as 29.8% and 17.5% in dermatology and ophthalmology, respectively), and stronger image-text retrieval, all while using 10x less compute. 
To foster reproducibility and collaboration, we release our codebase and dataset for the broader research community.', 'score': 3, 'issue_id': 1656, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '07db2230e08b0fde', 'authors': ['Alejandro Lozano', 'Min Woo Sun', 'James Burgess', 'Liangyu Chen', 'Jeffrey J Nirschl', 'Jeffrey Gu', 'Ivan Lopez', 'Josiah Aklilu', 'Austin Wolfgang Katzer', 'Collin Chiu', 'Anita Rau', 'Xiaohan Wang', 'Yuhui Zhang', 'Alfred Seunghoon Song', 'Robert Tibshirani', 'Serena Yeung-Levy'], 'affiliations': ['Department of Biomedical Data Science, Stanford University', 'Department of Computer Science, Stanford University', 'Department of Electrical Engineering, Stanford University', 'Department of Pathology, Stanford University', 'Department of Statistics, Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07171.jpg', 'data': {'categories': ['#healthcare', '#cv', '#dataset', '#science', '#multimodal', '#open_source'], 'emoji': '🧬', 'ru': {'title': 'BIOMEDICA: Прорыв в обработке биомедицинских данных с помощью ИИ', 'desc': 'Статья представляет BIOMEDICA - масштабируемый фреймворк с открытым исходным кодом для извлечения и аннотирования биомедицинских данных из научной литературы. Фреймворк создал обширный архив из более чем 24 миллионов уникальных пар изображение-текст из более 6 миллионов статей. На основе этого датасета были обучены модели BMCA-CLIP, достигшие state-of-the-art результатов в 40 биомедицинских задачах. Модели показали значительное улучшение в zero-shot классификации и поиске изображений по тексту при использовании в 10 раз меньших вычислительных ресурсов.'}, 'en': {'title': 'Unlocking Biomedical Knowledge with BIOMEDICA', 'desc': 'This paper presents BIOMEDICA, a new framework designed to create a large, open-source dataset from the PubMed Central Open Access subset, which includes over 24 million image-text pairs from scientific articles. The framework addresses the challenge of limited annotated datasets in the biomedical field, enabling the development of generalist vision-language models (VLMs) that can understand diverse biomedical knowledge. The authors also introduce BMCA-CLIP, a set of models that are continuously pre-trained on this dataset, achieving state-of-the-art performance across various medical tasks with significant improvements in zero-shot classification and image-text retrieval. By making their codebase and dataset publicly available, they aim to enhance reproducibility and collaboration in biomedical research.'}, 'zh': {'title': '推动生物医学领域的视觉语言模型发展', 'desc': '本文介绍了BIOMEDICA,一个可扩展的开源框架,用于提取、注释和序列化PubMed Central开放获取子集的全部内容。该框架生成了一个包含超过2400万个独特图像-文本对的综合档案,来自超过600万篇文章。我们还提供了元数据和专家指导的注释,并展示了BMCA-CLIP模型在40个医学任务中的优越性能,尤其在零样本分类和图像-文本检索方面表现突出。通过发布代码库和数据集,我们促进了研究的可重复性和合作。'}}}, {'id': 'https://huggingface.co/papers/2501.06590', 'title': 'ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning', 'url': 'https://huggingface.co/papers/2501.06590', 'abstract': 'Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. 
To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent', 'score': 3, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c217e826245ef357', 'authors': ['Xiangru Tang', 'Tianyu Hu', 'Muyang Ye', 'Yanjun Shao', 'Xunjian Yin', 'Siru Ouyang', 'Wangchunshu Zhou', 'Pan Lu', 'Zhuosheng Zhang', 'Yilun Zhao', 'Arman Cohan', 'Mark Gerstein'], 'affiliations': ['Shanghai Jiao Tong University', 'Stanford University', 'UIUC', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06590.jpg', 'data': {'categories': ['#science', '#reasoning', '#multimodal', '#agents', '#dataset'], 'emoji': '🧪', 'ru': {'title': 'ChemAgent: Умный помощник для LLM в химических задачах', 'desc': 'ChemAgent - это новая система, улучшающая работу больших языковых моделей (LLM) в задачах химического рассуждения. Она использует динамически обновляемую библиотеку, созданную путем декомпозиции химических задач на подзадачи. При решении новых проблем ChemAgent извлекает и уточняет релевантную информацию из библиотеки, что позволяет эффективно декомпозировать задачи и генерировать решения. Система показала значительное превосходство над существующими методами, улучшив производительность LLM до 46% на четырех наборах данных по химическому рассуждению.'}, 'en': {'title': 'Empowering LLMs for Chemical Reasoning with ChemAgent', 'desc': 'This paper introduces ChemAgent, a new framework that enhances large language models (LLMs) for chemical reasoning tasks. It addresses the challenges LLMs face with complex chemical calculations and domain-specific formulas by creating a dynamic library of decomposed sub-tasks. ChemAgent retrieves and refines relevant information from this library, allowing for better task decomposition and solution generation. 
Experimental results show that ChemAgent significantly improves performance on chemical reasoning datasets, indicating its potential for applications in drug discovery and materials science.'}, 'zh': {'title': 'ChemAgent:提升化学推理的智能助手', 'desc': '化学推理通常涉及复杂的多步骤过程,需要精确的计算,哪怕是微小的错误也可能导致严重的后果。大型语言模型(LLMs)在处理特定领域的公式、准确执行推理步骤和有效整合代码时面临困难。为了解决这些问题,我们提出了ChemAgent,一个通过动态自更新库来提升LLMs性能的新框架。该框架通过将化学任务分解为子任务,并将这些子任务编译成结构化的集合,以便在未来查询时参考,从而实现有效的任务分解和解决方案生成。'}}}, {'id': 'https://huggingface.co/papers/2501.06708', 'title': 'Evaluating Sample Utility for Data Selection by Mimicking Model Weights', 'url': 'https://huggingface.co/papers/2501.06708', 'abstract': "Foundation models rely on large-scale web-crawled datasets, which frequently contain noisy data, biases, and irrelevant content. Existing data selection techniques typically use human heuristics, downstream evaluation datasets, or specialized scoring models, and can overlook samples' utility in the training process. Instead, we propose a new approach, Mimic Score, a data quality metric that uses a pretrained reference model as a guide to assess the usefulness of data samples for training a new model. It relies on the alignment between the gradient of the new model parameters and the vector pointing toward the reference model in weight space. Samples that misalign with this direction are considered low-value and can be filtered out. Motivated by the Mimic score, we develop Grad-Mimic, a data selection framework that identifies and prioritizes useful samples, automating the selection process to create effective filters. Empirically, using Mimic scores to guide model training results in consistent performance gains across six image datasets and enhances the performance of CLIP models. Moreover, Mimic scores and their associated filters improve upon existing filtering methods and offer accurate estimation of dataset quality.", 'score': 2, 'issue_id': 1661, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '7560c17a0e1b7234', 'authors': ['Tzu-Heng Huang', 'Manjot Bilkhu', 'Frederic Sala', 'Javier Movellan'], 'affiliations': ['Apple Inc.', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.06708.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#ethics', '#training'], 'emoji': '🧠', 'ru': {'title': 'Умный отбор данных для эффективного обучения моделей', 'desc': 'Предложен новый подход к оценке качества данных для обучения моделей машинного обучения - Mimic Score. Этот метод использует предобученную эталонную модель для оценки полезности образцов данных, анализируя выравнивание градиента параметров новой модели с вектором, указывающим на эталонную модель в пространстве весов. На основе Mimic Score разработан фреймворк Grad-Mimic для автоматизированного отбора полезных образцов данных. Эксперименты показали, что использование Mimic Score приводит к улучшению производительности моделей на нескольких наборах данных изображений и моделей CLIP.'}, 'en': {'title': 'Enhancing Data Selection with Mimic Score for Better Model Training', 'desc': 'This paper introduces a new method called Mimic Score to improve data selection for training foundation models. It uses a pretrained reference model to evaluate the usefulness of data samples by analyzing the alignment of gradients in weight space. Samples that do not align well with the reference model are deemed low-value and can be removed from the training dataset. 
The proposed Grad-Mimic framework automates this selection process, leading to better model performance across various image datasets and outperforming existing data filtering techniques.'}, 'zh': {'title': 'Mimic Score:提升数据选择的新方法', 'desc': '基础模型依赖于大规模的网络爬取数据集,这些数据集常常包含噪声数据、偏见和无关内容。现有的数据选择技术通常使用人工启发式方法、下游评估数据集或专门的评分模型,可能会忽视样本在训练过程中的实用性。我们提出了一种新的方法,称为Mimic Score,这是一种数据质量指标,利用预训练的参考模型来评估数据样本对新模型训练的有用性。基于Mimic Score,我们开发了Grad-Mimic数据选择框架,自动识别和优先选择有用样本,从而提高模型训练的效果。'}}}, {'id': 'https://huggingface.co/papers/2501.03262', 'title': 'REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models', 'url': 'https://huggingface.co/papers/2501.03262', 'abstract': 'Reinforcement Learning from Human Feedback (RLHF) has emerged as a critical approach for aligning large language models with human preferences, witnessing rapid algorithmic evolution through methods such as Proximal Policy Optimization (PPO), Direct Preference Optimization (DPO), REINFORCE Leave One-Out (RLOO), ReMax, and Group Relative Policy Optimization (GRPO). We present REINFORCE++, an enhanced variant of the classical REINFORCE algorithm that incorporates key optimization techniques from PPO while eliminating the need for a critic network. REINFORCE++ achieves three primary objectives: (1) simplicity (2) enhanced training stability, and (3) reduced computational overhead. Through extensive empirical evaluation, we demonstrate that REINFORCE++ exhibits superior stability compared to GRPO and achieves greater computational efficiency than PPO while maintaining comparable performance. The implementation is available at https://github.com/OpenRLHF/OpenRLHF.', 'score': 42, 'issue_id': 1553, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a05acf5aab0c07dd', 'authors': ['Jian Hu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.03262.jpg', 'data': {'categories': ['#training', '#rlhf', '#optimization', '#alignment'], 'emoji': '🤖', 'ru': {'title': 'REINFORCE++: Простой и эффективный алгоритм для RLHF', 'desc': 'В статье представлен REINFORCE++, улучшенная версия алгоритма REINFORCE для обучения с подкреплением на основе обратной связи от человека (RLHF). REINFORCE++ сочетает ключевые техники оптимизации из PPO, но не требует использования критической нейронной сети. Алгоритм отличается простотой, повышенной стабильностью обучения и сниженными вычислительными затратами. Эмпирические исследования показывают, что REINFORCE++ демонстрирует лучшую стабильность по сравнению с GRPO и большую вычислительную эффективность, чем PPO, при сохранении сопоставимой производительности.'}, 'en': {'title': 'REINFORCE++: Simplifying Reinforcement Learning with Human Feedback', 'desc': 'This paper introduces REINFORCE++, a new version of the REINFORCE algorithm designed to improve the training of reinforcement learning models using human feedback. It combines the strengths of Proximal Policy Optimization (PPO) while removing the need for a critic network, making it simpler and more efficient. The authors highlight that REINFORCE++ offers better training stability and lower computational costs compared to existing methods like GRPO and PPO. 
Their experiments show that REINFORCE++ performs well while being easier to use and faster to train.'}, 'zh': {'title': 'REINFORCE++:简化与高效的强化学习新选择', 'desc': '强化学习中的人类反馈(RLHF)是一种重要的方法,用于使大型语言模型更符合人类的偏好。本文提出了REINFORCE++,这是经典REINFORCE算法的增强版本,结合了PPO的优化技术,并且不再需要评论网络。REINFORCE++的主要目标是实现简单性、提高训练稳定性和减少计算开销。通过大量实证评估,我们证明了REINFORCE++在稳定性上优于GRPO,并且在计算效率上超过PPO,同时保持了相似的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02955', 'title': 'MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models', 'url': 'https://huggingface.co/papers/2501.02955', 'abstract': "In recent years, vision language models (VLMs) have made significant advancements in video understanding. However, a crucial capability - fine-grained motion comprehension - remains under-explored in current benchmarks. To address this gap, we propose MotionBench, a comprehensive evaluation benchmark designed to assess the fine-grained motion comprehension of video understanding models. MotionBench evaluates models' motion-level perception through six primary categories of motion-oriented question types and includes data collected from diverse sources, ensuring a broad representation of real-world video content. Experimental results reveal that existing VLMs perform poorly in understanding fine-grained motions. To enhance VLM's ability to perceive fine-grained motion within a limited sequence length of LLM, we conduct extensive experiments reviewing VLM architectures optimized for video feature compression and propose a novel and efficient Through-Encoder (TE) Fusion method. Experiments show that higher frame rate inputs and TE Fusion yield improvements in motion understanding, yet there is still substantial room for enhancement. Our benchmark aims to guide and motivate the development of more capable video understanding models, emphasizing the importance of fine-grained motion comprehension. Project page: https://motion-bench.github.io .", 'score': 30, 'issue_id': 1551, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'a7051c2d239484b4', 'authors': ['Wenyi Hong', 'Yean Cheng', 'Zhuoyi Yang', 'Weihan Wang', 'Lefan Wang', 'Xiaotao Gu', 'Shiyu Huang', 'Yuxiao Dong', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.02955.jpg', 'data': {'categories': ['#architecture', '#optimization', '#benchmark', '#video'], 'emoji': '🎥', 'ru': {'title': 'MotionBench: новый рубеж в понимании движения для моделей компьютерного зрения', 'desc': 'Статья представляет новый бенчмарк MotionBench для оценки способности моделей компьютерного зрения понимать детальные движения в видео. Авторы обнаружили, что существующие модели плохо справляются с этой задачей. Для улучшения результатов предложен новый метод Through-Encoder Fusion, а также использование видео с более высокой частотой кадров. Бенчмарк призван стимулировать развитие более совершенных моделей понимания видео.'}, 'en': {'title': 'Enhancing Video Understanding with Fine-Grained Motion Comprehension', 'desc': "This paper introduces MotionBench, a new benchmark for evaluating how well vision language models (VLMs) understand fine-grained motion in videos. It identifies a gap in current models' abilities to comprehend detailed motion, which is crucial for accurate video analysis. The benchmark includes various motion-oriented question types and diverse video data to ensure comprehensive testing. 
The authors also propose a Through-Encoder Fusion method to improve VLM performance, highlighting the need for further advancements in fine-grained motion comprehension."}, 'zh': {'title': '提升视频理解的细粒度运动能力', 'desc': '近年来,视觉语言模型(VLMs)在视频理解方面取得了显著进展。然而,细粒度运动理解这一关键能力在当前基准测试中仍未得到充分探索。为了解决这一问题,我们提出了MotionBench,这是一个全面的评估基准,旨在评估视频理解模型的细粒度运动理解能力。实验结果表明,现有的VLM在理解细粒度运动方面表现不佳,因此我们提出了一种新颖的Through-Encoder(TE)融合方法,以提高模型的运动理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.03575', 'title': 'Cosmos World Foundation Model Platform for Physical AI', 'url': 'https://huggingface.co/papers/2501.03575', 'abstract': 'Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.', 'score': 25, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'f4b2044cbc1076a8', 'authors': ['NVIDIA', ':', 'Niket Agarwal', 'Arslan Ali', 'Maciej Bala', 'Yogesh Balaji', 'Erik Barker', 'Tiffany Cai', 'Prithvijit Chattopadhyay', 'Yongxin Chen', 'Yin Cui', 'Yifan Ding', 'Daniel Dworakowski', 'Jiaojiao Fan', 'Michele Fenzi', 'Francesco Ferroni', 'Sanja Fidler', 'Dieter Fox', 'Songwei Ge', 'Yunhao Ge', 'Jinwei Gu', 'Siddharth Gururani', 'Ethan He', 'Jiahui Huang', 'Jacob Huffman', 'Pooya Jannaty', 'Jingyi Jin', 'Seung Wook Kim', 'Gergely Klár', 'Grace Lam', 'Shiyi Lan', 'Laura Leal-Taixe', 'Anqi Li', 'Zhaoshuo Li', 'Chen-Hsuan Lin', 'Tsung-Yi Lin', 'Huan Ling', 'Ming-Yu Liu', 'Xian Liu', 'Alice Luo', 'Qianli Ma', 'Hanzi Mao', 'Kaichun Mo', 'Arsalan Mousavian', 'Seungjun Nah', 'Sriharsha Niverty', 'David Page', 'Despoina Paschalidou', 'Zeeshan Patel', 'Lindsey Pavao', 'Morteza Ramezanali', 'Fitsum Reda', 'Xiaowei Ren', 'Vasanth Rao Naik Sabavat', 'Ed Schmerling', 'Stella Shi', 'Bartosz Stefaniak', 'Shitao Tang', 'Lyne Tchapmi', 'Przemek Tredak', 'Wei-Cheng Tseng', 'Jibin Varghese', 'Hao Wang', 'Haoxiang Wang', 'Heng Wang', 'Ting-Chun Wang', 'Fangyin Wei', 'Xinyue Wei', 'Jay Zhangjie Wu', 'Jiashu Xu', 'Wei Yang', 'Lin Yen-Chen', 'Xiaohui Zeng', 'Yu Zeng', 'Jing Zhang', 'Qinsheng Zhang', 'Yuxuan Zhang', 'Qingqing Zhao', 'Artur Zolkowski'], 'affiliations': ['NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03575.jpg', 'data': {'categories': ['#open_source', '#data', '#benchmark', '#architecture', '#video', '#multimodal', '#dataset', '#training'], 'emoji': '🌍', 'ru': {'title': 'Цифровой двойник мира для обучения физического ИИ', 'desc': 'Статья представляет платформу Cosmos World Foundation Model для разработки моделей мира в физическом ИИ. Авторы предлагают концепцию базовой модели мира, которую можно дообучать для конкретных приложений. Платформа включает конвейер курации видео, предобученные базовые модели мира, примеры дообучения и токенизаторы видео. 
Проект открытый и доступен на GitHub для помощи разработчикам физического ИИ в решении важных проблем общества.'}, 'en': {'title': 'Empowering Physical AI with Customizable World Models', 'desc': 'This paper introduces the Cosmos World Foundation Model Platform, designed to assist developers in creating tailored world models for Physical AI systems. It emphasizes the necessity of having a digital twin of both the AI and its environment to enable effective training. The platform includes a comprehensive video curation pipeline, pre-trained models, and tools for fine-tuning these models for specific applications. By making the platform and models open-source, the authors aim to empower developers to address significant societal challenges using Physical AI.'}, 'zh': {'title': '构建物理AI的数字双胞胎与世界模型', 'desc': '这篇论文介绍了物理人工智能(Physical AI)在数字训练中的重要性。为了实现这一目标,需要构建一个数字双胞胎(digital twin)和一个世界模型(world model)。我们提出了Cosmos世界基础模型平台,帮助开发者为物理人工智能定制世界模型。该平台提供了视频策划管道、预训练的世界基础模型以及后训练示例,旨在解决社会中的关键问题,并且是开源的。'}}}, {'id': 'https://huggingface.co/papers/2501.03895', 'title': 'LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One Vision Token', 'url': 'https://huggingface.co/papers/2501.03895', 'abstract': 'The advent of real-time large multimodal models (LMMs) like GPT-4o has sparked considerable interest in efficient LMMs. LMM frameworks typically encode visual inputs into vision tokens (continuous representations) and integrate them and textual instructions into the context of large language models (LLMs), where large-scale parameters and numerous context tokens (predominantly vision tokens) result in substantial computational overhead. Previous efforts towards efficient LMMs always focus on replacing the LLM backbone with smaller models, while neglecting the crucial issue of token quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal vision tokens. To achieve a high compression ratio of vision tokens while preserving visual information, we first analyze how LMMs understand vision tokens and find that most vision tokens only play a crucial role in the early layers of LLM backbone, where they mainly fuse visual information into text tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to fuse visual information into text tokens in advance, thereby facilitating the extreme compression of vision tokens fed to LLM backbone into one token. LLaVA-Mini is a unified large multimodal model that can support the understanding of images, high-resolution images, and videos in an efficient manner. Experiments across 11 image-based and 7 video-based benchmarks demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token instead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by 77%, deliver low-latency responses within 40 milliseconds, and process over 10,000 frames of video on the GPU hardware with 24GB of memory.', 'score': 19, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '925d2f81d6fcbb0b', 'authors': ['Shaolei Zhang', 'Qingkai Fang', 'Zhe Yang', 'Yang Feng'], 'affiliations': ['Key Laboratory of AI Safety, Chinese Academy of Sciences', 'Key Laboratory of Intelligent Information Processing, Institute of Computing Technology, Chinese Academy of Sciences (ICT/CAS)', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03895.jpg', 'data': {'categories': ['#agi', '#video', '#multimodal', '#architecture', '#optimization', '#cv', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Эффективность через минимизацию: революция в мультимодальных моделях', 'desc': 'Статья представляет LLaVA-Mini - эффективную мультимодальную модель с минимальным количеством визуальных токенов. Авторы обнаружили, что большинство визуальных токенов играют ключевую роль только в ранних слоях языковой модели. LLaVA-Mini вводит предварительное слияние модальностей, чтобы объединить визуальную информацию с текстовыми токенами заранее. Эксперименты показывают, что LLaVA-Mini превосходит LLaVA-v1.5, используя всего 1 визуальный токен вместо 576, что значительно повышает эффективность обработки.'}, 'en': {'title': 'Maximizing Efficiency with Minimal Vision Tokens in LMMs', 'desc': 'This paper presents LLaVA-Mini, an efficient large multimodal model (LMM) designed to reduce the number of vision tokens while maintaining visual information integrity. The authors identify that most vision tokens are primarily important in the early layers of the language model, where they integrate visual data with text. By implementing a technique called modality pre-fusion, LLaVA-Mini compresses the input from 576 vision tokens to just one, significantly enhancing efficiency. Experimental results show that LLaVA-Mini not only outperforms its predecessor but also achieves a 77% reduction in computational load and rapid processing times for high-resolution images and videos.'}, 'zh': {'title': '高效多模态模型LLaVA-Mini的创新之路', 'desc': '本文介绍了一种高效的多模态模型LLaVA-Mini,该模型通过减少视觉标记的数量来提高效率。研究发现,大多数视觉标记在大型语言模型的早期层中起着关键作用,因此可以在此之前将视觉信息与文本标记融合。LLaVA-Mini采用了模态预融合的方法,将视觉信息提前融合,从而将输入到语言模型的视觉标记压缩为一个标记。实验结果表明,LLaVA-Mini在多个基准测试中表现优于之前的模型,且显著降低了计算复杂度和延迟。'}}}, {'id': 'https://huggingface.co/papers/2501.04001', 'title': 'Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos', 'url': 'https://huggingface.co/papers/2501.04001', 'abstract': 'This work presents Sa2VA, the first unified model for dense grounded understanding of both images and videos. Unlike existing multi-modal large language models, which are often limited to specific modalities and tasks, Sa2VA supports a wide range of image and video tasks, including referring segmentation and conversation, with minimal one-shot instruction tuning. Sa2VA combines SAM-2, a foundation video segmentation model, with LLaVA, an advanced vision-language model, and unifies text, image, and video into a shared LLM token space. Using the LLM, Sa2VA generates instruction tokens that guide SAM-2 in producing precise masks, enabling a grounded, multi-modal understanding of both static and dynamic visual content. 
Additionally, we introduce Ref-SAV, an auto-labeled dataset containing over 72k object expressions in complex video scenes, designed to boost model performance. We also manually validate 2k video objects in the Ref-SAV datasets to benchmark referring video object segmentation in complex environments. Experiments show that Sa2VA achieves state-of-the-art across multiple tasks, particularly in referring video object segmentation, highlighting its potential for complex real-world applications.', 'score': 16, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'd079946bf74858cd', 'authors': ['Haobo Yuan', 'Xiangtai Li', 'Tao Zhang', 'Zilong Huang', 'Shilin Xu', 'Shunping Ji', 'Yunhai Tong', 'Lu Qi', 'Jiashi Feng', 'Ming-Hsuan Yang'], 'affiliations': ['Bytedance Seed', 'Peking University', 'UC Merced', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04001.jpg', 'data': {'categories': ['#dataset', '#multimodal', '#benchmark', '#cv'], 'emoji': '🎥', 'ru': {'title': 'Sa2VA: Единая модель для понимания изображений и видео', 'desc': 'Sa2VA - это первая унифицированная модель для плотного заземленного понимания изображений и видео. Она объединяет SAM-2 (модель сегментации видео) с LLaVA (продвинутой моделью компьютерного зрения и языка) в едином пространстве токенов большой языковой модели. Sa2VA генерирует токены инструкций, направляющие SAM-2 в создании точных масок, что позволяет осуществлять заземленное мультимодальное понимание как статического, так и динамического визуального контента. Модель достигает передовых результатов в различных задачах, особенно в сегментации объектов по ссылкам в видео.'}, 'en': {'title': 'Sa2VA: Unifying Image and Video Understanding for Enhanced Multi-Modal Tasks', 'desc': 'Sa2VA is a groundbreaking model that integrates image and video understanding into a single framework. It combines the strengths of SAM-2 for video segmentation and LLaVA for vision-language tasks, allowing it to handle various multi-modal tasks with minimal tuning. By creating a shared token space for text, images, and videos, Sa2VA can generate specific instruction tokens that help in accurately segmenting objects in both images and videos. The introduction of the Ref-SAV dataset further enhances its capabilities, enabling it to achieve top performance in complex visual environments.'}, 'zh': {'title': 'Sa2VA:图像与视频的统一理解模型', 'desc': '本研究提出了Sa2VA,这是第一个统一的模型,能够对图像和视频进行密集的基础理解。与现有的多模态大型语言模型不同,Sa2VA支持多种图像和视频任务,包括引用分割和对话,且只需最少的一次性指令调优。Sa2VA结合了基础视频分割模型SAM-2和先进的视觉语言模型LLaVA,将文本、图像和视频统一到共享的LLM令牌空间中。实验表明,Sa2VA在多个任务上达到了最先进的水平,特别是在引用视频对象分割方面,展示了其在复杂现实应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.03847', 'title': 'Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control', 'url': 'https://huggingface.co/papers/2501.03847', 'abstract': 'Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. 
Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation.', 'score': 11, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '975d5fa9d59bde28', 'authors': ['Zekai Gu', 'Rui Yan', 'Jiahao Lu', 'Peng Li', 'Zhiyang Dou', 'Chenyang Si', 'Zhen Dong', 'Qifeng Liu', 'Cheng Lin', 'Ziwei Liu', 'Wenping Wang', 'Yuan Liu'], 'affiliations': ['Hong Kong University of Science and Technology, China', 'Nanyang Technological University, Singapore', 'Texas A&M University, U.S.A', 'The University of Hong Kong, China', 'Wuhan University, China', 'Zhejiang University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03847.jpg', 'data': {'categories': ['#video', '#diffusion', '#3d'], 'emoji': '🎬', 'ru': {'title': 'DaS: Универсальный контроль над генерацией видео через 3D-сигналы', 'desc': 'Авторы представляют новый подход под названием Diffusion as Shader (DaS) для контролируемой генерации видео с помощью диффузионных моделей. В отличие от существующих методов, ограниченных одним типом контроля, DaS поддерживает множество задач управления видео в единой архитектуре. Ключевая идея заключается в использовании 3D-сигналов управления, что делает процесс диффузии видео изначально 3D-ориентированным. DaS демонстрирует сильные возможности управления в различных задачах, включая генерацию видео из 3D-моделей, контроль камеры, перенос движения и манипуляции с объектами.'}, 'en': {'title': 'Empowering Video Generation with 3D Control Signals', 'desc': 'This paper presents Diffusion as Shader (DaS), a new method for generating videos that allows for precise control over various aspects of video creation. Unlike previous models that only used 2D control signals, DaS utilizes 3D tracking videos, which helps in managing the dynamic nature of video content. This approach enables users to manipulate video elements like camera angles and object movements more effectively. The results show that DaS can maintain high-quality video generation while ensuring temporal consistency across frames, even with limited training data.'}, 'zh': {'title': '多样化视频控制的新方法:扩散作为着色器', 'desc': '扩散模型在从文本提示或图像生成高质量视频方面表现出色。然而,精确控制视频生成过程,如相机操作或内容编辑,仍然是一个重大挑战。现有的受控视频生成方法通常仅限于单一控制类型,缺乏处理多样化控制需求的灵活性。本文提出了一种新方法——扩散作为着色器(DaS),它在统一架构中支持多种视频控制任务,利用3D控制信号来实现更灵活的视频控制。'}}}, {'id': 'https://huggingface.co/papers/2501.03936', 'title': 'PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides', 'url': 'https://huggingface.co/papers/2501.03936', 'abstract': 'Automatically generating presentations from documents is a challenging task that requires balancing content quality, visual design, and structural coherence. 
Existing methods primarily focus on improving and evaluating the content quality in isolation, often overlooking visual design and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to understand their structural patterns and content schemas, then drafts outlines and generates slides through code actions to ensure consistency and alignment. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Experiments show that PPTAgent significantly outperforms traditional automatic presentation generation methods across all three dimensions. The code and data are available at https://github.com/icip-cas/PPTAgent.', 'score': 7, 'issue_id': 1557, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '57bb4703056c9e20', 'authors': ['Hao Zheng', 'Xinyan Guan', 'Hao Kong', 'Jia Zheng', 'Hongyu Lin', 'Yaojie Lu', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences', 'Shanghai Jiexin Technology', 'University of Chinese Academy of Sciences'], 'pdf_title_img': 'assets/pdf/title_img/2501.03936.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'PPTAgent: ИИ-помощник для создания презентаций нового уровня', 'desc': 'Исследователи представили PPTAgent - систему для автоматического создания презентаций из документов. В отличие от существующих методов, PPTAgent улучшает не только качество контента, но и визуальный дизайн и структурную согласованность. Система использует двухэтапный подход, вдохновленный рабочим процессом человека: сначала анализирует образцы презентаций, затем создает слайды с помощью программных действий. Авторы также разработали фреймворк PPTEval для комплексной оценки генерируемых презентаций.'}, 'en': {'title': 'PPTAgent: Elevating Presentation Generation with Content, Design, and Coherence', 'desc': 'This paper presents PPTAgent, a novel approach for automatically generating presentations from documents. Unlike existing methods that focus solely on content quality, PPTAgent enhances the overall presentation by considering visual design and structural coherence as well. It employs a two-stage, edit-based process that first analyzes reference presentations to extract patterns and then generates slides through code actions. 
Additionally, the authors introduce PPTEval, a framework for evaluating presentations based on content, design, and coherence, demonstrating that PPTAgent outperforms traditional methods in all areas.'}, 'zh': {'title': '智能生成高质量演示文稿的解决方案', 'desc': '本文提出了一种名为PPTAgent的自动生成演示文稿的方法。该方法通过两阶段的编辑式流程,综合考虑内容质量、视觉设计和结构一致性。PPTAgent首先分析参考演示文稿,以理解其结构模式和内容框架,然后通过代码操作草拟大纲并生成幻灯片。为了全面评估生成演示文稿的质量,本文还引入了PPTEval评估框架,从内容、设计和一致性三个维度进行评估。'}}}, {'id': 'https://huggingface.co/papers/2501.03714', 'title': 'MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval Adjustment for Compact Dynamic 3D Gaussian Splatting', 'url': 'https://huggingface.co/papers/2501.03714', 'abstract': '3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting framework designed for reconstructing novel views in challenging scenarios with complex motions. We introduce Global-to-Local Motion Decomposition (GLMD) to effectively capture dynamic motions in a coarse-to-fine manner. This approach leverages Global Canonical Scaffolds (Global CS) and Local Canonical Scaffolds (Local CS), extending static Scaffold representation to dynamic video reconstruction. For Global CS, we propose Global Anchor Deformation (GAD) to efficiently represent global dynamics along complex motions, by directly deforming the implicit Scaffold attributes which are anchor position, offset, and local context features. Next, we finely adjust local motions via the Local Gaussian Deformation (LGD) of Local CS explicitly. Additionally, we introduce Temporal Interval Adjustment (TIA) to automatically control the temporal coverage of each Local CS during training, allowing MoDecGS to find optimal interval assignments based on the specified number of temporal segments. Extensive evaluations demonstrate that MoDecGS achieves an average 70% reduction in model size over state-of-the-art methods for dynamic 3D Gaussians from real-world dynamic videos while maintaining or even improving rendering quality.', 'score': 5, 'issue_id': 1556, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c6cfa761edc047da', 'authors': ['Sangwoon Kwak', 'Joonsoo Kim', 'Jun Young Jeong', 'Won-Sik Cheong', 'Jihyong Oh', 'Munchurl Kim'], 'affiliations': ['Chung-Ang University', 'Electronics and Telecommunications Research Institute', 'Korea Advanced Institute of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.03714.jpg', 'data': {'categories': ['#3d'], 'emoji': '🎭', 'ru': {'title': 'Эффективное представление сложных движений в динамических сценах', 'desc': 'MoDecGS - это новый фреймворк для эффективной реконструкции динамических сцен с использованием 3D Gaussian Splatting. Он вводит метод Global-to-Local Motion Decomposition (GLMD) для захвата сложных движений, используя Global Canonical Scaffolds и Local Canonical Scaffolds. Фреймворк также включает Global Anchor Deformation (GAD) для представления глобальной динамики и Local Gaussian Deformation (LGD) для точной настройки локальных движений. 
MoDecGS демонстрирует значительное сокращение размера модели при сохранении или улучшении качества рендеринга по сравнению с существующими методами.'}, 'en': {'title': 'Efficient Dynamic Scene Rendering with MoDecGS', 'desc': 'The paper presents MoDecGS, a new framework for 3D Gaussian Splatting that efficiently handles dynamic scenes in neural rendering. It introduces Global-to-Local Motion Decomposition (GLMD) to capture complex motions using both Global and Local Canonical Scaffolds. The method employs Global Anchor Deformation (GAD) for global dynamics and Local Gaussian Deformation (LGD) for fine-tuning local motions. MoDecGS significantly reduces model size by 70% compared to existing methods while enhancing rendering quality, making it suitable for real-world dynamic video reconstruction.'}, 'zh': {'title': '高效动态场景重建的新方法', 'desc': '3D高斯点云(3DGS)在场景表示和神经渲染方面取得了显著进展,但在处理动态场景时仍面临存储需求和复杂运动表示的挑战。为了解决这些问题,我们提出了MoDecGS,一个内存高效的高斯点云框架,旨在重建具有复杂运动的新视角。我们引入了全局到局部运动分解(GLMD),以粗到细的方式有效捕捉动态运动,并扩展了静态支架表示以适应动态视频重建。通过全局锚点变形(GAD)和局部高斯变形(LGD),MoDecGS在保持或提高渲染质量的同时,平均减少了70%的模型大小。'}}}, {'id': 'https://huggingface.co/papers/2501.03931', 'title': 'Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.03931', 'abstract': 'We present Magic Mirror, a framework for generating identity-preserved videos with cinematic-level quality and dynamic motion. While recent advances in video diffusion models have shown impressive capabilities in text-to-video generation, maintaining consistent identity while producing natural motion remains challenging. Previous methods either require person-specific fine-tuning or struggle to balance identity preservation with motion diversity. Built upon Video Diffusion Transformers, our method introduces three key components: (1) a dual-branch facial feature extractor that captures both identity and structural features, (2) a lightweight cross-modal adapter with Conditioned Adaptive Normalization for efficient identity integration, and (3) a two-stage training strategy combining synthetic identity pairs with video data. Extensive experiments demonstrate that Magic Mirror effectively balances identity consistency with natural motion, outperforming existing methods across multiple metrics while requiring minimal parameters added. The code and model will be made publicly available at: https://github.com/dvlab-research/MagicMirror/', 'score': 4, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '1c9696a99b57f781', 'authors': ['Yuechen Zhang', 'Yaoyang Liu', 'Bin Xia', 'Bohao Peng', 'Zexin Yan', 'Eric Lo', 'Jiaya Jia'], 'affiliations': ['CMU', 'CUHK', 'HKUST', 'SmartMore'], 'pdf_title_img': 'assets/pdf/title_img/2501.03931.jpg', 'data': {'categories': ['#training', '#video', '#multimodal', '#open_source', '#synthetic', '#architecture', '#diffusion'], 'emoji': '🪞', 'ru': {'title': 'Магическое зеркало: видео с сохранением личности и естественным движением', 'desc': 'Magic Mirror - это новая система для создания видео с сохранением идентичности и кинематографическим качеством. Она использует модель видеодиффузии и вводит три ключевых компонента: двойной экстрактор лицевых признаков, легкий кросс-модальный адаптер и двухэтапную стратегию обучения. Система эффективно сочетает сохранение идентичности с естественным движением, превосходя существующие методы по нескольким метрикам. 
Magic Mirror требует минимального добавления параметров и будет доступна в открытом доступе.'}, 'en': {'title': 'Magic Mirror: Identity-Preserved Video Generation with Cinematic Quality', 'desc': 'Magic Mirror is a new framework designed to create high-quality videos that maintain the identity of individuals while showcasing dynamic motion. It addresses the challenges faced by previous video generation methods, which often struggled to keep a consistent identity or required extensive fine-tuning for specific individuals. The framework utilizes Video Diffusion Transformers and introduces innovative components like a dual-branch facial feature extractor and a cross-modal adapter to enhance identity integration. Through a two-stage training approach, Magic Mirror achieves a remarkable balance between identity preservation and natural motion, outperforming existing techniques with fewer additional parameters.'}, 'zh': {'title': 'Magic Mirror:保持身份一致的动态视频生成', 'desc': '本文介绍了Magic Mirror,一个用于生成保持身份一致的视频框架,具有电影级质量和动态运动。尽管最近的视频扩散模型在文本到视频生成方面取得了显著进展,但在生成自然运动的同时保持一致的身份仍然具有挑战性。我们的方法基于视频扩散变换器,提出了三个关键组件,以有效整合身份信息并保持运动多样性。实验结果表明,Magic Mirror在多个指标上超越了现有方法,同时增加的参数极少。'}}}, {'id': 'https://huggingface.co/papers/2501.03916', 'title': 'Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback', 'url': 'https://huggingface.co/papers/2501.03916', 'abstract': 'The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers which are ranked by the topic and task attributes. Then, the codes are automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on the benchmark datasets of different topics and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. 
We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification.', 'score': 3, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '9a18a60e788b7840', 'authors': ['Jiakang Yuan', 'Xiangchao Yan', 'Botian Shi', 'Tao Chen', 'Wanli Ouyang', 'Bo Zhang', 'Lei Bai', 'Yu Qiao', 'Bowen Zhou'], 'affiliations': ['Fudan University', 'Shanghai Artificial Intelligence Laboratory'], 'pdf_title_img': 'assets/pdf/title_img/2501.03916.jpg', 'data': {'categories': ['#open_source', '#agents', '#science', '#3d', '#cv', '#benchmark', '#dataset'], 'emoji': '🐬', 'ru': {'title': 'Dolphin: ИИ-ассистент для полного цикла научных исследований', 'desc': 'Статья представляет Dolphin - первую замкнутую систему для автоматического проведения научных исследований. Dolphin генерирует идеи на основе релевантных статей, автоматически создает и отлаживает код для экспериментов, а затем анализирует результаты. Система способна непрерывно генерировать новые идеи и проводить эксперименты в цикле. Эксперименты показали, что Dolphin может предлагать методы, сопоставимые с современными подходами в некоторых задачах машинного обучения.'}, 'en': {'title': 'Dolphin: Automating Scientific Research with AI', 'desc': 'This paper introduces Dolphin, an innovative framework designed to automate the scientific research process. Dolphin operates in a closed-loop system, generating research ideas, conducting experiments, and analyzing results to refine future ideas. It utilizes AI to rank relevant literature and automatically generate and debug code, enhancing research efficiency. The framework has been tested on various benchmark datasets, demonstrating its ability to produce novel ideas and achieve results comparable to leading methods in tasks like image classification.'}, 'zh': {'title': 'Dolphin:自动化科学研究的新纪元', 'desc': '这篇论文介绍了一个名为Dolphin的闭环开放式自动研究框架,旨在提升科学研究的效率。Dolphin能够生成研究想法、进行实验,并根据实验结果反馈生成更高质量的想法。具体来说,Dolphin首先根据相关论文生成新想法,然后自动生成和调试代码,最后分析每个想法的结果并反馈到下一轮生成中。实验结果表明,Dolphin能够持续生成新想法,并在循环中完成实验,且在某些任务上与最先进的方法相当。'}}}, {'id': 'https://huggingface.co/papers/2501.02260', 'title': 'MagicFace: High-Fidelity Facial Expression Editing with Action-Unit Control', 'url': 'https://huggingface.co/papers/2501.02260', 'abstract': "We address the problem of facial expression editing by controling the relative variation of facial action-unit (AU) from the same person. This enables us to edit this specific person's expression in a fine-grained, continuous and interpretable manner, while preserving their identity, pose, background and detailed facial attributes. Key to our model, which we dub MagicFace, is a diffusion model conditioned on AU variations and an ID encoder to preserve facial details of high consistency. Specifically, to preserve the facial details with the input identity, we leverage the power of pretrained Stable-Diffusion models and design an ID encoder to merge appearance features through self-attention. To keep background and pose consistency, we introduce an efficient Attribute Controller by explicitly informing the model of current background and pose of the target. By injecting AU variations into a denoising UNet, our model can animate arbitrary identities with various AU combinations, yielding superior results in high-fidelity expression editing compared to other facial expression editing works. 
Code is publicly available at https://github.com/weimengting/MagicFace.", 'score': 3, 'issue_id': 1550, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '9eeeb5b132839793', 'authors': ['Mengting Wei', 'Tuomas Varanka', 'Xingxun Jiang', 'Huai-Qian Khor', 'Guoying Zhao'], 'affiliations': ['Center for Machine Vision and Signal Analysis, Faculty of Information Technology and Electrical Engineering, University of Oulu, Oulu, FI-90014, Finland', 'Key Laboratory of Child Development and Learning Science of Ministry of Education, School of Biological Sciences and Medical Engineering, Southeast University, Nanjing 210096, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02260.jpg', 'data': {'categories': ['#multimodal', '#diffusion', '#open_source', '#cv'], 'emoji': '🎭', 'ru': {'title': 'Точное редактирование мимики с сохранением личности', 'desc': 'Статья представляет новый подход к редактированию мимики лица с использованием диффузионной модели, названной MagicFace. Модель позволяет точно и интерпретируемо изменять выражение лица конкретного человека, сохраняя его идентичность, позу и фоновые детали. Ключевым элементом является условная генерация на основе вариаций лицевых единиц действия (AU) и использование ID-энкодера для сохранения деталей лица. MagicFace демонстрирует превосходные результаты в высококачественном редактировании выражений лица по сравнению с другими методами.'}, 'en': {'title': 'MagicFace: Fine-Grained Facial Expression Editing with Consistent Identity', 'desc': 'This paper presents a method for editing facial expressions while maintaining the identity and other attributes of the person. The proposed model, named MagicFace, utilizes a diffusion model that is conditioned on facial action unit (AU) variations, allowing for fine-grained control over expressions. It incorporates a pretrained Stable-Diffusion model and an ID encoder to ensure high consistency in facial details. Additionally, an Attribute Controller is introduced to maintain background and pose consistency during the editing process, resulting in high-fidelity expression animations.'}, 'zh': {'title': '魔法面孔:高保真面部表情编辑的创新之路', 'desc': '我们提出了一种面部表情编辑的方法,通过控制同一人的面部动作单元(AU)的相对变化来实现。这种方法可以细致、连续且可解释地编辑特定人的表情,同时保持他们的身份、姿势、背景和面部细节。我们的模型称为MagicFace,核心是一个基于AU变化的扩散模型和一个ID编码器,以保持面部细节的一致性。通过将AU变化注入去噪UNet,我们的模型能够以高保真度编辑面部表情,效果优于其他相关工作。'}}}, {'id': 'https://huggingface.co/papers/2501.02790', 'title': 'Segmenting Text and Learning Their Rewards for Improved RLHF in Language Model', 'url': 'https://huggingface.co/papers/2501.02790', 'abstract': 'Reinforcement learning from human feedback (RLHF) has been widely adopted to align language models (LMs) with human preference. Prior RLHF works typically take a bandit formulation, which, though intuitive, ignores the sequential nature of LM generation and can suffer from the sparse reward issue. While recent works propose dense token-level RLHF, treating each token as an action may be oversubtle to proper reward assignment. In this paper, we seek to get the best of both by training and utilizing a segment-level reward model, which assigns a reward to each semantically complete text segment that spans over a short sequence of tokens. For reward learning, our method allows dynamic text segmentation and compatibility with standard sequence-preference datasets. 
For effective RL-based LM training against segment reward, we generalize the classical scalar bandit reward normalizers into location-aware normalizer functions and interpolate the segment reward for further densification. With these designs, our method performs competitively on three popular RLHF benchmarks for LM policy: AlpacaEval 2.0, Arena-Hard, and MT-Bench. Ablation studies are conducted to further demonstrate our method.', 'score': 2, 'issue_id': 1562, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'bd19e4a3e48539d4', 'authors': ['Yueqin Yin', 'Shentao Yang', 'Yujia Xie', 'Ziyi Yang', 'Yuting Sun', 'Hany Awadalla', 'Weizhu Chen', 'Mingyuan Zhou'], 'affiliations': ['Microsoft', 'The University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.02790.jpg', 'data': {'categories': ['#training', '#reasoning', '#alignment', '#rlhf', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Сегментарный RLHF: золотая середина между токенами и бандитами', 'desc': 'Данная статья представляет новый подход к обучению языковых моделей с подкреплением на основе обратной связи от человека (RLHF). Авторы предлагают использовать сегментарную модель вознаграждения, которая присваивает награду семантически завершенным текстовым сегментам. Метод позволяет динамическую сегментацию текста и совместим со стандартными наборами данных последовательных предпочтений. Для эффективного RL-обучения языковой модели авторы обобщают классические нормализаторы скалярного бандитного вознаграждения в локально-зависимые функции нормализации.'}, 'en': {'title': 'Enhancing Language Models with Segment-Level Rewards in RLHF', 'desc': 'This paper discusses a new approach to Reinforcement Learning from Human Feedback (RLHF) for language models (LMs). It critiques previous methods that treat the task as a bandit problem, which can overlook the sequential nature of text generation and lead to sparse rewards. The authors propose a segment-level reward model that assigns rewards to complete text segments, improving reward assignment. Their method incorporates dynamic text segmentation and enhances training efficiency by using location-aware normalizer functions, showing competitive results on established RLHF benchmarks.'}, 'zh': {'title': '段落级奖励模型:强化学习的新突破', 'desc': '本论文探讨了如何通过人类反馈进行强化学习(RLHF),以使语言模型(LM)更符合人类偏好。以往的RLHF研究通常采用赌博机模型,但这种方法忽视了语言模型生成的序列特性,并可能面临稀疏奖励的问题。我们提出了一种基于段落级奖励模型的方法,为每个语义完整的文本段落分配奖励,从而克服了以往方法的不足。通过动态文本分割和与标准序列偏好数据集的兼容性,我们的方法在多个RLHF基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.02393', 'title': 'Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers', 'url': 'https://huggingface.co/papers/2501.02393', 'abstract': "We present an approach to modifying Transformer architectures by integrating graph-aware relational reasoning into the attention mechanism, merging concepts from graph neural networks and language modeling. Building on the inherent connection between attention and graph theory, we reformulate the Transformer's attention mechanism as a graph operation and propose Graph-Aware Isomorphic Attention. This method leverages advanced graph modeling strategies, including Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), to enrich the representation of relational structures. Our approach captures complex dependencies and generalizes across tasks, as evidenced by a reduced generalization gap and improved learning performance. 
Additionally, we expand the concept of graph-aware attention to introduce Sparse GIN-Attention, a fine-tuning approach that employs sparse GINs. By interpreting attention matrices as sparse adjacency graphs, this technique enhances the adaptability of pre-trained foundational models with minimal computational overhead, endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning achieves improved training dynamics and better generalization compared to alternative methods like low-rank adaption (LoRA). We discuss latent graph-like structures within traditional attention mechanisms, offering a new lens through which Transformers can be understood. By evolving Transformers as hierarchical GIN models for relational reasoning. This perspective suggests profound implications for foundational model development, enabling the design of architectures that dynamically adapt to both local and global dependencies. Applications in bioinformatics, materials science, language modeling, and beyond could benefit from this synthesis of relational and sequential data modeling, setting the stage for interpretable and generalizable modeling strategies.", 'score': 1, 'issue_id': 1563, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a200448c9795e159', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics (LAMM) MIT Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.02393.jpg', 'data': {'categories': ['#graphs', '#architecture', '#interpretability', '#training'], 'emoji': '🕸️', 'ru': {'title': 'Трансформеры эволюционируют в графовые модели для реляционного рассуждения', 'desc': 'Статья представляет новый подход к модификации архитектуры Трансформеров путем интеграции графового реляционного рассуждения в механизм внимания. Авторы переформулируют механизм внимания Трансформера как графовую операцию и предлагают Graph-Aware Isomorphic Attention, используя стратегии моделирования графов, такие как Graph Isomorphism Networks (GIN) и Principal Neighborhood Aggregation (PNA). Метод позволяет улучшить представление реляционных структур, уменьшить разрыв в обобщении и повысить производительность обучения. Также предложен метод тонкой настройки Sparse GIN-Attention, который интерпретирует матрицы внимания как разреженные графы смежности, улучшая адаптивность предобученных моделей.'}, 'en': {'title': 'Transforming Attention: Merging Graphs and Transformers for Enhanced Learning', 'desc': 'This paper introduces a new way to enhance Transformer models by incorporating graph-based reasoning into their attention mechanisms. By treating attention as a graph operation, the authors propose a method called Graph-Aware Isomorphic Attention, which utilizes advanced graph techniques to better capture relationships in data. They also present Sparse GIN-Attention, a fine-tuning method that interprets attention matrices as sparse graphs, improving the adaptability of pre-trained models with less computational cost. 
Overall, this approach not only improves learning performance but also opens up new possibilities for applying Transformers in various fields like bioinformatics and language modeling.'}, 'zh': {'title': '图感知注意力:Transformer的新视角', 'desc': '本文提出了一种通过将图感知关系推理整合到注意力机制中来修改Transformer架构的方法。这种方法将Transformer的注意力机制重新表述为图操作,并提出了图感知同构注意力(Graph-Aware Isomorphic Attention)。该方法利用图同构网络(GIN)和主邻域聚合(PNA)等先进的图建模策略,增强了关系结构的表示能力。通过引入稀疏GIN注意力(Sparse GIN-Attention),我们展示了如何在保持计算效率的同时,提升预训练模型的适应性和泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 34, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. 
Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 29, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. 
Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. 
When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 19, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. 
By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. 
Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 14, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. 
Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 12, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. 
The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 11, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. 
В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 9, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. 
Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 9, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. 
Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 8, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. 
This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 7, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. 
Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 6, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. 
AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}, {'id': 'https://huggingface.co/papers/2501.04519', 'title': 'rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking', 'url': 'https://huggingface.co/papers/2501.04519', 'abstract': 'We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs\' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. 
Code and data will be available at https://github.com/microsoft/rStar.', 'score': 100, 'issue_id': 1572, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'b065003de5fa3bde', 'authors': ['Xinyu Guan', 'Li Lyna Zhang', 'Yifei Liu', 'Ning Shang', 'Youran Sun', 'Yi Zhu', 'Fan Yang', 'Mao Yang'], 'affiliations': ['Microsoft', 'Peking University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04519.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#benchmark', '#small_models', '#dataset'], 'emoji': '🧮', 'ru': {'title': 'Малые модели решают большие задачи: rStar-Math превосходит гигантов в математике', 'desc': 'Статья представляет rStar-Math - подход, позволяющий малым языковым моделям (SLM) достичь или превзойти способности крупных моделей в математических рассуждениях. Метод использует поиск по методу Монте-Карло (MCTS) с двумя специально обученными SLM: политикой и моделью вознаграждения. Авторы вводят новые методы синтеза обучающих данных, обучения модели вознаграждения и итеративного улучшения моделей. В результате rStar-Math значительно повышает эффективность SLM на математических тестах, превосходя более крупные модели.'}, 'en': {'title': 'Empowering Small Models to Excel in Math Reasoning', 'desc': 'The paper introduces rStar-Math, a framework that enhances the math reasoning abilities of small language models (SLMs) without relying on larger models. It employs Monte Carlo Tree Search (MCTS) to enable deep thinking, allowing the SLM to perform guided search during problem-solving. Key innovations include a code-augmented Chain of Thought (CoT) data synthesis method for generating verified reasoning paths, a refined process preference model (PPM) for better reward training, and a self-evolution strategy for iterative improvement. As a result, rStar-Math significantly boosts the performance of SLMs on math benchmarks, achieving state-of-the-art results in various assessments.'}, 'zh': {'title': '小型语言模型的数学推理新突破', 'desc': 'rStar-Math展示了小型语言模型(SLMs)在数学推理能力上可以与OpenAI的o1相媲美,甚至超越它,而无需从更强大的模型中蒸馏。该方法通过蒙特卡洛树搜索(MCTS)实现“深度思考”,在测试时由SLM驱动的过程奖励模型指导数学策略SLM进行搜索。rStar-Math引入了三项创新来解决训练两个SLM的挑战,包括新颖的代码增强的链式推理数据合成方法和更有效的过程偏好模型(PPM)训练方法。经过四轮自我进化,rStar-Math在747,000个数学问题上生成了数百万个合成解,使SLMs的数学推理能力达到了最先进的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.04682', 'title': 'Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Though', 'url': 'https://huggingface.co/papers/2501.04682', 'abstract': 'We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms. Finally, we outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training. Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms. 
This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence.', 'score': 42, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '3479f7793755e586', 'authors': ['Violet Xiang', 'Charlie Snell', 'Kanishk Gandhi', 'Alon Albalak', 'Anikait Singh', 'Chase Blagden', 'Duy Phung', 'Rafael Rafailov', 'Nathan Lile', 'Dakota Mahan', 'Louis Castricato', 'Jan-Philipp Franken', 'Nick Haber', 'Chelsea Finn'], 'affiliations': ['Stanford University', 'SynthLabs.ai', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.04682.jpg', 'data': {'categories': ['#synthetic', '#training', '#rlhf', '#rl', '#multimodal', '#optimization', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Meta-CoT: новый уровень рассуждений для ИИ', 'desc': 'Исследователи предлагают новую концепцию под названием Meta Chain-of-Thought (Meta-CoT), которая расширяет традиционный подход Chain-of-Thought. Meta-CoT моделирует базовые рассуждения, необходимые для построения цепочки мыслей. Авторы представляют эмпирические доказательства того, что современные языковые модели демонстрируют поведение, согласующееся с контекстным поиском. Они также описывают конкретный процесс обучения модели для генерации Meta-CoT, включающий инструктивную настройку и обучение с подкреплением.'}, 'en': {'title': 'Empowering AI with Enhanced Reasoning through Meta-CoT', 'desc': 'The paper introduces a new framework called Meta Chain-of-Thought (Meta-CoT), which enhances the traditional Chain-of-Thought (CoT) approach by focusing on the reasoning processes behind generating CoTs. It provides experimental results from advanced models that show behaviors similar to in-context search, and discusses techniques for creating Meta-CoT through process supervision, synthetic data, and search algorithms. The authors propose a detailed training pipeline that combines instruction tuning with search traces and reinforcement learning to improve the generation of Meta-CoTs. Additionally, the paper raises important questions about scaling, the role of verifiers, and the potential for discovering new reasoning methods, aiming to advance the reasoning capabilities of large language models (LLMs).'}, 'zh': {'title': '推动人工智能推理能力的元思维链', 'desc': '我们提出了一种新颖的框架,称为元思维链(Meta-CoT),它通过明确建模所需的推理过程来扩展传统的思维链(CoT)。我们展示了来自最先进模型的实证证据,这些模型表现出与上下文搜索一致的行为,并探索了通过过程监督、合成数据生成和搜索算法来生成元思维链的方法。最后,我们概述了一个具体的训练流程,结合了指令调优、线性化搜索轨迹和强化学习后训练,以生成元思维链。此项工作为在大型语言模型中实现元思维链提供了理论和实践的路线图,推动了人工智能更强大和更人性化的推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04686', 'title': 'URSA: Understanding and Verifying Chain-of-thought Reasoning in Multimodal Mathematics', 'url': 'https://huggingface.co/papers/2501.04686', 'abstract': 'Chain-of-thought (CoT) reasoning has been widely applied in the mathematical reasoning of Large Language Models (LLMs). Recently, the introduction of derivative process supervision on CoT trajectories has sparked discussions on enhancing scaling capabilities during test time, thereby boosting the potential of these models. However, in multimodal mathematical reasoning, the scarcity of high-quality CoT training data has hindered existing models from achieving high-precision CoT reasoning and has limited the realization of reasoning potential during test time. In this work, we propose a three-module synthesis strategy that integrates CoT distillation, trajectory-format rewriting, and format unification. 
It results in a high-quality CoT reasoning instruction fine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively validate the state-of-the-art (SOTA) performance of the trained URSA-7B model on multiple multimodal mathematical benchmarks. For test-time scaling, we introduce a data synthesis strategy that automatically generates process annotation datasets, known as DualMath-1.1M, focusing on both interpretation and logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT reasoning capabilities to robust supervision abilities. The trained URSA-RM-7B acts as a verifier, effectively enhancing the performance of URSA-7B at test time. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD) verifying capabilities, showcasing its generalization. Model weights, training data and code will be open-sourced.', 'score': 35, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '089df0fb9a548ce8', 'authors': ['Ruilin Luo', 'Zhuofan Zheng', 'Yifan Wang', 'Yiyao Yu', 'Xinzhe Ni', 'Zicheng Lin', 'Jin Zeng', 'Yujiu Yang'], 'affiliations': ['ByteDance', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04686.jpg', 'data': {'categories': ['#dataset', '#training', '#multimodal', '#data', '#open_source', '#reasoning', '#math', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Усиление мультимодальных математических рассуждений через синтез данных и верификацию', 'desc': 'Статья представляет новый подход к улучшению математических рассуждений в мультимодальных языковых моделях. Авторы предлагают стратегию синтеза высококачественного набора данных MMathCoT-1M для обучения цепочкам рассуждений. Они также вводят метод DualMath-1.1M для генерации аннотаций процесса рассуждений, что позволяет модели URSA-7B перейти от способности рассуждать к возможности проверять рассуждения. Результаты показывают улучшение производительности и обобщающей способности модели.'}, 'en': {'title': 'Enhancing Multimodal Mathematical Reasoning with CoT Synthesis', 'desc': "This paper discusses improving mathematical reasoning in Large Language Models (LLMs) using a method called Chain-of-Thought (CoT) reasoning. The authors introduce a new dataset, MMathCoT-1M, which is created through a three-module synthesis strategy to enhance the quality of CoT training data in multimodal mathematics. They also present a data synthesis strategy, DualMath-1.1M, that generates additional training data to improve the model's reasoning capabilities during testing. The results show that their model, URSA-RM-7B, significantly enhances performance and generalization in multimodal mathematical tasks."}, 'zh': {'title': '提升多模态数学推理的链式推理能力', 'desc': '本文探讨了链式推理(CoT)在大型语言模型(LLMs)中的应用,特别是在多模态数学推理中的挑战。由于高质量的CoT训练数据稀缺,现有模型在测试时的推理能力受到限制。为了解决这个问题,作者提出了一种三模块合成策略,生成了高质量的多模态数学推理指令微调数据集MMathCoT-1M。通过进一步训练URSA-7B模型,结合生成的数据集DualMath-1.1M,显著提升了模型在测试时的推理能力和验证能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04227', 'title': 'Agent Laboratory: Using LLM Agents as Research Assistants', 'url': 'https://huggingface.co/papers/2501.04227', 'abstract': 'Historically, scientific discovery has been a lengthy and costly process, demanding substantial time and resources from initial conception to final results. To accelerate scientific discovery, reduce research costs, and improve research quality, we introduce Agent Laboratory, an autonomous LLM-based framework capable of completing the entire research process. 
This framework accepts a human-provided research idea and progresses through three stages--literature review, experimentation, and report writing to produce comprehensive research outputs, including a code repository and a research report, while enabling users to provide feedback and guidance at each stage. We deploy Agent Laboratory with various state-of-the-art LLMs and invite multiple researchers to assess its quality by participating in a survey, providing human feedback to guide the research process, and then evaluate the final paper. We found that: (1) Agent Laboratory driven by o1-preview generates the best research outcomes; (2) The generated machine learning code is able to achieve state-of-the-art performance compared to existing methods; (3) Human involvement, providing feedback at each stage, significantly improves the overall quality of research; (4) Agent Laboratory significantly reduces research expenses, achieving an 84% decrease compared to previous autonomous research methods. We hope Agent Laboratory enables researchers to allocate more effort toward creative ideation rather than low-level coding and writing, ultimately accelerating scientific discovery.', 'score': 34, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'ff592ae1a5a88909', 'authors': ['Samuel Schmidgall', 'Yusheng Su', 'Ze Wang', 'Ximeng Sun', 'Jialian Wu', 'Xiaodong Yu', 'Jiang Liu', 'Zicheng Liu', 'Emad Barsoum'], 'affiliations': ['AMD', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04227.jpg', 'data': {'categories': ['#science', '#training', '#agents', '#rlhf', '#survey'], 'emoji': '🧪', 'ru': {'title': 'Автономная лаборатория ИИ: революция в научных исследованиях', 'desc': 'Статья представляет Agent Laboratory - автономную систему на основе моделей LLM, способную выполнять полный цикл научного исследования. Система проходит через этапы обзора литературы, экспериментов и написания отчета, позволяя пользователям давать обратную связь на каждом этапе. Эксперименты показали, что Agent Laboratory, работающая на модели o1-preview, генерирует лучшие результаты исследований и значительно снижает затраты на исследования. Авторы надеются, что эта система позволит исследователям сосредоточиться на творческом процессе, ускоряя научные открытия.'}, 'en': {'title': 'Accelerating Science with Autonomous Research Frameworks', 'desc': 'The paper presents Agent Laboratory, an autonomous framework that utilizes large language models (LLMs) to streamline the scientific research process. It operates in three stages: conducting a literature review, performing experiments, and writing reports, all while allowing human researchers to provide feedback. The study shows that Agent Laboratory can produce high-quality research outputs, including code that outperforms existing methods, and significantly reduces research costs by 84%. 
By automating routine tasks, the framework aims to free researchers to focus more on innovative ideas and less on tedious coding and documentation.'}, 'zh': {'title': 'Agent Laboratory:加速科学发现的智能助手', 'desc': '本文介绍了一种名为Agent Laboratory的自主框架,旨在加速科学发现并降低研究成本。该框架基于大型语言模型(LLM),能够完成文献综述、实验和报告撰写等整个研究过程。研究表明,Agent Laboratory在生成研究成果方面表现优异,尤其是在机器学习代码的性能上,达到了最先进的水平。通过人类反馈的参与,研究质量显著提高,同时研究费用减少了84%。'}}}, {'id': 'https://huggingface.co/papers/2501.04306', 'title': 'LLM4SR: A Survey on Large Language Models for Scientific Research', 'url': 'https://huggingface.co/papers/2501.04306', 'abstract': 'In recent years, the rapid advancement of Large Language Models (LLMs) has transformed the landscape of scientific research, offering unprecedented support across various stages of the research cycle. This paper presents the first systematic survey dedicated to exploring how LLMs are revolutionizing the scientific research process. We analyze the unique roles LLMs play across four critical stages of research: hypothesis discovery, experiment planning and implementation, scientific writing, and peer reviewing. Our review comprehensively showcases the task-specific methodologies and evaluation benchmarks. By identifying current challenges and proposing future research directions, this survey not only highlights the transformative potential of LLMs, but also aims to inspire and guide researchers and practitioners in leveraging LLMs to advance scientific inquiry. Resources are available at the following repository: https://github.com/du-nlp-lab/LLM4SR', 'score': 17, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'bfb9039780003b6d', 'authors': ['Ziming Luo', 'Zonglin Yang', 'Zexin Xu', 'Wei Yang', 'Xinya Du'], 'affiliations': ['Nanyang Technological University, Singapore', 'University of Texas at Dallas, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04306.jpg', 'data': {'categories': ['#science', '#survey', '#multimodal', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'LLM как революционный инструмент в научных исследованиях', 'desc': 'Эта статья представляет собой первый систематический обзор роли больших языковых моделей (LLM) в научных исследованиях. Авторы анализируют, как LLM используются на четырех ключевых этапах исследовательского процесса: формирование гипотез, планирование и проведение экспериментов, научное письмо и рецензирование. В работе рассматриваются специфические методологии и критерии оценки для каждой задачи. Статья также обсуждает текущие проблемы и предлагает направления для будущих исследований в этой области.'}, 'en': {'title': 'Revolutionizing Research: The Power of Large Language Models', 'desc': 'This paper systematically surveys the impact of Large Language Models (LLMs) on the scientific research process. It identifies how LLMs assist in four key stages: generating hypotheses, planning and conducting experiments, writing scientific papers, and facilitating peer reviews. The authors discuss specific methodologies and evaluation benchmarks for each task, highlighting the transformative potential of LLMs in enhancing research efficiency. 
Additionally, the paper addresses current challenges and suggests future research directions to further integrate LLMs into scientific inquiry.'}, 'zh': {'title': '大型语言模型:科学研究的变革者', 'desc': '近年来,大型语言模型(LLMs)的快速发展改变了科学研究的格局,为研究周期的各个阶段提供了前所未有的支持。本文首次系统性地调查了LLMs如何革新科学研究过程,分析了它们在假设发现、实验规划与实施、科学写作和同行评审等四个关键阶段的独特作用。我们的综述全面展示了任务特定的方法论和评估基准,并识别了当前面临的挑战,提出了未来的研究方向。通过强调LLMs的变革潜力,本文旨在激励和指导研究人员和从业者利用LLMs推动科学探索。'}}}, {'id': 'https://huggingface.co/papers/2501.04575', 'title': 'InfiGUIAgent: A Multimodal Generalist GUI Agent with Native Reasoning and Reflection', 'url': 'https://huggingface.co/papers/2501.04575', 'abstract': 'Graphical User Interface (GUI) Agents, powered by multimodal large language models (MLLMs), have shown great potential for task automation on computing devices such as computers and mobile phones. However, existing agents face challenges in multi-step reasoning and reliance on textual annotations, limiting their effectiveness. We introduce InfiGUIAgent, an MLLM-based GUI Agent trained with a two-stage supervised fine-tuning pipeline. Stage 1 enhances fundamental skills such as GUI understanding and grounding, while Stage 2 integrates hierarchical reasoning and expectation-reflection reasoning skills using synthesized data to enable native reasoning abilities of the agents. InfiGUIAgent achieves competitive performance on several GUI benchmarks, highlighting the impact of native reasoning skills in enhancing GUI interaction for automation tasks. Resources are available at https://github.com/Reallm-Labs/InfiGUIAgent.', 'score': 14, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '501c7ba58ede235b', 'authors': ['Yuhang Liu', 'Pengxiang Li', 'Zishu Wei', 'Congkai Xie', 'Xueyu Hu', 'Xinchen Xu', 'Shengyu Zhang', 'Xiaotian Han', 'Hongxia Yang', 'Fei Wu'], 'affiliations': ['ByteDance Inc', 'Dalian University of Technology', 'Reallm Labs', 'The Hong Kong Polytechnic University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04575.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#training', '#agents', '#multimodal', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Умный агент GUI: новый уровень автоматизации интерфейсов', 'desc': 'InfiGUIAgent - это агент графического пользовательского интерфейса, основанный на мультимодальных больших языковых моделях (MLLM). Он обучается с помощью двухэтапного процесса точной настройки, который улучшает базовые навыки понимания GUI и развивает способности к иерархическому рассуждению. InfiGUIAgent демонстрирует высокую эффективность в автоматизации задач взаимодействия с GUI, превосходя существующие подходы. Разработка направлена на преодоление ограничений, связанных с многошаговыми рассуждениями и зависимостью от текстовых аннотаций.'}, 'en': {'title': 'Empowering GUI Agents with Native Reasoning Skills', 'desc': "InfiGUIAgent is a new type of Graphical User Interface (GUI) agent that uses multimodal large language models (MLLMs) to improve task automation on devices like computers and smartphones. This agent addresses the limitations of existing systems by employing a two-stage supervised fine-tuning process. The first stage focuses on developing basic skills such as understanding and interacting with GUIs, while the second stage enhances the agent's ability to perform complex reasoning tasks. 
As a result, InfiGUIAgent demonstrates strong performance on various GUI benchmarks, showcasing the importance of advanced reasoning capabilities in automating GUI interactions."}, 'zh': {'title': '提升GUI交互的原生推理能力', 'desc': '本文介绍了一种名为InfiGUIAgent的图形用户界面(GUI)代理,它基于多模态大型语言模型(MLLM)进行任务自动化。InfiGUIAgent通过两阶段的监督微调流程进行训练,第一阶段提升了GUI理解和基础技能,第二阶段则通过合成数据整合了层次推理和期望反思推理能力。该代理在多个GUI基准测试中表现出色,显示了原生推理能力在增强GUI交互中的重要性。此研究为提高计算设备上的自动化任务提供了新的思路和方法。'}}}, {'id': 'https://huggingface.co/papers/2501.02772', 'title': 'GeAR: Generation Augmented Retrieval', 'url': 'https://huggingface.co/papers/2501.02772', 'abstract': 'Document retrieval techniques form the foundation for the development of large-scale information systems. The prevailing methodology is to construct a bi-encoder and compute the semantic similarity. However, such scalar similarity is difficult to reflect enough information and impedes our comprehension of the retrieval results. In addition, this computational process mainly emphasizes the global semantics and ignores the fine-grained semantic relationship between the query and the complex text in the document. In this paper, we propose a new method called Generation Augmented Retrieval (GeAR) that incorporates well-designed fusion and decoding modules. This enables GeAR to generate the relevant text from documents based on the fused representation of the query and the document, thus learning to "focus on" the fine-grained information. Also when used as a retriever, GeAR does not add any computational burden over bi-encoders. To support the training of the new framework, we have introduced a pipeline to efficiently synthesize high-quality data by utilizing large language models. GeAR exhibits competitive retrieval and localization performance across diverse scenarios and datasets. Moreover, the qualitative analysis and the results generated by GeAR provide novel insights into the interpretation of retrieval results. The code, data, and models will be released after completing technical review to facilitate future research.', 'score': 11, 'issue_id': 1572, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'dafa87428ce906b5', 'authors': ['Haoyu Liu', 'Shaohan Huang', 'Jianfeng Liu', 'Yuefeng Zhan', 'Hao Sun', 'Weiwei Deng', 'Feng Sun', 'Furu Wei', 'Qi Zhang'], 'affiliations': ['Microsoft Corporation'], 'pdf_title_img': 'assets/pdf/title_img/2501.02772.jpg', 'data': {'categories': ['#interpretability', '#data', '#rag', '#synthetic', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'GeAR: Новый взгляд на извлечение документов через генерацию', 'desc': 'Статья предлагает новый метод извлечения документов под названием Generation Augmented Retrieval (GeAR). В отличие от традиционных би-энкодеров, GeAR использует модули слияния и декодирования для генерации релевантного текста на основе запроса и документа. Это позволяет модели фокусироваться на детальной информации, не увеличивая вычислительную нагрузку. Авторы также разработали конвейер для синтеза качественных данных с помощью больших языковых моделей для обучения GeAR.'}, 'en': {'title': 'GeAR: Enhancing Document Retrieval with Fine-Grained Semantic Focus', 'desc': 'This paper introduces a new method called Generation Augmented Retrieval (GeAR) that enhances document retrieval techniques by focusing on fine-grained semantic relationships. Unlike traditional bi-encoders that primarily assess global semantics, GeAR generates relevant text from documents by fusing the query and document representations. 
This approach allows for a deeper understanding of retrieval results without increasing computational costs. Additionally, the authors provide a pipeline for synthesizing high-quality training data using large language models, leading to improved performance across various datasets.'}, 'zh': {'title': '生成增强检索:关注细粒度信息的创新方法', 'desc': '本文提出了一种新的文档检索方法,称为生成增强检索(GeAR)。GeAR通过融合查询和文档的表示,生成相关文本,从而关注细粒度信息。与传统的双编码器方法相比,GeAR在检索时不会增加计算负担,同时在多种场景和数据集上表现出竞争力的检索和定位性能。该方法还通过利用大型语言模型合成高质量数据,支持新框架的训练。'}}}, {'id': 'https://huggingface.co/papers/2501.04144', 'title': 'Chirpy3D: Continuous Part Latents for Creative 3D Bird Generation', 'url': 'https://huggingface.co/papers/2501.04144', 'abstract': 'In this paper, we push the boundaries of fine-grained 3D generation into truly creative territory. Current methods either lack intricate details or simply mimic existing objects -- we enable both. By lifting 2D fine-grained understanding into 3D through multi-view diffusion and modeling part latents as continuous distributions, we unlock the ability to generate entirely new, yet plausible parts through interpolation and sampling. A self-supervised feature consistency loss further ensures stable generation of these unseen parts. The result is the first system capable of creating novel 3D objects with species-specific details that transcend existing examples. While we demonstrate our approach on birds, the underlying framework extends beyond things that can chirp! Code will be released at https://github.com/kamwoh/chirpy3d.', 'score': 9, 'issue_id': 1578, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '89e2fad397bf0684', 'authors': ['Kam Woh Ng', 'Jing Yang', 'Jia Wei Sii', 'Jiankang Deng', 'Chee Seng Chan', 'Yi-Zhe Song', 'Tao Xiang', 'Xiatian Zhu'], 'affiliations': ['Imperial College London', 'Universiti Malaya', 'University of Cambridge', 'University of Surrey'], 'pdf_title_img': 'assets/pdf/title_img/2501.04144.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🐦', 'ru': {'title': 'Генерация креативных 3D-моделей с беспрецедентной детализацией', 'desc': 'Эта статья представляет новый метод генерации детализированных 3D-объектов, выходящий за рамки простого копирования существующих примеров. Авторы используют мультиракурсную диффузию и моделирование латентных представлений частей объекта как непрерывных распределений. Это позволяет создавать совершенно новые, но правдоподобные части объектов путем интерполяции и сэмплирования. Самоконтролируемая функция потерь обеспечивает стабильную генерацию этих невиданных ранее частей.'}, 'en': {'title': 'Unlocking Creative 3D Generation with Fine-Grained Detail', 'desc': 'This paper introduces a novel approach to generating detailed 3D objects that are not just replicas of existing items. By utilizing multi-view diffusion and treating part latents as continuous distributions, the authors enable the creation of new and realistic 3D parts through interpolation and sampling techniques. A self-supervised feature consistency loss is implemented to maintain stability in generating these novel parts. 
The system is demonstrated on birds, showcasing its ability to produce unique species-specific details, while the framework is applicable to a broader range of objects.'}, 'zh': {'title': '突破性细粒度3D生成,创造全新物体!', 'desc': '本文提出了一种创新的细粒度3D生成方法,能够创造出全新的3D物体,而不仅仅是模仿现有物体。我们通过多视角扩散将2D细粒度理解提升到3D,并将部分潜变量建模为连续分布,从而实现了新部件的插值和采样生成。自监督特征一致性损失确保了这些未见部件的稳定生成。我们的系统能够生成具有特定物种细节的全新3D对象,超越了现有的示例。'}}}, {'id': 'https://huggingface.co/papers/2501.04689', 'title': 'SPAR3D: Stable Point-Aware Reconstruction of 3D Objects from Single Images', 'url': 'https://huggingface.co/papers/2501.04689', 'abstract': 'We study the problem of single-image 3D object reconstruction. Recent works have diverged into two directions: regression-based modeling and generative modeling. Regression methods efficiently infer visible surfaces, but struggle with occluded regions. Generative methods handle uncertain regions better by modeling distributions, but are computationally expensive and the generation is often misaligned with visible surfaces. In this paper, we present SPAR3D, a novel two-stage approach aiming to take the best of both directions. The first stage of SPAR3D generates sparse 3D point clouds using a lightweight point diffusion model, which has a fast sampling speed. The second stage uses both the sampled point cloud and the input image to create highly detailed meshes. Our two-stage design enables probabilistic modeling of the ill-posed single-image 3D task while maintaining high computational efficiency and great output fidelity. Using point clouds as an intermediate representation further allows for interactive user edits. Evaluated on diverse datasets, SPAR3D demonstrates superior performance over previous state-of-the-art methods, at an inference speed of 0.7 seconds. Project page with code and model: https://spar3d.github.io', 'score': 9, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '00474027a65aa27c', 'authors': ['Zixuan Huang', 'Mark Boss', 'Aaryaman Vasishta', 'James M. Rehg', 'Varun Jampani'], 'affiliations': ['Stability AI', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.04689.jpg', 'data': {'categories': ['#3d'], 'emoji': '🧊', 'ru': {'title': 'SPAR3D: Эффективная реконструкция 3D-объектов с использованием облаков точек', 'desc': 'В статье представлен новый двухэтапный подход SPAR3D для реконструкции 3D-объектов по одному изображению. На первом этапе генерируется разреженное облако точек с помощью легковесной модели диффузии точек. На втором этапе используются сгенерированное облако точек и исходное изображение для создания детализированных 3D-моделей. Этот метод сочетает преимущества регрессионного и генеративного моделирования, обеспечивая высокую вычислительную эффективность и качество результатов.'}, 'en': {'title': 'SPAR3D: Efficient and Detailed 3D Reconstruction from a Single Image', 'desc': 'This paper introduces SPAR3D, a new method for reconstructing 3D objects from a single image. It combines regression and generative modeling to efficiently create 3D point clouds and detailed meshes. The first stage generates sparse point clouds quickly, while the second stage refines these into high-quality meshes using the input image. 
SPAR3D achieves high fidelity and speed, outperforming existing methods and allowing for user interaction with the 3D output.'}, 'zh': {'title': 'SPAR3D:高效的单图像三维重建新方法', 'desc': '我们研究了单幅图像的三维物体重建问题。最近的研究分为两种方向:基于回归的建模和生成建模。回归方法能够有效推断可见表面,但在处理遮挡区域时表现不佳;而生成方法通过建模分布更好地处理不确定区域,但计算开销大且生成结果常常与可见表面不对齐。本文提出了SPAR3D,这是一种新颖的两阶段方法,旨在结合两种方法的优点,快速生成稀疏的三维点云,并利用输入图像创建高细节的网格。'}}}, {'id': 'https://huggingface.co/papers/2501.03271', 'title': 'DPO Kernels: A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich Paradigm for Direct Preference Optimization', 'url': 'https://huggingface.co/papers/2501.03271', 'abstract': 'The rapid rise of large language models (LLMs) has unlocked many applications but also underscores the challenge of aligning them with diverse values and preferences. Direct Preference Optimization (DPO) is central to alignment but constrained by fixed divergences and limited feature transformations. We propose DPO-Kernels, which integrates kernel methods to address these issues through four key contributions: (i) Kernelized Representations with polynomial, RBF, Mahalanobis, and spectral kernels for richer transformations, plus a hybrid loss combining embedding-based and probability-based objectives; (ii) Divergence Alternatives (Jensen-Shannon, Hellinger, Renyi, Bhattacharyya, Wasserstein, and f-divergences) for greater stability; (iii) Data-Driven Selection metrics that automatically choose the best kernel-divergence pair; and (iv) a Hierarchical Mixture of Kernels for both local precision and global modeling. Evaluations on 12 datasets demonstrate state-of-the-art performance in factuality, safety, reasoning, and instruction following. Grounded in Heavy-Tailed Self-Regularization, DPO-Kernels maintains robust generalization for LLMs, offering a comprehensive resource for further alignment research.', 'score': 5, 'issue_id': 1576, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '33d1640aee045ed5', 'authors': ['Amitava Das', 'Suranjana Trivedy', 'Danush Khanna', 'Rajarshi Roy', 'Gurpreet Singh', 'Basab Ghosh', 'Yaswanth Narsupalli', 'Vinija Jain', 'Vasu Sharma', 'Aishwarya Naresh Reganti', 'Aman Chadha'], 'affiliations': ['Amazon AI, USA', 'Artificial Intelligence Institute, University of South Carolina, USA', 'Meta AI, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03271.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#reasoning', '#dataset', '#training'], 'emoji': '🧠', 'ru': {'title': 'DPO-Kernels: Новый подход к выравниванию языковых моделей', 'desc': 'Статья представляет новый метод под названием DPO-Kernels для улучшения выравнивания больших языковых моделей (LLM) с различными ценностями и предпочтениями. Авторы предлагают использовать методы ядер для расширения возможностей прямой оптимизации предпочтений (DPO), включая кернелизованные представления, альтернативные дивергенции и data-driven выбор наилучшей комбинации ядра и дивергенции. DPO-Kernels демонстрирует улучшенные результаты в задачах фактологичности, безопасности, рассуждений и следования инструкциям на 12 наборах данных. Метод основан на саморегуляризации с тяжелыми хвостами и обеспечивает надежную генерализацию для LLM.'}, 'en': {'title': 'Enhancing LLM Alignment with DPO-Kernels', 'desc': 'This paper introduces DPO-Kernels, a method designed to improve the alignment of large language models (LLMs) with diverse user values. 
It enhances Direct Preference Optimization (DPO) by incorporating kernel methods, allowing for more flexible feature transformations and better divergence measures. The approach includes a hybrid loss function, various divergence alternatives, and data-driven selection metrics to optimize performance. Evaluations show that DPO-Kernels achieves state-of-the-art results in key areas such as factuality and safety across multiple datasets.'}, 'zh': {'title': 'DPO-Kernels:提升大型语言模型对齐的创新方法', 'desc': '大型语言模型(LLMs)的快速发展带来了许多应用,但也突显了与多样化价值观和偏好对齐的挑战。直接偏好优化(DPO)是对齐的核心,但受到固定散度和有限特征变换的限制。我们提出了DPO-Kernels,通过四个关键贡献来解决这些问题,包括使用多项式、RBF、Mahalanobis和谱核的核化表示,以及结合嵌入基础和基于概率的目标的混合损失。我们的评估在12个数据集上展示了在事实性、安全性、推理和指令遵循方面的最先进性能,DPO-Kernels为进一步的对齐研究提供了全面的资源。'}}}, {'id': 'https://huggingface.co/papers/2501.04694', 'title': 'EpiCoder: Encompassing Diversity and Complexity in Code Generation', 'url': 'https://huggingface.co/papers/2501.04694', 'abstract': 'Effective instruction tuning is indispensable for optimizing code LLMs, aligning model behavior with user expectations and enhancing model performance in real-world applications. However, most existing methods focus on code snippets, which are limited to specific functionalities and rigid structures, restricting the complexity and diversity of the synthesized data. To address these limitations, we introduce a novel feature tree-based synthesis framework inspired by Abstract Syntax Trees (AST). Unlike AST, which captures syntactic structure of code, our framework models semantic relationships between code elements, enabling the generation of more nuanced and diverse data. The feature tree is constructed from raw data and refined iteratively to increase the quantity and diversity of the extracted features. This process enables the identification of more complex patterns and relationships within the code. By sampling subtrees with controlled depth and breadth, our framework allows precise adjustments to the complexity of the generated code, supporting a wide range of tasks from simple function-level operations to intricate multi-file scenarios. We fine-tuned widely-used base models to create the EpiCoder series, achieving state-of-the-art performance at both the function and file levels across multiple benchmarks. Notably, empirical evidence indicates that our approach shows significant potential in synthesizing highly complex repository-level code data. Further analysis elucidates the merits of this approach by rigorously assessing data complexity and diversity through software engineering principles and LLM-as-a-judge method.', 'score': 4, 'issue_id': 1581, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c1ef93cdfc23c2f', 'authors': ['Yaoxiang Wang', 'Haoling Li', 'Xin Zhang', 'Jie Wu', 'Xiao Liu', 'Wenxiang Hu', 'Zhongxin Guo', 'Yangyu Huang', 'Ying Xin', 'Yujiu Yang', 'Jinsong Su', 'Qi Chen', 'Scarlett Li'], 'affiliations': ['Microsoft', 'Tsinghua University', 'Xiamen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04694.jpg', 'data': {'categories': ['#dataset', '#data', '#synthetic', '#training', '#optimization', '#alignment', '#architecture'], 'emoji': '🌳', 'ru': {'title': 'Дерево признаков: новый путь к улучшению языковых моделей для кода', 'desc': 'Статья представляет новый подход к улучшению языковых моделей для программирования с использованием дерева признаков, вдохновленного абстрактными синтаксическими деревьями. 
Этот метод позволяет генерировать более сложные и разнообразные обучающие данные, моделируя семантические связи между элементами кода. Авторы создали серию моделей EpiCoder, достигших высоких результатов в нескольких бенчмарках. Эмпирические данные показывают потенциал метода для синтеза сложных репозиториев кода.'}, 'en': {'title': 'Unlocking Code Complexity with Feature Trees', 'desc': 'This paper presents a new framework for instruction tuning in code language models (LLMs) that enhances their performance by generating more complex and diverse code data. The proposed feature tree-based synthesis framework goes beyond traditional code snippet methods by modeling semantic relationships between code elements, inspired by Abstract Syntax Trees (AST). By iteratively refining the feature tree, the framework captures intricate patterns and relationships, allowing for the generation of code that ranges from simple functions to complex multi-file scenarios. The authors demonstrate that their fine-tuned EpiCoder models achieve state-of-the-art results across various benchmarks, highlighting the effectiveness of their approach in synthesizing complex repository-level code data.'}, 'zh': {'title': '特征树框架:提升代码生成的复杂性与多样性', 'desc': '本论文提出了一种新的特征树合成框架,用于优化代码大语言模型(LLMs)的指令调优。该框架通过建模代码元素之间的语义关系,克服了现有方法在功能和结构上的局限性,从而生成更复杂和多样化的数据。特征树从原始数据构建,并通过迭代精炼,增加提取特征的数量和多样性。最终,我们通过微调广泛使用的基础模型,创建了EpiCoder系列,在多个基准测试中实现了函数和文件级别的最先进性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04652', 'title': 'Multi-task retriever fine-tuning for domain-specific and efficient RAG', 'url': 'https://huggingface.co/papers/2501.04652', 'abstract': 'Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying Large Language Models (LLMs), as it can address typical limitations such as generating hallucinated or outdated information. However, when building real-world RAG applications, practical issues arise. First, the retrieved information is generally domain-specific. Since it is computationally expensive to fine-tune LLMs, it is more feasible to fine-tune the retriever to improve the quality of the data included in the LLM input. Second, as more applications are deployed in the same real-world system, one cannot afford to deploy separate retrievers. Moreover, these RAG applications normally retrieve different kinds of data. Our solution is to instruction fine-tune a small retriever encoder on a variety of domain-specific tasks to allow us to deploy one encoder that can serve many use cases, thereby achieving low-cost, scalability, and speed. We show how this encoder generalizes to out-of-domain settings as well as to an unseen retrieval task on real-world enterprise use cases.', 'score': 1, 'issue_id': 1584, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c906eb3ec9e3da5', 'authors': ['Patrice Béchard', 'Orlando Marquez Ayala'], 'affiliations': ['ServiceNow'], 'pdf_title_img': 'assets/pdf/title_img/2501.04652.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#hallucinations', '#rag', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Универсальный извлекатель информации для эффективного RAG', 'desc': 'Данная статья представляет новый подход к улучшению систем извлечения информации для крупных языковых моделей. Авторы предлагают дообучать небольшой энкодер для извлечения информации на различных доменно-специфичных задачах. Это позволяет использовать один энкодер для множества приложений, обеспечивая масштабируемость и эффективность. 
Исследование показывает, что такой подход хорошо обобщается на новые домены и задачи извлечения информации в реальных корпоративных сценариях.'}, 'en': {'title': 'One Retriever to Rule Them All: Scalable RAG Solutions', 'desc': 'This paper discusses the challenges of using Retrieval-Augmented Generation (RAG) with Large Language Models (LLMs), particularly the issues of domain-specific information retrieval and the high cost of fine-tuning LLMs. The authors propose a solution that involves instruction fine-tuning a small retriever encoder on multiple domain-specific tasks, allowing it to serve various applications without needing separate retrievers. This approach enhances the quality of data fed into the LLM while maintaining low costs and scalability. The results demonstrate that the fine-tuned encoder can effectively generalize to new, unseen tasks in real-world scenarios.'}, 'zh': {'title': '一个编码器,多种应用,低成本高效能', 'desc': '检索增强生成(RAG)在部署大型语言模型(LLM)时变得非常普遍,因为它可以解决生成虚假或过时信息的典型问题。本文提出了一种解决方案,通过对小型检索器编码器进行指令微调,使其能够在多种特定领域任务上工作,从而实现一个编码器服务多个用例。这样可以降低成本,提高可扩展性和速度,同时避免为每个应用程序部署单独的检索器。我们的实验表明,该编码器在不同领域设置和未见过的检索任务中也能很好地泛化。'}}}, {'id': 'https://huggingface.co/papers/2501.05874', 'title': 'VideoRAG: Retrieval-Augmented Generation over Video Corpus', 'url': 'https://huggingface.co/papers/2501.05874', 'abstract': 'Retrieval-Augmented Generation (RAG) is a powerful strategy to address the issue of generating factually incorrect outputs in foundation models by retrieving external knowledge relevant to queries and incorporating it into their generation process. However, existing RAG approaches have primarily focused on textual information, with some recent advancements beginning to consider images, and they largely overlook videos, a rich source of multimodal knowledge capable of representing events, processes, and contextual details more effectively than any other modality. While a few recent studies explore the integration of videos in the response generation process, they either predefine query-associated videos without retrieving them according to queries, or convert videos into the textual descriptions without harnessing their multimodal richness. To tackle these, we introduce VideoRAG, a novel framework that not only dynamically retrieves relevant videos based on their relevance with queries but also utilizes both visual and textual information of videos in the output generation. Further, to operationalize this, our method revolves around the recent advance of Large Video Language Models (LVLMs), which enable the direct processing of video content to represent it for retrieval and seamless integration of the retrieved videos jointly with queries. We experimentally validate the effectiveness of VideoRAG, showcasing that it is superior to relevant baselines.', 'score': 39, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'a6a86d4d49a42b4d', 'authors': ['Soyeong Jeong', 'Kangsan Kim', 'Jinheon Baek', 'Sung Ju Hwang'], 'affiliations': ['DeepAuto.ai', 'KAIST'], 'pdf_title_img': 'assets/pdf/title_img/2501.05874.jpg', 'data': {'categories': ['#multimodal', '#rag', '#interpretability', '#hallucinations', '#video'], 'emoji': '🎥', 'ru': {'title': 'VideoRAG: Обогащение генерации ответов с помощью видеоконтента', 'desc': 'VideoRAG - это новая система для улучшения генерации ответов с использованием видеоконтента. 
В отличие от существующих подходов, она динамически извлекает релевантные видео и использует как визуальную, так и текстовую информацию из них. VideoRAG основан на Больших Видеоязыковых Моделях (LVLM), которые позволяют напрямую обрабатывать видеоконтент. Экспериментальные результаты показывают превосходство VideoRAG над существующими методами.'}, 'en': {'title': 'Enhancing Generation with Dynamic Video Retrieval', 'desc': "This paper presents VideoRAG, a new framework that enhances the Retrieval-Augmented Generation (RAG) approach by incorporating video content into the generation process. Unlike previous methods that primarily focused on text or predefined videos, VideoRAG dynamically retrieves relevant videos based on the user's query. It leverages both visual and textual information from the videos, allowing for a richer and more accurate output generation. The framework utilizes Large Video Language Models (LVLMs) to effectively process and integrate video content, demonstrating superior performance compared to existing methods."}, 'zh': {'title': '视频检索增强生成:提升多模态知识的利用', 'desc': '检索增强生成(RAG)是一种强大的策略,用于解决基础模型生成事实不准确输出的问题。现有的RAG方法主要集中在文本信息上,最近的一些进展开始考虑图像,但大多数忽视了视频这一丰富的多模态知识源。我们提出了VideoRAG框架,它不仅根据查询动态检索相关视频,还利用视频的视觉和文本信息进行输出生成。实验结果验证了VideoRAG的有效性,显示其优于相关基线。'}}}, {'id': 'https://huggingface.co/papers/2501.03841', 'title': 'OmniManip: Towards General Robotic Manipulation via Object-Centric Interaction Primitives as Spatial Constraints', 'url': 'https://huggingface.co/papers/2501.03841', 'abstract': "The development of general robotic systems capable of manipulating in unstructured environments is a significant challenge. While Vision-Language Models(VLM) excel in high-level commonsense reasoning, they lack the fine-grained 3D spatial understanding required for precise manipulation tasks. Fine-tuning VLM on robotic datasets to create Vision-Language-Action Models(VLA) is a potential solution, but it is hindered by high data collection costs and generalization issues. To address these challenges, we propose a novel object-centric representation that bridges the gap between VLM's high-level reasoning and the low-level precision required for manipulation. Our key insight is that an object's canonical space, defined by its functional affordances, provides a structured and semantically meaningful way to describe interaction primitives, such as points and directions. These primitives act as a bridge, translating VLM's commonsense reasoning into actionable 3D spatial constraints. In this context, we introduce a dual closed-loop, open-vocabulary robotic manipulation system: one loop for high-level planning through primitive resampling, interaction rendering and VLM checking, and another for low-level execution via 6D pose tracking. This design ensures robust, real-time control without requiring VLM fine-tuning. 
Extensive experiments demonstrate strong zero-shot generalization across diverse robotic manipulation tasks, highlighting the potential of this approach for automating large-scale simulation data generation.", 'score': 37, 'issue_id': 1628, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c2dc8cc20b9b990a', 'authors': ['Mingjie Pan', 'Jiyao Zhang', 'Tianshu Wu', 'Yinghao Zhao', 'Wenlong Gao', 'Hao Dong'], 'affiliations': ['AgiBot', 'CFCS, School of CS, Peking University', 'PKU-AgiBot Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.03841.jpg', 'data': {'categories': ['#agents', '#reasoning', '#robotics', '#3d', '#transfer_learning', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Объектно-ориентированный подход к роботизированной манипуляции с использованием VLM', 'desc': 'Статья представляет новый подход к робототехнике, объединяющий возможности моделей визуального языка (VLM) с точным 3D-пониманием, необходимым для манипуляций. Авторы предлагают объектно-ориентированное представление, использующее каноническое пространство объекта для описания примитивов взаимодействия. Система включает два цикла: планирование высокого уровня с использованием VLM и низкоуровневое выполнение с отслеживанием 6D-позы. Эксперименты показывают сильную обобщающую способность в различных задачах робототехнической манипуляции.'}, 'en': {'title': 'Bridging High-Level Reasoning and Low-Level Manipulation in Robotics', 'desc': "This paper addresses the challenge of enabling robots to manipulate objects in unpredictable environments by enhancing Vision-Language Models (VLM) with a new approach. The authors propose a Vision-Language-Action Model (VLA) that utilizes an object-centric representation, focusing on an object's canonical space defined by its functional affordances. This representation helps translate high-level reasoning from VLM into specific 3D spatial actions needed for manipulation tasks. The proposed dual closed-loop system allows for effective planning and execution without the need for extensive fine-tuning, demonstrating strong performance in various robotic tasks."}, 'zh': {'title': '打破高层推理与低层操作的壁垒', 'desc': '本论文探讨了在非结构化环境中操作的通用机器人系统的开发挑战。虽然视觉-语言模型(VLM)在高层次的常识推理方面表现出色,但缺乏精细的三维空间理解能力。我们提出了一种新颖的以对象为中心的表示方法,旨在弥合VLM的高层推理与操作所需的低层精度之间的差距。通过引入双闭环、开放词汇的机器人操作系统,我们实现了高效的实时控制,且无需对VLM进行微调。'}}}, {'id': 'https://huggingface.co/papers/2501.06186', 'title': 'LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs', 'url': 'https://huggingface.co/papers/2501.06186', 'abstract': "Reasoning is a fundamental capability for solving complex multi-step problems, particularly in visual contexts where sequential step-wise understanding is essential. Existing approaches lack a comprehensive framework for evaluating visual reasoning and do not emphasize step-wise problem-solving. To this end, we propose a comprehensive framework for advancing step-by-step visual reasoning in large language models (LMMs) through three key contributions. First, we introduce a visual reasoning benchmark specifically designed to evaluate multi-step reasoning tasks. The benchmark presents a diverse set of challenges with eight different categories ranging from complex visual perception to scientific reasoning with over 4k reasoning steps in total, enabling robust evaluation of LLMs' abilities to perform accurate and interpretable visual reasoning across multiple steps. 
Second, we propose a novel metric that assesses visual reasoning quality at the granularity of individual steps, emphasizing both correctness and logical coherence. The proposed metric offers deeper insights into reasoning performance compared to traditional end-task accuracy metrics. Third, we present a new multimodal visual reasoning model, named LlamaV-o1, trained using a multi-step curriculum learning approach, where tasks are progressively organized to facilitate incremental skill acquisition and problem-solving. The proposed LlamaV-o1 is designed for multi-step reasoning and learns step-by-step through a structured training paradigm. Extensive experiments show that our LlamaV-o1 outperforms existing open-source models and performs favorably against close-source proprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an average score of 67.3 with an absolute gain of 3.8\\% across six benchmarks while being 5 times faster during inference scaling. Our benchmark, model, and code are publicly available.", 'score': 31, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '40e1a0d2c562cda5', 'authors': ['Omkar Thawakar', 'Dinura Dissanayake', 'Ketan More', 'Ritesh Thawkar', 'Ahmed Heakl', 'Noor Ahsan', 'Yuhao Li', 'Mohammed Zumri', 'Jean Lahoud', 'Rao Muhammad Anwer', 'Hisham Cholakkal', 'Ivan Laptev', 'Mubarak Shah', 'Fahad Shahbaz Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linköping University', 'Mohamed bin Zayed University of AI', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2501.06186.jpg', 'data': {'categories': ['#cv', '#benchmark', '#training', '#multimodal', '#open_source', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к совершенному визуальному рассуждению', 'desc': 'Статья представляет комплексный подход к улучшению пошагового визуального рассуждения в больших языковых моделях (LLM). Авторы вводят новый бенчмарк для оценки многошаговых задач визуального рассуждения и метрику для оценки качества рассуждения на уровне отдельных шагов. Они также предлагают новую мультимодальную модель визуального рассуждения LlamaV-o1, обученную с использованием подхода многоступенчатого куррикулярного обучения. Эксперименты показывают, что LlamaV-o1 превосходит существующие модели с открытым исходным кодом и демонстрирует хорошие результаты по сравнению с проприетарными моделями.'}, 'en': {'title': 'Advancing Step-by-Step Visual Reasoning in LLMs', 'desc': "This paper introduces a new framework to enhance visual reasoning in large language models (LLMs) by focusing on step-by-step problem-solving. It presents a visual reasoning benchmark with over 4,000 reasoning steps across eight categories, allowing for thorough evaluation of LLMs' multi-step reasoning capabilities. Additionally, a novel metric is proposed to assess the quality of visual reasoning at each step, providing insights beyond traditional accuracy measures. 
The authors also introduce LlamaV-o1, a multimodal model trained with a curriculum learning approach, which shows significant performance improvements over existing models."}, 'zh': {'title': '提升视觉推理能力的全新框架', 'desc': '本论文提出了一种新的框架,旨在提升大型语言模型(LLMs)在视觉推理中的逐步推理能力。我们设计了一个视觉推理基准,包含多达4000个推理步骤,涵盖复杂的视觉感知和科学推理等八个类别,以便全面评估模型的推理能力。我们还提出了一种新颖的度量标准,专注于逐步推理的正确性和逻辑一致性,提供比传统的任务准确率更深入的洞察。最后,我们介绍了名为LlamaV-o1的多模态视觉推理模型,通过逐步课程学习的方法进行训练,显著提升了推理性能。'}}}, {'id': 'https://huggingface.co/papers/2501.05510', 'title': 'OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding?', 'url': 'https://huggingface.co/papers/2501.05510', 'abstract': 'Temporal Awareness, the ability to reason dynamically based on the timestamp when a question is raised, is the key distinction between offline and online video LLMs. Unlike offline models, which rely on complete videos for static, post hoc analysis, online models process video streams incrementally and dynamically adapt their responses based on the timestamp at which the question is posed. Despite its significance, temporal awareness has not been adequately evaluated in existing benchmarks. To fill this gap, we present OVO-Bench (Online-VideO-Benchmark), a novel video benchmark that emphasizes the importance of timestamps for advanced online video understanding capability benchmarking. OVO-Bench evaluates the ability of video LLMs to reason and respond to events occurring at specific timestamps under three distinct scenarios: (1) Backward tracing: trace back to past events to answer the question. (2) Real-time understanding: understand and respond to events as they unfold at the current timestamp. (3) Forward active responding: delay the response until sufficient future information becomes available to answer the question accurately. OVO-Bench comprises 12 tasks, featuring 644 unique videos and approximately human-curated 2,800 fine-grained meta-annotations with precise timestamps. We combine automated generation pipelines with human curation. With these high-quality samples, we further developed an evaluation pipeline to systematically query video LLMs along the video timeline. Evaluations of nine Video-LLMs reveal that, despite advancements on traditional benchmarks, current models struggle with online video understanding, showing a significant gap compared to human agents. We hope OVO-Bench will drive progress in video LLMs and inspire future research in online video reasoning. Our benchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.', 'score': 26, 'issue_id': 1631, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '6f833a01519603d5', 'authors': ['Yifei Li', 'Junbo Niu', 'Ziyang Miao', 'Chunjiang Ge', 'Yuanhang Zhou', 'Qihao He', 'Xiaoyi Dong', 'Haodong Duan', 'Shuangrui Ding', 'Rui Qian', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Conghui He', 'Jiaqi Wang'], 'affiliations': ['Beihang University', 'Communication University of China', 'SenseTime Group', 'Shanghai Artificial Intelligence Laboratory', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05510.jpg', 'data': {'categories': ['#benchmark', '#survey', '#video', '#reasoning'], 'emoji': '⏱️', 'ru': {'title': 'Временная осведомленность как ключ к онлайн-анализу видео для LLM', 'desc': 'Статья представляет новый бенчмарк OVO-Bench для оценки способности видео-LLM моделей к онлайн-анализу видео с учетом временных меток. 
Бенчмарк включает 12 задач, 644 уникальных видео и около 2800 мета-аннотаций с точными временными метками. OVO-Bench оценивает три сценария: обратное отслеживание, понимание в реальном времени и активное реагирование на будущие события. Результаты тестирования девяти видео-LLM моделей показывают значительное отставание от человеческих возможностей в онлайн-анализе видео.'}, 'en': {'title': 'Enhancing Online Video Understanding with Temporal Awareness', 'desc': 'This paper introduces OVO-Bench, a new benchmark designed to evaluate the temporal awareness of online video language models (LLMs). Unlike offline models that analyze complete videos, online models must dynamically respond to questions based on the specific timestamp of the inquiry. OVO-Bench assesses video LLMs through three scenarios: backward tracing, real-time understanding, and forward active responding, using a dataset of 644 videos and 2,800 meta-annotations. The findings indicate that current video LLMs still lag behind human performance in understanding and reasoning about events in real-time video streams.'}, 'zh': {'title': '提升视频理解能力的时间意识基准', 'desc': '本文提出了OVO-Bench,这是一个新的视频基准,旨在评估视频大语言模型(LLMs)在时间意识方面的能力。时间意识是指模型根据提问时的时间戳动态推理的能力,这与传统的离线模型不同,后者依赖于完整视频进行静态分析。OVO-Bench包含12个任务,使用644个独特视频和约2800个精细的元注释,强调了时间戳在在线视频理解中的重要性。通过对九个视频LLMs的评估,结果显示当前模型在在线视频理解方面仍存在显著差距,远不及人类代理。'}}}, {'id': 'https://huggingface.co/papers/2501.05727', 'title': 'Enabling Scalable Oversight via Self-Evolving Critic', 'url': 'https://huggingface.co/papers/2501.05727', 'abstract': "Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3\\% improvement on critique-correction and error identification benchmarks. 
Our analysis reveals that SCRIT's performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component.", 'score': 17, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '5a9e3b95b6aa1312', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc., Beijing, China', 'Shenzhen Research Institute of Big Data, Shenzhen, China', 'The Chinese University of Hong Kong, Shenzhen, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.05727.jpg', 'data': {'categories': ['#training', '#benchmark', '#optimization', '#rlhf', '#synthetic'], 'emoji': '🔬', 'ru': {'title': 'SCRIT: Самосовершенствующийся критик для LLM', 'desc': 'SCRIT - это новая система для улучшения способностей больших языковых моделей (LLM) к самокритике без внешнего надзора. Она использует синтетические данные, созданные с помощью самокритика на основе контрастного обучения и механизма самопроверки. Реализованная на базе Qwen2.5-72B-Instruct, SCRIT демонстрирует значительное улучшение в задачах критики-коррекции и идентификации ошибок. Анализ показывает, что производительность SCRIT растет с увеличением объема данных и размера модели.'}, 'en': {'title': 'Empowering LLMs with Self-Evolving Critique', 'desc': 'This paper addresses the challenge of providing effective feedback for Large Language Models (LLMs) in tasks where human evaluation is difficult. It introduces SCRIT (Self-evolving CRITic), a framework that enhances the critique capabilities of LLMs without relying on external supervision. SCRIT utilizes synthetic data generated by a contrastive-based self-critic and incorporates a self-validation mechanism to ensure the quality of critiques. The results show that SCRIT significantly improves critique-correction and error identification benchmarks, demonstrating its effectiveness as LLMs scale in size and data.'}, 'zh': {'title': '自我进化,提升批评能力!', 'desc': '尽管大型语言模型(LLMs)表现出色,但在可扩展监督方面面临挑战,特别是在难以进行人类评估的任务中。本文提出了SCRIT(自我进化批评者)框架,旨在提升模型的自我批评能力。SCRIT通过对比自我批评生成合成数据,并利用自我验证机制确保批评质量,从而实现自我改进。实验结果表明,SCRIT在批评纠正和错误识别基准上提高了10.3%的性能,且其表现随着数据和模型规模的增加而提升。'}}}, {'id': 'https://huggingface.co/papers/2501.05452', 'title': 'ReFocus: Visual Editing as a Chain of Thought for Structured Image Understanding', 'url': 'https://huggingface.co/papers/2501.05452', 'abstract': 'Structured image understanding, such as interpreting tables and charts, requires strategically refocusing across various structures and texts within an image, forming a reasoning sequence to arrive at the final answer. However, current multimodal large language models (LLMs) lack this multihop selective attention capability. In this work, we introduce ReFocus, a simple yet effective framework that equips multimodal LLMs with the ability to generate "visual thoughts" by performing visual editing on the input image through code, shifting and refining their visual focuses. Specifically, ReFocus enables multimodal LLMs to generate Python codes to call tools and modify the input image, sequentially drawing boxes, highlighting sections, and masking out areas, thereby enhancing the visual reasoning process. We experiment upon a wide range of structured image understanding tasks involving tables and charts. 
ReFocus largely improves performance on all tasks over GPT-4o without visual editing, yielding an average gain of 11.0% on table tasks and 6.8% on chart tasks. We present an in-depth analysis of the effects of different visual edits, and reasons why ReFocus can improve the performance without introducing additional information. Further, we collect a 14k training set using ReFocus, and prove that such visual chain-of-thought with intermediate information offers a better supervision than standard VQA data, reaching a 8.0% average gain over the same model trained with QA pairs and 2.6% over CoT.', 'score': 7, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '28a63b60414f99da', 'authors': ['Xingyu Fu', 'Minqian Liu', 'Zhengyuan Yang', 'John Corring', 'Yijuan Lu', 'Jianwei Yang', 'Dan Roth', 'Dinei Florencio', 'Cha Zhang'], 'affiliations': ['Microsoft', 'University of Pennsylvania', 'Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.05452.jpg', 'data': {'categories': ['#multimodal', '#interpretability', '#dataset', '#reasoning', '#training', '#cv'], 'emoji': '🔍', 'ru': {'title': 'ReFocus: Улучшение визуального понимания LLM через управляемое редактирование изображений', 'desc': "Статья представляет ReFocus - фреймворк, который наделяет мультимодальные большие языковые модели (LLM) способностью генерировать 'визуальные мысли' путем редактирования входного изображения с помощью кода. ReFocus позволяет LLM последовательно рисовать рамки, выделять секции и маскировать области, улучшая процесс визуального рассуждения. Эксперименты показывают значительное улучшение производительности на задачах понимания структурированных изображений, таких как таблицы и диаграммы. Авторы также доказывают, что визуальная цепочка рассуждений с промежуточной информацией обеспечивает лучшее обучение, чем стандартные данные VQA."}, 'en': {'title': 'Enhancing Visual Reasoning with ReFocus', 'desc': "This paper presents ReFocus, a framework designed to enhance the capabilities of multimodal large language models (LLMs) in structured image understanding tasks, such as interpreting tables and charts. ReFocus allows these models to generate 'visual thoughts' by performing visual edits on input images, which helps them focus on relevant areas and improve their reasoning processes. The framework enables the generation of Python code to manipulate images, such as drawing boxes and highlighting sections, which significantly boosts performance on various tasks. Experimental results show that ReFocus achieves notable improvements over existing models, demonstrating the effectiveness of visual editing in enhancing visual reasoning without adding new information."}, 'zh': {'title': 'ReFocus:提升多模态模型的视觉推理能力', 'desc': '本论文提出了一种名为ReFocus的框架,旨在提升多模态大语言模型在结构化图像理解任务中的表现。ReFocus通过生成Python代码对输入图像进行视觉编辑,使模型能够逐步调整视觉焦点,从而形成更有效的推理过程。实验结果表明,ReFocus在表格和图表任务上显著提高了性能,平均提升分别为11.0%和6.8%。此外,研究还表明,使用ReFocus生成的视觉链式思维提供了比标准问答数据更好的监督效果。'}}}, {'id': 'https://huggingface.co/papers/2501.04698', 'title': 'ConceptMaster: Multi-Concept Video Customization on Diffusion Transformer Models Without Test-Time Tuning', 'url': 'https://huggingface.co/papers/2501.04698', 'abstract': 'Text-to-video generation has made remarkable advancements through diffusion models. However, Multi-Concept Video Customization (MCVC) remains a significant challenge. 
We identify two key challenges in this task: 1) the identity decoupling problem, where directly adopting existing customization methods inevitably mix attributes when handling multiple concepts simultaneously, and 2) the scarcity of high-quality video-entity pairs, which is crucial for training such a model that represents and decouples various concepts well. To address these challenges, we introduce ConceptMaster, an innovative framework that effectively tackles the critical issues of identity decoupling while maintaining concept fidelity in customized videos. Specifically, we introduce a novel strategy of learning decoupled multi-concept embeddings that are injected into the diffusion models in a standalone manner, which effectively guarantees the quality of customized videos with multiple identities, even for highly similar visual concepts. To further overcome the scarcity of high-quality MCVC data, we carefully establish a data construction pipeline, which enables systematic collection of precise multi-concept video-entity data across diverse concepts. A comprehensive benchmark is designed to validate the effectiveness of our model from three critical dimensions: concept fidelity, identity decoupling ability, and video generation quality across six different concept composition scenarios. Extensive experiments demonstrate that our ConceptMaster significantly outperforms previous approaches for this task, paving the way for generating personalized and semantically accurate videos across multiple concepts.', 'score': 6, 'issue_id': 1631, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '6e82dc0b883c447a', 'authors': ['Yuzhou Huang', 'Ziyang Yuan', 'Quande Liu', 'Qiulin Wang', 'Xintao Wang', 'Ruimao Zhang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai'], 'affiliations': ['Kuaishou Technology', 'Sun Yat-sen University', 'The Chinese University of Hong Kong, Shenzhen', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04698.jpg', 'data': {'categories': ['#diffusion', '#benchmark', '#data', '#video', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'ConceptMaster: новый уровень персонализации в генерации видео', 'desc': 'Статья представляет ConceptMaster - новую систему для генерации видео с множественными персонализированными концептами. Авторы решают проблему смешивания атрибутов при одновременной работе с несколькими концептами, предлагая метод обучения раздельных мультиконцептуальных эмбеддингов. Для преодоления нехватки качественных данных разработан специальный конвейер сбора видео-сущностных пар. Эксперименты показывают превосходство ConceptMaster над существующими подходами в точности концептов, способности разделения идентичностей и качестве генерации видео.'}, 'en': {'title': 'Mastering Multi-Concept Video Customization with ConceptMaster', 'desc': "This paper presents ConceptMaster, a new framework for Multi-Concept Video Customization (MCVC) that addresses two main challenges: identity decoupling and the lack of high-quality video-entity pairs. The identity decoupling problem arises when existing methods mix attributes from different concepts, leading to poor customization results. ConceptMaster introduces a novel approach to learn decoupled multi-concept embeddings, which are integrated into diffusion models to ensure high-quality video outputs with distinct identities. 
Additionally, the authors establish a data construction pipeline to systematically gather diverse multi-concept video-entity data, and they validate their model's effectiveness through comprehensive benchmarks across various scenarios."}, 'zh': {'title': 'ConceptMaster:多概念视频定制的新突破', 'desc': '本文介绍了一种名为ConceptMaster的创新框架,旨在解决多概念视频定制中的身份解耦问题和高质量视频实体对的稀缺性。我们提出了一种新的学习策略,通过独立注入解耦的多概念嵌入到扩散模型中,从而保证定制视频的质量。为了克服高质量MCVC数据的不足,我们建立了一个数据构建管道,系统性地收集多概念视频实体数据。实验结果表明,ConceptMaster在概念保真度、身份解耦能力和视频生成质量等方面显著优于之前的方法。'}}}, {'id': 'https://huggingface.co/papers/2501.05707', 'title': 'Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains', 'url': 'https://huggingface.co/papers/2501.05707', 'abstract': 'Large language models (LLMs) have achieved remarkable performance in recent years but are fundamentally limited by the underlying training data. To improve models beyond the training data, recent works have explored how LLMs can be used to generate synthetic data for autonomous self-improvement. However, successive steps of self-improvement can reach a point of diminishing returns. In this work, we propose a complementary approach towards self-improvement where finetuning is applied to a multiagent society of language models. A group of language models, all starting from the same base model, are independently specialized by updating each one using data generated through multiagent interactions among the models. By training each model on independent sets of data, we illustrate how this approach enables specialization across models and diversification over the set of models. As a result, our overall system is able to preserve diverse reasoning chains and autonomously improve over many more rounds of fine-tuning than single-agent self-improvement methods. We quantitatively illustrate the efficacy of the approach across a wide suite of reasoning tasks.', 'score': 5, 'issue_id': 1629, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '3d75785114d08414', 'authors': ['Vighnesh Subramaniam', 'Yilun Du', 'Joshua B. Tenenbaum', 'Antonio Torralba', 'Shuang Li', 'Igor Mordatch'], 'affiliations': ['Google Deepmind', 'Harvard University', 'MIT CSAIL', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05707.jpg', 'data': {'categories': ['#synthetic', '#reasoning', '#training', '#agents'], 'emoji': '🤖', 'ru': {'title': 'Мультиагентное обучение: новый путь к улучшению языковых моделей', 'desc': 'Эта статья представляет новый подход к улучшению больших языковых моделей (LLM) с помощью мультиагентного обучения. Авторы предлагают создать группу моделей, которые взаимодействуют друг с другом для генерации синтетических данных. Каждая модель специализируется на своем наборе данных, что позволяет сохранить разнообразие рассуждений. Этот метод показывает лучшие результаты по сравнению с одноагентными подходами к самоулучшению на различных задачах рассуждения.'}, 'en': {'title': 'Empowering Language Models through Multiagent Self-Improvement', 'desc': 'This paper discusses a new method for improving large language models (LLMs) by using a multiagent system. Instead of relying solely on the original training data, the authors propose that multiple LLMs can interact and generate their own synthetic data, which they then use to fine-tune themselves. This approach allows each model to specialize in different areas, leading to a more diverse set of reasoning capabilities. 
The results show that this multiagent fine-tuning method can enhance performance over many iterations, surpassing traditional single-agent self-improvement techniques.'}, 'zh': {'title': '多智能体模型的自我改进新方法', 'desc': '大型语言模型(LLMs)在最近几年取得了显著的性能,但其根本上受到训练数据的限制。为了超越训练数据,最近的研究探索了如何利用LLMs生成合成数据以实现自主自我改进。本文提出了一种补充的方法,通过在多智能体语言模型的社会中进行微调,来实现自我改进。通过独立训练每个模型,利用模型之间的多智能体交互生成的数据,我们展示了这种方法如何实现模型的专业化和多样化,从而在多个微调轮次中保持多样的推理链。'}}}, {'id': 'https://huggingface.co/papers/2501.04961', 'title': 'Demystifying Domain-adaptive Post-training for Financial LLMs', 'url': 'https://huggingface.co/papers/2501.04961', 'abstract': 'Domain-adaptive post-training of large language models (LLMs) has emerged as a promising approach for specialized domains such as medicine and finance. However, significant challenges remain in identifying optimal adaptation criteria and training strategies across varying data and model configurations. To address these challenges, we introduce FINDAP, a systematic and fine-grained investigation into domain-adaptive post-training of LLMs for the finance domain. Our approach begins by identifying the core capabilities required for the target domain and designing a comprehensive evaluation suite aligned with these needs. We then analyze the effectiveness of key post-training stages, including continual pretraining, instruction tuning, and preference alignment. Building on these insights, we propose an effective training recipe centered on a novel preference data distillation method, which leverages process signals from a generative reward model. The resulting model, Llama-Fin, achieves state-of-the-art performance across a wide range of financial tasks. Our analysis also highlights how each post-training stage contributes to distinct capabilities, uncovering specific challenges and effective solutions, providing valuable insights for domain adaptation of LLMs. Project page: https://github.com/SalesforceAIResearch/FinDap', 'score': 4, 'issue_id': 1642, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ade3590d1cc29d47', 'authors': ['Zixuan Ke', 'Yifei Ming', 'Xuan-Phi Nguyen', 'Caiming Xiong', 'Shafiq Joty'], 'affiliations': ['Salesforce AI Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.04961.jpg', 'data': {'categories': ['#optimization', '#rlhf', '#healthcare', '#transfer_learning', '#training'], 'emoji': '💹', 'ru': {'title': 'Оптимизация LLM для финансов: от анализа до совершенства', 'desc': 'Статья представляет FINDAP - систематический подход к доменно-адаптивному постобучению больших языковых моделей (LLM) для финансовой сферы. Авторы разработали комплексный набор оценок, анализирующий эффективность ключевых этапов постобучения, включая продолжающееся предобучение, инструктивную настройку и выравнивание предпочтений. Предложен эффективный рецепт обучения, основанный на новом методе дистилляции данных предпочтений. Результирующая модель Llama-Fin достигает передовых результатов в широком спектре финансовых задач.'}, 'en': {'title': 'FINDAP: Tailoring LLMs for Finance Excellence', 'desc': 'This paper presents FINDAP, a method for improving large language models (LLMs) specifically for the finance sector through domain-adaptive post-training. It identifies essential capabilities needed for financial tasks and creates a tailored evaluation suite to measure these capabilities. The study examines various post-training techniques, such as continual pretraining and instruction tuning, to determine their effectiveness. 
Ultimately, the authors introduce Llama-Fin, a model that utilizes a novel preference data distillation method, achieving top performance in financial applications while providing insights into the adaptation process.'}, 'zh': {'title': '金融领域的智能适应训练', 'desc': '本文介绍了一种针对金融领域的大型语言模型(LLM)进行领域自适应后训练的方法,称为FINDAP。我们首先识别目标领域所需的核心能力,并设计了与这些需求相一致的综合评估套件。接着,我们分析了关键后训练阶段的有效性,包括持续预训练、指令调优和偏好对齐。最终,我们提出了一种基于新颖偏好数据蒸馏方法的有效训练方案,所得到的模型Llama-Fin在多种金融任务中达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.06187', 'title': 'Multi-subject Open-set Personalization in Video Generation', 'url': 'https://huggingface.co/papers/2501.06187', 'abstract': 'Video personalization methods allow us to synthesize videos with specific concepts such as people, pets, and places. However, existing methods often focus on limited domains, require time-consuming optimization per subject, or support only a single subject. We present Video Alchemist - a video model with built-in multi-subject, open-set personalization capabilities for both foreground objects and background, eliminating the need for time-consuming test-time optimization. Our model is built on a new Diffusion Transformer module that fuses each conditional reference image and its corresponding subject-level text prompt with cross-attention layers. Developing such a large model presents two main challenges: dataset and evaluation. First, as paired datasets of reference images and videos are extremely hard to collect, we sample selected video frames as reference images and synthesize a clip of the target video. However, while models can easily denoise training videos given reference frames, they fail to generalize to new contexts. To mitigate this issue, we design a new automatic data construction pipeline with extensive image augmentations. Second, evaluating open-set video personalization is a challenge in itself. To address this, we introduce a personalization benchmark that focuses on accurate subject fidelity and supports diverse personalization scenarios. Finally, our extensive experiments show that our method significantly outperforms existing personalization methods in both quantitative and qualitative evaluations.', 'score': 4, 'issue_id': 1631, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'fcf16f5f8fe9047a', 'authors': ['Tsai-Shien Chen', 'Aliaksandr Siarohin', 'Willi Menapace', 'Yuwei Fang', 'Kwot Sin Lee', 'Ivan Skorokhodov', 'Kfir Aberman', 'Jun-Yan Zhu', 'Ming-Hsuan Yang', 'Sergey Tulyakov'], 'affiliations': ['CMU', 'Snap Inc.', 'UC Merced'], 'pdf_title_img': 'assets/pdf/title_img/2501.06187.jpg', 'data': {'categories': ['#diffusion', '#synthetic', '#benchmark', '#data', '#optimization', '#video', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'Универсальная персонализация видео без длительной оптимизации', 'desc': 'Статья представляет Video Alchemist - новую модель для персонализации видео с возможностью работы с несколькими объектами. Модель использует новый модуль Diffusion Transformer, который объединяет условные референсные изображения и текстовые промпты. Авторы разработали автоматический конвейер для создания данных с обширными аугментациями изображений. 
Также был создан новый бенчмарк для оценки персонализации видео в открытом наборе.'}, 'en': {'title': 'Revolutionizing Video Personalization with Video Alchemist', 'desc': "The paper introduces Video Alchemist, a novel video personalization model that allows for the synthesis of videos featuring multiple subjects without the need for extensive optimization. It utilizes a Diffusion Transformer module that integrates reference images and text prompts through cross-attention layers, enabling effective personalization for both foreground and background elements. The authors tackle challenges related to dataset creation by employing a new automatic data construction pipeline with image augmentations, which helps improve generalization to new contexts. Additionally, they propose a personalization benchmark to evaluate the model's performance in diverse scenarios, demonstrating that Video Alchemist outperforms existing methods in both quantitative and qualitative assessments."}, 'zh': {'title': '视频个性化的新突破', 'desc': '视频个性化方法可以合成特定概念的视频,如人物、宠物和地点。然而,现有方法通常只关注有限的领域,且每个主题需要耗时的优化,或者仅支持单一主题。我们提出了视频炼金术师(Video Alchemist),这是一种具有内置多主题、开放集个性化能力的视频模型,能够处理前景物体和背景,消除了耗时的测试时间优化需求。我们的模型基于新的扩散变换器模块,结合条件参考图像和相应的主题级文本提示,通过交叉注意力层进行融合。'}}}, {'id': 'https://huggingface.co/papers/2501.05542', 'title': 'Infecting Generative AI With Viruses', 'url': 'https://huggingface.co/papers/2501.05542', 'abstract': 'This study demonstrates a novel approach to testing the security boundaries of Vision-Large Language Model (VLM/ LLM) using the EICAR test file embedded within JPEG images. We successfully executed four distinct protocols across multiple LLM platforms, including OpenAI GPT-4o, Microsoft Copilot, Google Gemini 1.5 Pro, and Anthropic Claude 3.5 Sonnet. The experiments validated that a modified JPEG containing the EICAR signature could be uploaded, manipulated, and potentially executed within LLM virtual workspaces. Key findings include: 1) consistent ability to mask the EICAR string in image metadata without detection, 2) successful extraction of the test file using Python-based manipulation within LLM environments, and 3) demonstration of multiple obfuscation techniques including base64 encoding and string reversal. This research extends Microsoft Research\'s "Penetration Testing Rules of Engagement" framework to evaluate cloud-based generative AI and LLM security boundaries, particularly focusing on file handling and execution capabilities within containerized environments.', 'score': 4, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ac21f1bae807486e', 'authors': ['David Noever', 'Forrest McKee'], 'affiliations': ['PeopleTec, Inc., Huntsville, AL'], 'pdf_title_img': 'assets/pdf/title_img/2501.05542.jpg', 'data': {'categories': ['#cv', '#benchmark', '#data', '#security'], 'emoji': '🛡️', 'ru': {'title': 'Новые горизонты в тестировании безопасности VLM/LLM с помощью EICAR', 'desc': 'Это исследование демонстрирует новый подход к тестированию границ безопасности моделей типа Vision-Large Language Model (VLM/LLM) с использованием тестового файла EICAR, встроенного в изображения JPEG. Эксперименты проводились на нескольких платформах LLM, включая OpenAI GPT-4, Microsoft Copilot, Google Gemini 1.5 Pro и Anthropic Claude 3.5 Sonnet. Ключевые результаты включают успешную маскировку строки EICAR в метаданных изображения, извлечение тестового файла с помощью Python в среде LLM и демонстрацию различных методов обфускации. 
Исследование расширяет рамки оценки безопасности облачных генеративных ИИ и LLM, особенно в отношении обработки файлов и возможностей выполнения в контейнеризированных средах.'}, 'en': {'title': 'Testing Security Boundaries of LLMs with EICAR in JPEGs', 'desc': 'This paper presents a new method for testing the security limits of Vision-Large Language Models (VLMs/LLMs) by embedding the EICAR test file in JPEG images. The authors conducted experiments on various LLM platforms, revealing that modified JPEGs containing the EICAR signature could be uploaded and manipulated without detection. They demonstrated the ability to extract the EICAR file using Python scripts and employed several obfuscation techniques to hide the EICAR string. This research enhances existing security frameworks by focusing on the file handling and execution capabilities of cloud-based generative AI systems.'}, 'zh': {'title': '测试大型语言模型的安全边界新方法', 'desc': '本研究展示了一种新颖的方法,用于测试视觉大型语言模型(VLM/LLM)的安全边界,使用嵌入在JPEG图像中的EICAR测试文件。我们在多个LLM平台上成功执行了四种不同的协议,包括OpenAI GPT-4o、Microsoft Copilot、Google Gemini 1.5 Pro和Anthropic Claude 3.5 Sonnet。实验验证了修改后的JPEG图像可以在LLM虚拟工作区中上传、操控并可能执行。研究的关键发现包括:在图像元数据中无检测地掩盖EICAR字符串、在LLM环境中成功提取测试文件,以及展示多种混淆技术,如base64编码和字符串反转。'}}}, {'id': 'https://huggingface.co/papers/2501.08828', 'title': 'MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents', 'url': 'https://huggingface.co/papers/2501.08828', 'abstract': 'Multi-modal document retrieval is designed to identify and retrieve various forms of multi-modal content, such as figures, tables, charts, and layout information from extensive documents. Despite its significance, there is a notable lack of a robust benchmark to effectively evaluate the performance of systems in multi-modal document retrieval. To address this gap, this work introduces a new benchmark, named as MMDocIR, encompassing two distinct tasks: page-level and layout-level retrieval. The former focuses on localizing the most relevant pages within a long document, while the latter targets the detection of specific layouts, offering a more fine-grained granularity than whole-page analysis. A layout can refer to a variety of elements such as textual paragraphs, equations, figures, tables, or charts. The MMDocIR benchmark comprises a rich dataset featuring expertly annotated labels for 1,685 questions and bootstrapped labels for 173,843 questions, making it a pivotal resource for advancing multi-modal document retrieval for both training and evaluation. Through rigorous experiments, we reveal that (i) visual retrievers significantly outperform their text counterparts, (ii) MMDocIR train set can effectively benefit the training process of multi-modal document retrieval and (iii) text retrievers leveraging on VLM-text perform much better than those using OCR-text. These findings underscore the potential advantages of integrating visual elements for multi-modal document retrieval.', 'score': 17, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'bf9a6df8fecd4ec1', 'authors': ['Kuicai Dong', 'Yujing Chang', 'Xin Deik Goh', 'Dexun Li', 'Ruiming Tang', 'Yong Liu'], 'affiliations': ['Noahs Ark Lab, Huawei'], 'pdf_title_img': 'assets/pdf/title_img/2501.08828.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'MMDocIR: Новый стандарт для мультимодального поиска документов', 'desc': 'Статья представляет новый бенчмарк MMDocIR для оценки систем мультимодального поиска документов. 
Бенчмарк включает две задачи: поиск на уровне страниц и на уровне макетов. Датасет содержит экспертные аннотации для 1,685 вопросов и автоматически сгенерированные метки для 173,843 вопросов. Эксперименты показали, что визуальные ретриверы превосходят текстовые, а использование визуально-языковых моделей дает лучшие результаты, чем OCR-текст.'}, 'en': {'title': 'Unlocking Multi-Modal Document Retrieval with MMDocIR', 'desc': 'This paper addresses the challenge of multi-modal document retrieval, which involves finding various types of content like figures and tables in large documents. It introduces a new benchmark called MMDocIR, which includes two tasks: page-level retrieval for finding relevant pages and layout-level retrieval for identifying specific layouts within those pages. The benchmark is supported by a comprehensive dataset with thousands of annotated questions, facilitating better training and evaluation of retrieval systems. The results show that visual retrieval methods outperform text-based methods, highlighting the importance of incorporating visual information in multi-modal retrieval tasks.'}, 'zh': {'title': '多模态文档检索的新基准MMDocIR', 'desc': '多模态文档检索旨在从大量文档中识别和提取各种形式的内容,如图形、表格、图表和布局信息。尽管其重要性显著,但目前缺乏有效评估多模态文档检索系统性能的基准。为了解决这一问题,本文提出了一个新的基准MMDocIR,包含页面级和布局级检索两个任务。通过严格的实验,我们发现视觉检索器的表现显著优于文本检索器,且MMDocIR训练集能有效促进多模态文档检索的训练过程。'}}}, {'id': 'https://huggingface.co/papers/2501.08365', 'title': 'Towards Best Practices for Open Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08365', 'abstract': 'Many AI companies are training their large language models (LLMs) on data without the permission of the copyright owners. The permissibility of doing so varies by jurisdiction: in countries like the EU and Japan, this is allowed under certain restrictions, while in the United States, the legal landscape is more ambiguous. Regardless of the legal status, concerns from creative producers have led to several high-profile copyright lawsuits, and the threat of litigation is commonly cited as a reason for the recent trend towards minimizing the information shared about training datasets by both corporate and public interest actors. This trend in limiting data information causes harm by hindering transparency, accountability, and innovation in the broader ecosystem by denying researchers, auditors, and impacted individuals access to the information needed to understand AI models. While this could be mitigated by training language models on open access and public domain data, at the time of writing, there are no such models (trained at a meaningful scale) due to the substantial technical and sociological challenges in assembling the necessary corpus. These challenges include incomplete and unreliable metadata, the cost and complexity of digitizing physical records, and the diverse set of legal and technical skills required to ensure relevance and responsibility in a quickly changing landscape. 
Building towards a future where AI systems can be trained on openly licensed data that is responsibly curated and governed requires collaboration across legal, technical, and policy domains, along with investments in metadata standards, digitization, and fostering a culture of openness.', 'score': 16, 'issue_id': 1702, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '90686080aa439157', 'authors': ['Stefan Baack', 'Stella Biderman', 'Kasia Odrozek', 'Aviya Skowron', 'Ayah Bdeir', 'Jillian Bommarito', 'Jennifer Ding', 'Maximilian Gahntz', 'Paul Keller', 'Pierre-Carl Langlais', 'Greg Lindahl', 'Sebastian Majstorovic', 'Nik Marda', 'Guilherme Penedo', 'Maarten Van Segbroeck', 'Jennifer Wang', 'Leandro von Werra', 'Mitchell Baker', 'Julie Belião', 'Kasia Chmielinski', 'Marzieh Fadaee', 'Lisa Gutermuth', 'Hynek Kydlíček', 'Greg Leppert', 'EM Lewis-Jong', 'Solana Larsen', 'Shayne Longpre', 'Angela Oduor Lungati', 'Cullen Miller', 'Victor Miller', 'Max Ryabinin', 'Kathleen Siminyu', 'Andrew Strait', 'Mark Surman', 'Anna Tumadóttir', 'Maurice Weber', 'Rebecca Weiss', 'Lee White', 'Thomas Wolf'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08365.jpg', 'data': {'categories': ['#open_source', '#ethics', '#data', '#dataset'], 'emoji': '📚', 'ru': {'title': 'Открытые данные для ответственного ИИ: вызовы и перспективы', 'desc': 'Статья рассматривает проблему обучения больших языковых моделей (LLM) на данных без разрешения правообладателей. Анализируются юридические аспекты этой практики в разных странах и связанные с ней судебные иски. Отмечается тенденция к ограничению информации о наборах данных для обучения, что негативно влияет на прозрачность и подотчетность в сфере ИИ. Обсуждаются вызовы создания моделей на основе открытых данных, включая технические и социологические аспекты.'}, 'en': {'title': 'Towards Transparent AI: The Need for Open Data Collaboration', 'desc': 'This paper discusses the legal and ethical challenges surrounding the training of large language models (LLMs) using copyrighted data without permission. It highlights the varying legal frameworks across different countries, particularly the ambiguity in the United States compared to more defined rules in the EU and Japan. The authors argue that the trend of limiting information about training datasets undermines transparency and innovation in AI, making it difficult for researchers and stakeholders to understand the models. They propose that a shift towards using open access and public domain data is necessary, but emphasize the need for collaboration and investment in infrastructure to overcome the technical and sociological barriers involved.'}, 'zh': {'title': '推动开放许可数据的AI训练未来', 'desc': '许多人工智能公司在没有版权拥有者许可的情况下训练大型语言模型(LLMs)。不同国家对这种做法的合法性有不同的规定,欧盟和日本在某些限制下允许,而美国的法律环境则较为模糊。这种限制数据共享的信息趋势,妨碍了透明度、问责制和创新,影响了研究人员和受影响个体获取理解AI模型所需的信息。为了实现未来能够在开放许可数据上训练AI系统,需要在法律、技术和政策领域进行合作,并投资于元数据标准和数字化。'}}}, {'id': 'https://huggingface.co/papers/2501.08983', 'title': 'CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities', 'url': 'https://huggingface.co/papers/2501.08983', 'abstract': '3D scene generation has garnered growing attention in recent years and has made significant progress. Generating 4D cities is more challenging than 3D scenes due to the presence of structurally complex, visually diverse objects like buildings and vehicles, and heightened human sensitivity to distortions in urban environments. 
To tackle these issues, we propose CityDreamer4D, a compositional generative model specifically tailored for generating unbounded 4D cities. Our main insights are 1) 4D city generation should separate dynamic objects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2) all objects in the 4D scene should be composed of different types of neural fields for buildings, vehicles, and background stuff. Specifically, we propose Traffic Scenario Generator and Unbounded Layout Generator to produce dynamic traffic scenarios and static city layouts using a highly compact BEV representation. Objects in 4D cities are generated by combining stuff-oriented and instance-oriented neural fields for background stuff, buildings, and vehicles. To suit the distinct characteristics of background stuff and instances, the neural fields employ customized generative hash grids and periodic positional embeddings as scene parameterizations. Furthermore, we offer a comprehensive suite of datasets for city generation, including OSM, GoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world city layouts, while the Google Earth and CityTopia datasets deliver large-scale, high-quality city imagery complete with 3D instance annotations. Leveraging its compositional design, CityDreamer4D supports a range of downstream applications, such as instance editing, city stylization, and urban simulation, while delivering state-of-the-art performance in generating realistic 4D cities.', 'score': 11, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '39cd0826d4232170', 'authors': ['Haozhe Xie', 'Zhaoxi Chen', 'Fangzhou Hong', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore 637335'], 'pdf_title_img': 'assets/pdf/title_img/2501.08983.jpg', 'data': {'categories': ['#3d', '#dataset'], 'emoji': '🏙️', 'ru': {'title': 'Композиционная генерация 4D-городов с разделением динамики и статики', 'desc': 'CityDreamer4D - это генеративная модель для создания неограниченных 4D-городов. Она разделяет генерацию динамических объектов (например, транспорта) и статических сцен (зданий, дорог). Модель использует разные типы нейронных полей для зданий, транспорта и фона, применяя специализированные генеративные хеш-сетки и периодические позиционные эмбеддинги. CityDreamer4D демонстрирует передовые результаты в генерации реалистичных 4D-городов и поддерживает различные приложения, включая редактирование объектов и городское моделирование.'}, 'en': {'title': 'Revolutionizing Urban Landscapes: CityDreamer4D for Dynamic City Generation', 'desc': "This paper introduces CityDreamer4D, a generative model designed for creating unbounded 4D cities, which include both static and dynamic elements. The model distinguishes between dynamic objects like vehicles and static structures such as buildings, using specialized neural fields for each type. It employs a compact bird's-eye view (BEV) representation to generate realistic traffic scenarios and city layouts. 
Additionally, the paper provides extensive datasets for training, enabling various applications like instance editing and urban simulation while achieving high-quality results in 4D city generation."}, 'zh': {'title': 'CityDreamer4D:无限4D城市生成的新突破', 'desc': '近年来,3D场景生成受到了越来越多的关注,并取得了显著进展。生成4D城市比3D场景更具挑战性,因为城市环境中存在结构复杂、视觉多样的物体,如建筑和车辆。为了解决这些问题,我们提出了CityDreamer4D,这是一种专门用于生成无限4D城市的组合生成模型。该模型通过将动态物体与静态场景分离,并使用不同类型的神经场来组合城市中的所有物体,从而实现高质量的城市生成。'}}}, {'id': 'https://huggingface.co/papers/2501.08994', 'title': 'RepVideo: Rethinking Cross-Layer Representation for Video Generation', 'url': 'https://huggingface.co/papers/2501.08994', 'abstract': 'Video generation has achieved remarkable progress with the introduction of diffusion models, which have significantly improved the quality of generated videos. However, recent research has primarily focused on scaling up model training, while offering limited insights into the direct impact of representations on the video generation process. In this paper, we initially investigate the characteristics of features in intermediate layers, finding substantial variations in attention maps across different layers. These variations lead to unstable semantic representations and contribute to cumulative differences between features, which ultimately reduce the similarity between adjacent frames and negatively affect temporal coherence. To address this, we propose RepVideo, an enhanced representation framework for text-to-video diffusion models. By accumulating features from neighboring layers to form enriched representations, this approach captures more stable semantic information. These enhanced representations are then used as inputs to the attention mechanism, thereby improving semantic expressiveness while ensuring feature consistency across adjacent frames. Extensive experiments demonstrate that our RepVideo not only significantly enhances the ability to generate accurate spatial appearances, such as capturing complex spatial relationships between multiple objects, but also improves temporal consistency in video generation.', 'score': 10, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '0d164d45ba2a5c71', 'authors': ['Chenyang Si', 'Weichen Fan', 'Zhengyao Lv', 'Ziqi Huang', 'Yu Qiao', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore, 639798', 'Shanghai Artificial Intelligence Laboratory, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08994.jpg', 'data': {'categories': ['#video', '#diffusion', '#architecture'], 'emoji': '🎬', 'ru': {'title': 'RepVideo: стабильные представления для качественной генерации видео', 'desc': 'Статья представляет RepVideo - улучшенную систему представлений для диффузионных моделей генерации видео на основе текста. Авторы обнаружили, что вариации в картах внимания между слоями приводят к нестабильным семантическим представлениям и снижают согласованность соседних кадров. RepVideo решает эту проблему путем накопления признаков из соседних слоев для создания обогащенных представлений. Эксперименты показывают, что RepVideo значительно улучшает способность генерировать точные пространственные образы и повышает временную согласованность при генерации видео.'}, 'en': {'title': 'Enhancing Video Generation with Stable Representations', 'desc': "This paper presents RepVideo, a new framework designed to improve video generation using text-to-video diffusion models. 
It identifies issues with unstable semantic representations caused by variations in attention maps across different layers of the model. By accumulating features from neighboring layers, RepVideo creates more stable and enriched representations that enhance the model's ability to maintain consistency between adjacent frames. The results show that RepVideo significantly improves both the spatial accuracy of generated videos and their temporal coherence, leading to more realistic video outputs."}, 'zh': {'title': '提升视频生成质量的RepVideo框架', 'desc': '本论文探讨了扩散模型在视频生成中的应用,提出了RepVideo框架以改善视频生成的质量。研究发现中间层特征的注意力图存在显著差异,这导致语义表示的不稳定性,进而影响相邻帧之间的相似性和时间一致性。RepVideo通过从相邻层累积特征,形成更丰富的表示,从而捕捉更稳定的语义信息。实验结果表明,RepVideo显著提高了生成视频的空间表现能力和时间一致性。'}}}, {'id': 'https://huggingface.co/papers/2501.07783', 'title': 'Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal Understanding', 'url': 'https://huggingface.co/papers/2501.07783', 'abstract': 'Image pyramids are widely adopted in top-performing methods to obtain multi-scale features for precise visual perception and understanding. However, current image pyramids use the same large-scale model to process multiple resolutions of images, leading to significant computational cost. To address this challenge, we propose a novel network architecture, called Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses pretrained models (ViTs or CNNs) as branches to process multi-scale images, where images of higher resolutions are processed by smaller network branches to balance computational cost and performance. To integrate information from different spatial scales, we further propose a novel cross-branch feature interaction mechanism. To validate PIIP, we apply it to various perception models and a representative multimodal large language model called LLaVA, and conduct extensive experiments on various tasks such as object detection, segmentation, image classification and multimodal understanding. PIIP achieves superior performance compared to single-branch and existing multi-resolution approaches with lower computational cost. When applied to InternViT-6B, a large-scale vision foundation model, PIIP can improve its performance by 1%-2% on detection and segmentation with only 40%-60% of the original computation, finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and 74.5% on MMBench with only 2.8M training data. Our code is released at https://github.com/OpenGVLab/PIIP.', 'score': 5, 'issue_id': 1701, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '87295e912b5b0670', 'authors': ['Zhaokai Wang', 'Xizhou Zhu', 'Xue Yang', 'Gen Luo', 'Hao Li', 'Changyao Tian', 'Wenhan Dou', 'Junqi Ge', 'Lewei Lu', 'Yu Qiao', 'Jifeng Dai'], 'affiliations': ['Sensetime', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07783.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Эффективные многомасштабные сети для точного визуального восприятия', 'desc': 'Статья представляет новую архитектуру нейронных сетей под названием Parameter-Inverted Image Pyramid Networks (PIIP). 
PIIP использует предобученные модели (ViT или CNN) в качестве ветвей для обработки многомасштабных изображений, где изображения с более высоким разрешением обрабатываются меньшими сетевыми ветвями для баланса вычислительных затрат и производительности. Авторы также предлагают новый механизм взаимодействия признаков между ветвями. PIIP демонстрирует превосходную производительность по сравнению с одноветвенными и существующими многоразрешающими подходами при меньших вычислительных затратах в задачах обнаружения объектов, сегментации, классификации изображений и мультимодального понимания.'}, 'en': {'title': 'Efficient Multi-Scale Processing with PIIP Networks', 'desc': 'This paper introduces Parameter-Inverted Image Pyramid Networks (PIIP), a new architecture designed to efficiently process multi-scale images for visual tasks. Unlike traditional methods that use a single large model for all resolutions, PIIP employs smaller branches for higher resolution images, reducing computational costs while maintaining performance. The architecture also features a unique cross-branch interaction mechanism to enhance feature integration across different scales. Experimental results demonstrate that PIIP outperforms existing methods in various tasks, achieving significant accuracy improvements with lower resource usage.'}, 'zh': {'title': '高效多尺度图像处理的新方法', 'desc': '本文提出了一种新的网络架构,称为参数反转图像金字塔网络(PIIP),旨在提高多尺度图像处理的效率。PIIP利用预训练模型作为分支,处理不同分辨率的图像,从而在性能和计算成本之间取得平衡。通过引入跨分支特征交互机制,PIIP能够有效整合来自不同空间尺度的信息。实验结果表明,PIIP在目标检测、分割和多模态理解等任务上表现优于现有方法,同时显著降低了计算成本。'}}}, {'id': 'https://huggingface.co/papers/2501.09012', 'title': 'Multimodal LLMs Can Reason about Aesthetics in Zero-Shot', 'url': 'https://huggingface.co/papers/2501.09012', 'abstract': "We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability shall be elicited to evaluate the aesthetics of artworks. To facilitate this investigation, we construct MM-StyleBench, a novel high-quality dataset for benchmarking artistic stylization. We then develop a principled method for human preference modeling and perform a systematic correlation analysis between MLLMs' responses and human preference. Our experiments reveal an inherent hallucination issue of MLLMs in art evaluation, associated with response subjectivity. ArtCoT is proposed, demonstrating that art-specific task decomposition and the use of concrete language boost MLLMs' reasoning ability for aesthetics. Our findings offer valuable insights into MLLMs for art and can benefit a wide range of downstream applications, such as style transfer and artistic image generation. Code available at https://github.com/songrise/MLLM4Art.", 'score': 5, 'issue_id': 1699, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'e516a920b6534cc0', 'authors': ['Ruixiang Jiang', 'Changwen Chen'], 'affiliations': ['The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09012.jpg', 'data': {'categories': ['#artificial intelligence', '#reasoning', '#hallucinations', '#multimodal', '#benchmark', '#dataset'], 'emoji': '🎨', 'ru': {'title': 'Искусственный интеллект учится оценивать искусство', 'desc': 'Исследование посвящено использованию мультимодальных языковых моделей (MLLM) для оценки эстетики произведений искусства. Авторы создали набор данных MM-StyleBench для тестирования художественной стилизации и разработали метод моделирования человеческих предпочтений. 
Эксперименты выявили проблему галлюцинаций MLLM при оценке искусства, связанную с субъективностью ответов. Предложенный метод ArtCoT улучшает способность MLLM к рассуждениям об эстетике путем декомпозиции задач и использования конкретного языка.'}, 'en': {'title': 'Enhancing MLLMs for Art Evaluation through Structured Reasoning', 'desc': "This paper investigates how Multimodal Large Language Models (MLLMs) can assess the aesthetics of artworks. The authors introduce MM-StyleBench, a new dataset designed to benchmark artistic stylization. They also create a method for modeling human preferences and analyze the correlation between MLLMs' evaluations and human judgments. The study highlights a hallucination problem in MLLMs when evaluating art and proposes ArtCoT, which improves reasoning by using task decomposition and specific language, providing insights for applications like style transfer and artistic image generation."}, 'zh': {'title': '提升多模态大语言模型的艺术推理能力', 'desc': '本研究首次探讨了多模态大语言模型(MLLMs)在评估艺术作品美学时的推理能力。我们构建了一个新的高质量数据集MM-StyleBench,用于艺术风格化的基准测试。通过系统的相关性分析,我们发现MLLMs在艺术评估中存在固有的幻觉问题,且与人类偏好存在主观性关联。我们提出了ArtCoT方法,表明艺术特定任务分解和使用具体语言可以提升MLLMs的美学推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09019', 'title': 'Ouroboros-Diffusion: Exploring Consistent Content Generation in Tuning-free Long Video Diffusion', 'url': 'https://huggingface.co/papers/2501.09019', 'abstract': "The first-in-first-out (FIFO) video diffusion, built on a pre-trained text-to-video model, has recently emerged as an effective approach for tuning-free long video generation. This technique maintains a queue of video frames with progressively increasing noise, continuously producing clean frames at the queue's head while Gaussian noise is enqueued at the tail. However, FIFO-Diffusion often struggles to keep long-range temporal consistency in the generated videos due to the lack of correspondence modeling across frames. In this paper, we propose Ouroboros-Diffusion, a novel video denoising framework designed to enhance structural and content (subject) consistency, enabling the generation of consistent videos of arbitrary length. Specifically, we introduce a new latent sampling technique at the queue tail to improve structural consistency, ensuring perceptually smooth transitions among frames. To enhance subject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA) mechanism, which aligns subjects across frames within short segments to achieve better visual coherence. Furthermore, we introduce self-recurrent guidance. This technique leverages information from all previous cleaner frames at the front of the queue to guide the denoising of noisier frames at the end, fostering rich and contextual global information interaction. 
Extensive experiments of long video generation on the VBench benchmark demonstrate the superiority of our Ouroboros-Diffusion, particularly in terms of subject consistency, motion smoothness, and temporal consistency.", 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'c4c991699f684865', 'authors': ['Jingyuan Chen', 'Fuchen Long', 'Jie An', 'Zhaofan Qiu', 'Ting Yao', 'Jiebo Luo', 'Tao Mei'], 'affiliations': ['HiDream.ai Inc.', 'University of Rochester, Rochester, NY USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.09019.jpg', 'data': {'categories': ['#benchmark', '#video', '#long_context', '#diffusion'], 'emoji': '🐍', 'ru': {'title': 'Бесконечное видео: Ouroboros-Diffusion для непрерывной генерации согласованного контента', 'desc': 'Эта статья представляет новый метод генерации видео произвольной длины под названием Ouroboros-Diffusion. Метод улучшает структурную и сюжетную согласованность видео с помощью нового подхода к выборке латентного пространства и механизма Subject-Aware Cross-Frame Attention. Авторы также вводят самоповторяющееся руководство, использующее информацию из предыдущих очищенных кадров для улучшения шумных кадров. Эксперименты на бенчмарке VBench показывают превосходство Ouroboros-Diffusion в сохранении согласованности субъектов, плавности движения и временной согласованности.'}, 'en': {'title': 'Ouroboros-Diffusion: Enhancing Long Video Consistency and Coherence', 'desc': 'The paper introduces Ouroboros-Diffusion, a new framework for improving long video generation using a pre-trained text-to-video model. It addresses the limitations of FIFO-Diffusion, particularly in maintaining long-range temporal consistency across video frames. The proposed method enhances structural consistency through a novel latent sampling technique and improves subject consistency with a Subject-Aware Cross-Frame Attention mechanism. Additionally, self-recurrent guidance is implemented to utilize information from previous frames, resulting in videos with better visual coherence and smoother transitions.'}, 'zh': {'title': 'Ouroboros-Diffusion:提升视频生成一致性的创新框架', 'desc': 'FIFO视频扩散是一种基于预训练文本到视频模型的长视频生成方法,但在生成视频时常常缺乏长时间的一致性。本文提出了Ouroboros-Diffusion框架,通过引入新的潜在采样技术和主题感知跨帧注意机制,增强了视频的结构和内容一致性。该方法确保了帧之间的平滑过渡,并通过自递归引导技术利用前面清晰帧的信息来改善后面噪声帧的去噪效果。实验结果表明,Ouroboros-Diffusion在主题一致性、运动平滑性和时间一致性方面优于现有方法。'}}}, {'id': 'https://huggingface.co/papers/2501.08809', 'title': 'XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework', 'url': 'https://huggingface.co/papers/2501.08809', 'abstract': 'In recent years, remarkable advancements in artificial intelligence-generated content (AIGC) have been achieved in the fields of image synthesis and text generation, generating content comparable to that produced by humans. However, the quality of AI-generated music has not yet reached this standard, primarily due to the challenge of effectively controlling musical emotions and ensuring high-quality outputs. This paper presents a generalized symbolic music generation framework, XMusic, which supports flexible prompts (i.e., images, videos, texts, tags, and humming) to generate emotionally controllable and high-quality symbolic music. XMusic consists of two core components, XProjector and XComposer. XProjector parses the prompts of various modalities into symbolic music elements (i.e., emotions, genres, rhythms and notes) within the projection space to generate matching music. 
XComposer contains a Generator and a Selector. The Generator generates emotionally controllable and melodious music based on our innovative symbolic music representation, whereas the Selector identifies high-quality symbolic music by constructing a multi-task learning scheme involving quality assessment, emotion recognition, and genre recognition tasks. In addition, we build XMIDI, a large-scale symbolic music dataset that contains 108,023 MIDI files annotated with precise emotion and genre labels. Objective and subjective evaluations show that XMusic significantly outperforms the current state-of-the-art methods with impressive music quality. Our XMusic has been awarded as one of the nine Highlights of Collectibles at WAIC 2023. The project homepage of XMusic is https://xmusic-project.github.io.', 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'd4d018c9adb2579c', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#audio', '#story_generation', '#multimodal', '#dataset'], 'emoji': '🎵', 'ru': {'title': 'XMusic: ИИ-композитор нового поколения с управляемыми эмоциями', 'desc': 'Статья представляет XMusic - генерализованный фреймворк для генерации символической музыки, поддерживающий различные типы промптов. XMusic состоит из двух ключевых компонентов: XProjector для обработки промптов и XComposer для генерации музыки. Авторы также создали датасет XMIDI, содержащий более 100 тысяч MIDI-файлов с аннотациями эмоций и жанров. Согласно оценкам, XMusic значительно превосходит современные методы по качеству генерируемой музыки.'}, 'en': {'title': 'XMusic: Emotionally Controlled Music Generation Made Easy!', 'desc': 'This paper introduces XMusic, a new framework for generating symbolic music that can be controlled by emotional prompts. It includes two main components: XProjector, which converts various input types into musical elements, and XComposer, which generates and selects high-quality music. The framework uses a multi-task learning approach to ensure the generated music meets quality, emotional, and genre standards. Additionally, the authors created a large dataset, XMIDI, to support their research and demonstrate that XMusic outperforms existing methods in music generation.'}, 'zh': {'title': 'XMusic:情感可控的高质量音乐生成', 'desc': '近年来,人工智能生成内容(AIGC)在图像合成和文本生成领域取得了显著进展,但在音乐生成方面仍面临挑战。本文提出了一种通用的符号音乐生成框架XMusic,能够通过灵活的提示生成可控情感和高质量的符号音乐。XMusic由两个核心组件组成:XProjector和XComposer,前者将多种模态的提示解析为音乐元素,后者则生成和选择高质量的音乐。通过构建大规模的XMIDI数据集和多任务学习方案,XMusic在音乐质量上显著优于现有方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.08970', 'title': 'Trusted Machine Learning Models Unlock Private Inference for Problems Currently Infeasible with Cryptography', 'url': 'https://huggingface.co/papers/2501.08970', 'abstract': 'We often interact with untrusted parties. Prioritization of privacy can limit the effectiveness of these interactions, as achieving certain goals necessitates sharing private data. Traditionally, addressing this challenge has involved either seeking trusted intermediaries or constructing cryptographic protocols that restrict how much data is revealed, such as multi-party computations or zero-knowledge proofs. While significant advances have been made in scaling cryptographic approaches, they remain limited in terms of the size and complexity of applications they can be used for. 
In this paper, we argue that capable machine learning models can fulfill the role of a trusted third party, thus enabling secure computations for applications that were previously infeasible. In particular, we describe Trusted Capable Model Environments (TCMEs) as an alternative approach for scaling secure computation, where capable machine learning model(s) interact under input/output constraints, with explicit information flow control and explicit statelessness. This approach aims to achieve a balance between privacy and computational efficiency, enabling private inference where classical cryptographic solutions are currently infeasible. We describe a number of use cases that are enabled by TCME, and show that even some simple classic cryptographic problems can already be solved with TCME. Finally, we outline current limitations and discuss the path forward in implementing them.', 'score': 3, 'issue_id': 1702, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '858fc03ac78b66c1', 'authors': ['Ilia Shumailov', 'Daniel Ramage', 'Sarah Meiklejohn', 'Peter Kairouz', 'Florian Hartmann', 'Borja Balle', 'Eugene Bagdasarian'], 'affiliations': ['Google', 'Google DeepMind', 'Google Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.08970.jpg', 'data': {'categories': ['#data', '#ethics', '#architecture', '#security', '#inference'], 'emoji': '🔐', 'ru': {'title': 'Машинное обучение как доверенный посредник для безопасных вычислений', 'desc': 'Статья представляет новый подход к безопасным вычислениям с использованием машинного обучения - Trusted Capable Model Environments (TCME). TCME предлагается как альтернатива традиционным криптографическим методам для обеспечения конфиденциальности при взаимодействии с ненадежными сторонами. Авторы утверждают, что мощные модели машинного обучения могут выполнять роль доверенной третьей стороны, позволяя проводить безопасные вычисления для приложений, которые ранее были невозможны. В статье описываются возможные применения TCME и обсуждаются текущие ограничения и перспективы развития этого подхода.'}, 'en': {'title': 'Empowering Privacy with Trusted Machine Learning Models', 'desc': 'This paper introduces Trusted Capable Model Environments (TCMEs) as a novel solution for secure computations involving untrusted parties. It suggests that advanced machine learning models can act as trusted intermediaries, allowing for private data sharing while maintaining privacy. The authors highlight how TCMEs can efficiently manage input/output constraints and control information flow, making them suitable for applications where traditional cryptographic methods fall short. They also present various use cases and acknowledge the limitations of their approach, paving the way for future developments in secure machine learning applications.'}, 'zh': {'title': '利用机器学习实现安全计算的新方法', 'desc': '本文探讨了在与不可信方互动时如何平衡隐私和计算效率。我们提出了可信能力模型环境(TCME),作为一种新的安全计算方法,利用机器学习模型充当可信第三方。TCME在输入/输出约束下进行交互,并通过显式的信息流控制和无状态性来保护隐私。我们展示了TCME在解决一些经典密码学问题上的潜力,并讨论了未来的实施路径。'}}}, {'id': 'https://huggingface.co/papers/2501.04693', 'title': 'Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous Sensors via Language Grounding', 'url': 'https://huggingface.co/papers/2501.04693', 'abstract': 'Interacting with the world is a multi-sensory experience: achieving effective general-purpose interaction requires making use of all available modalities -- including vision, touch, and audio -- to fill in gaps from partial observation. 
For example, when vision is occluded reaching into a bag, a robot should rely on its senses of touch and sound. However, state-of-the-art generalist robot policies are typically trained on large datasets to predict robot actions solely from visual and proprioceptive observations. In this work, we propose FuSe, a novel approach that enables finetuning visuomotor generalist policies on heterogeneous sensor modalities for which large datasets are not readily available by leveraging natural language as a common cross-modal grounding. We combine a multimodal contrastive loss with a sensory-grounded language generation loss to encode high-level semantics. In the context of robot manipulation, we show that FuSe enables performing challenging tasks that require reasoning jointly over modalities such as vision, touch, and sound in a zero-shot setting, such as multimodal prompting, compositional cross-modal prompting, and descriptions of objects it interacts with. We show that the same recipe is applicable to widely different generalist policies, including both diffusion-based generalist policies and large vision-language-action (VLA) models. Extensive experiments in the real world show that FuSe is able to increase success rates by over 20% compared to all considered baselines.', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1612a7343aff595b', 'authors': ['Joshua Jones', 'Oier Mees', 'Carmelo Sferrazza', 'Kyle Stachowicz', 'Pieter Abbeel', 'Sergey Levine'], 'affiliations': ['Berkeley AI Research (BAIR), UC Berkeley, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04693.jpg', 'data': {'categories': ['#transfer_learning', '#multimodal', '#robotics', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Мультисенсорный ИИ: объединение зрения, осязания и звука для улучшения взаимодействия роботов с миром', 'desc': 'Статья представляет FuSe - новый подход к обучению роботов, использующий мультимодальные сенсорные данные. FuSe использует естественный язык как общую основу для объединения различных модальностей, таких как зрение, осязание и звук. Метод сочетает мультимодальную контрастивную функцию потерь с генерацией языка на основе сенсорных данных для кодирования высокоуровневой семантики. Эксперименты показывают, что FuSe позволяет роботам выполнять сложные задачи, требующие рассуждений на основе нескольких модальностей, повышая успешность на 20% по сравнению с базовыми методами.'}, 'en': {'title': 'FuSe: Bridging Sensory Gaps for Smarter Robot Interaction', 'desc': 'This paper introduces FuSe, a method that enhances robot interaction by integrating multiple sensory modalities like vision, touch, and sound. Traditional robot policies often rely solely on visual data, but FuSe allows for fine-tuning these policies using natural language to bridge gaps in sensory information. By employing a multimodal contrastive loss and a sensory-grounded language generation loss, FuSe effectively encodes high-level semantics for better decision-making. 
The results demonstrate that FuSe significantly improves the success rates of robots in complex tasks, showcasing its versatility across different generalist policies.'}, 'zh': {'title': '多模态交互,提升机器人智能', 'desc': '本论文提出了一种名为FuSe的新方法,旨在通过多模态传感器数据来微调通用机器人策略。FuSe利用自然语言作为跨模态的共同基础,结合多模态对比损失和感知基础的语言生成损失,以编码高层语义。通过这种方法,机器人能够在视觉、触觉和听觉等多种感官信息的共同推理下,完成复杂的操作任务。实验结果表明,FuSe在实际应用中成功率提高了超过20%。'}}}, {'id': 'https://huggingface.co/papers/2412.19412', 'title': 'MINIMA: Modality Invariant Image Matching', 'url': 'https://huggingface.co/papers/2412.19412', 'abstract': 'Image matching for both cross-view and cross-modality plays a critical role in multimodal perception. In practice, the modality gap caused by different imaging systems/styles poses great challenges to the matching task. Existing works try to extract invariant features for specific modalities and train on limited datasets, showing poor generalization. In this paper, we present MINIMA, a unified image matching framework for multiple cross-modal cases. Without pursuing fancy modules, our MINIMA aims to enhance universal performance from the perspective of data scaling up. For such purpose, we propose a simple yet effective data engine that can freely produce a large dataset containing multiple modalities, rich scenarios, and accurate matching labels. Specifically, we scale up the modalities from cheap but rich RGB-only matching data, by means of generative models. Under this setting, the matching labels and rich diversity of the RGB dataset are well inherited by the generated multimodal data. Benefiting from this, we construct MD-syn, a new comprehensive dataset that fills the data gap for general multimodal image matching. With MD-syn, we can directly train any advanced matching pipeline on randomly selected modality pairs to obtain cross-modal ability. Extensive experiments on in-domain and zero-shot matching tasks, including 19 cross-modal cases, demonstrate that our MINIMA can significantly outperform the baselines and even surpass modality-specific methods. The dataset and code are available at https://github.com/LSXI7/MINIMA .', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'fa772dead5453f7b', 'authors': ['Xingyu Jiang', 'Jiangwei Ren', 'Zizhuo Li', 'Xin Zhou', 'Dingkang Liang', 'Xiang Bai'], 'affiliations': ['Huazhong University of Science and Technology', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2412.19412.jpg', 'data': {'categories': ['#dataset', '#data', '#multimodal', '#open_source', '#synthetic'], 'emoji': '🔀', 'ru': {'title': 'Универсальное сопоставление изображений через масштабирование данных', 'desc': 'Статья представляет MINIMA - универсальную систему сопоставления изображений для различных кросс-модальных случаев. Авторы предлагают эффективный механизм генерации большого набора данных с несколькими модальностями, разнообразными сценариями и точными метками сопоставления. Используя этот подход, они создают новый комплексный датасет MD-syn для обучения нейросетей кросс-модальному сопоставлению изображений. Эксперименты показывают, что MINIMA значительно превосходит базовые модели и даже специализированные методы для конкретных модальностей в 19 кросс-модальных задачах.'}, 'en': {'title': 'MINIMA: Bridging the Gap in Cross-Modal Image Matching', 'desc': 'This paper introduces MINIMA, a framework designed for image matching across different views and modalities, addressing the challenges posed by varying imaging systems. 
The authors highlight the limitations of existing methods that rely on invariant features and small datasets, which often lead to poor performance. MINIMA enhances image matching by scaling up data through a generative model that creates a large, diverse dataset with accurate matching labels. The new dataset, MD-syn, allows for effective training of matching algorithms, resulting in improved performance in both in-domain and zero-shot scenarios compared to traditional methods.'}, 'zh': {'title': 'MINIMA:跨模态图像匹配的新突破', 'desc': '本文提出了一种名为MINIMA的统一图像匹配框架,旨在解决跨视角和跨模态的图像匹配问题。现有方法在特定模态上提取不变特征,但在有限数据集上训练,导致泛化能力差。MINIMA通过一个简单有效的数据引擎,生成包含多种模态和丰富场景的大型数据集,从而提升通用性能。通过构建MD-syn数据集,MINIMA能够在随机选择的模态对上直接训练,显著提高跨模态匹配能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08313', 'title': 'MiniMax-01: Scaling Foundation Models with Lightning Attention', 'url': 'https://huggingface.co/papers/2501.08313', 'abstract': 'We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. 
We publicly release MiniMax-01 at https://github.com/MiniMax-AI.', 'score': 192, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'a57d7b1914e7383a', 'authors': ['MiniMax', 'Aonian Li', 'Bangwei Gong', 'Bo Yang', 'Boji Shan', 'Chang Liu', 'Cheng Zhu', 'Chunhao Zhang', 'Congchao Guo', 'Da Chen', 'Dong Li', 'Enwei Jiao', 'Gengxin Li', 'Guojun Zhang', 'Haohai Sun', 'Houze Dong', 'Jiadai Zhu', 'Jiaqi Zhuang', 'Jiayuan Song', 'Jin Zhu', 'Jingtao Han', 'Jingyang Li', 'Junbin Xie', 'Junhao Xu', 'Junjie Yan', 'Kaishun Zhang', 'Kecheng Xiao', 'Kexi Kang', 'Le Han', 'Leyang Wang', 'Lianfei Yu', 'Liheng Feng', 'Lin Zheng', 'Linbo Chai', 'Long Xing', 'Meizhi Ju', 'Mingyuan Chi', 'Mozhi Zhang', 'Peikai Huang', 'Pengcheng Niu', 'Pengfei Li', 'Pengyu Zhao', 'Qi Yang', 'Qidi Xu', 'Qiexiang Wang', 'Qin Wang', 'Qiuhui Li', 'Ruitao Leng', 'Shengmin Shi', 'Shuqi Yu', 'Sichen Li', 'Songquan Zhu', 'Tao Huang', 'Tianrun Liang', 'Weigao Sun', 'Weixuan Sun', 'Weiyu Cheng', 'Wenkai Li', 'Xiangjun Song', 'Xiao Su', 'Xiaodong Han', 'Xinjie Zhang', 'Xinzhu Hou', 'Xu Min', 'Xun Zou', 'Xuyang Shen', 'Yan Gong', 'Yingjie Zhu', 'Yipeng Zhou', 'Yiran Zhong', 'Yongyi Hu', 'Yuanxiang Fan', 'Yue Yu', 'Yufeng Yang', 'Yuhao Li', 'Yunan Huang', 'Yunji Li', 'Yunpeng Huang', 'Yunzhi Xu', 'Yuxin Mao', 'Zehan Li', 'Zekang Li', 'Zewei Tao', 'Zewen Ying', 'Zhaoyang Cong', 'Zhen Qin', 'Zhenhua Fan', 'Zhihang Yu', 'Zhuo Jiang', 'Zijia Wu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08313.jpg', 'data': {'categories': ['#open_source', '#architecture', '#optimization', '#benchmark', '#long_context', '#training'], 'emoji': '🚀', 'ru': {'title': 'MiniMax-01: Революция в обработке длинных контекстов', 'desc': 'Исследователи представили серию моделей MiniMax-01, включая MiniMax-Text-01 и MiniMax-VL-01, которые сравнимы с лучшими моделями, но обладают улучшенными возможностями обработки длинных контекстов. В основе лежит технология lightning attention и ее эффективное масштабирование, интегрированные с Mixture of Experts (MoE). Модель имеет 32 эксперта и 456 миллиардов параметров, из которых 45,9 миллиардов активируются для каждого токена. Контекстное окно MiniMax-Text-01 может достигать 1 миллиона токенов при обучении и экстраполироваться до 4 миллионов токенов при инференсе.'}, 'en': {'title': 'Unleashing Long Contexts with MiniMax-01 Models', 'desc': 'The MiniMax-01 series introduces advanced models, MiniMax-Text-01 and MiniMax-VL-01, designed to handle longer contexts effectively. These models utilize lightning attention and a Mixture of Experts (MoE) architecture, featuring 32 experts and a staggering 456 billion parameters, optimizing the activation of 45.9 billion parameters per token. By implementing efficient parallel strategies and computation-communication overlap techniques, the models can train and infer on extensive datasets, reaching context windows of up to 1 million tokens during training and 4 million during inference. 
Performance evaluations indicate that MiniMax-01 models rival leading models like GPT-4o and Claude-3.5-Sonnet while significantly extending context capabilities.'}, 'zh': {'title': 'MiniMax-01:超长上下文处理的新纪元', 'desc': '我们介绍了MiniMax-01系列,包括MiniMax-Text-01和MiniMax-VL-01,这些模型在处理更长的上下文时具有优越的能力。核心技术是闪电注意力和高效的扩展能力。为了最大化计算能力,我们将其与专家混合模型(MoE)结合,创建了一个拥有32个专家和4560亿参数的模型。我们的实验表明,这些模型在标准和内部基准测试中表现出色,能够与最先进的模型相媲美,同时提供20到32倍更长的上下文窗口。'}}}, {'id': 'https://huggingface.co/papers/2501.08332', 'title': 'MangaNinja: Line Art Colorization with Precise Reference Following', 'url': 'https://huggingface.co/papers/2501.08332', 'abstract': 'Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, cross-character colorization, multi-reference harmonization, beyond the reach of existing algorithms.', 'score': 31, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '20ea6b75639e2ced', 'authors': ['Zhiheng Liu', 'Ka Leong Cheng', 'Xi Chen', 'Jie Xiao', 'Hao Ouyang', 'Kai Zhu', 'Yu Liu', 'Yujun Shen', 'Qifeng Chen', 'Ping Luo'], 'affiliations': ['Ant Group', 'HKU', 'HKUST', 'Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08332.jpg', 'data': {'categories': ['#cv', '#diffusion', '#benchmark'], 'emoji': '🎨', 'ru': {'title': 'Прецизионное раскрашивание манги с помощью ИИ', 'desc': 'MangaNinjia - это модель для раскрашивания линейных рисунков манги, основанная на диффузионных моделях. Она использует модуль перемешивания патчей для обучения соответствиям между цветным изображением-образцом и целевым линейным рисунком. Модель также включает схему точечного контроля для точного подбора цветов. Эксперименты показывают превосходство MangaNinjia над существующими решениями в точности раскрашивания.'}, 'en': {'title': 'MangaNinjia: Mastering Line Art Colorization with Precision', 'desc': 'MangaNinjia is a model designed for coloring line art by using reference images. It employs a patch shuffling module to help the model learn how to match colors from the reference image to the target line art accurately. Additionally, it features a point-driven control scheme that allows for detailed color adjustments, ensuring that colors are applied precisely. Our experiments show that MangaNinjia outperforms existing methods in colorization tasks, especially in complex scenarios involving multiple references and different characters.'}, 'zh': {'title': 'MangaNinjia:精准上色的新方法', 'desc': 'MangaNinjia 是一种基于扩散模型的参考引导线条艺术上色技术。我们设计了两个模块来确保角色细节的准确转录,包括补丁洗牌模块和点驱动控制方案,以实现精细的颜色匹配。实验结果表明,我们的模型在精确上色方面优于现有解决方案。我们还展示了所提议的交互式点控制在处理复杂案例和多参考协调方面的潜力,超越了现有算法的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.06751', 'title': 'Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models', 'url': 'https://huggingface.co/papers/2501.06751', 'abstract': "Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. 
Typically, these prompts are extended to a fixed length by adding padding tokens before text encoding. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model's output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model's architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.", 'score': 27, 'issue_id': 1677, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '05733e8e82e23568', 'authors': ['Michael Toker', 'Ido Galil', 'Hadas Orgad', 'Rinon Gal', 'Yoad Tewel', 'Gal Chechik', 'Yonatan Belinkov'], 'affiliations': ['Bar-Ilan University', 'NVIDIA', 'Technion Israel Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.06751.jpg', 'data': {'categories': ['#cv', '#architecture', '#interpretability', '#diffusion', '#training'], 'emoji': '🧩', 'ru': {'title': 'Раскрытие тайн токенов заполнения в генерации изображений', 'desc': 'Исследователи провели первый глубокий анализ роли токенов заполнения в моделях преобразования текста в изображение (T2I). Они разработали две причинно-следственные техники для изучения того, как информация кодируется в представлении токенов в различных компонентах конвейера T2I. Результаты показали три различных сценария влияния токенов заполнения на процесс генерации изображений. Исследование выявило ключевые взаимосвязи между этими сценариями и архитектурой модели, а также процессом ее обучения.'}, 'en': {'title': 'Unpacking Padding: The Hidden Role in Text-to-Image Models', 'desc': "This paper explores the impact of padding tokens in text-to-image (T2I) diffusion models, which are used to generate images from text prompts. The authors analyze how these padding tokens influence the image generation process at different stages, including text encoding and the diffusion process. They identify three scenarios where padding tokens can either affect the output or be ignored, depending on the model's architecture and training methods. The findings provide valuable insights that could guide future improvements in T2I model design and training practices."}, 'zh': {'title': '填充标记在图像生成中的关键作用', 'desc': '本文研究了文本到图像(T2I)扩散模型中填充标记的作用。填充标记通常用于将提示扩展到固定长度,但其对图像生成过程的影响尚未被深入探讨。我们开发了两种因果分析技术,探讨填充标记在T2I模型不同组件中的信息编码方式。研究结果表明,填充标记在文本编码、扩散过程中的影响各不相同,并与模型架构和训练过程存在重要关系。'}}}, {'id': 'https://huggingface.co/papers/2501.08316', 'title': 'Diffusion Adversarial Post-Training for One-Step Video Generation', 'url': 'https://huggingface.co/papers/2501.08316', 'abstract': 'The diffusion models are widely used for image and video generation, but their iterative generation process is slow and expensive. 
While existing distillation approaches have demonstrated the potential for one-step generation in the image domain, they still suffer from significant quality degradation. In this work, we propose Adversarial Post-Training (APT) against real data following diffusion pre-training for one-step video generation. To improve the training stability and quality, we introduce several improvements to the model architecture and training procedures, along with an approximated R1 regularization objective. Empirically, our experiments show that our adversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720, 24fps videos in real time using a single forward evaluation step. Additionally, our model is capable of generating 1024px images in a single step, achieving quality comparable to state-of-the-art methods.', 'score': 19, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '4122a780e8356ce7', 'authors': ['Shanchuan Lin', 'Xin Xia', 'Yuxi Ren', 'Ceyuan Yang', 'Xuefeng Xiao', 'Lu Jiang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.08316.jpg', 'data': {'categories': ['#architecture', '#optimization', '#video', '#diffusion', '#training'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: от итераций к мгновенному результату', 'desc': 'Эта статья представляет новый метод под названием Adversarial Post-Training (APT) для одношаговой генерации видео. Авторы предлагают улучшения архитектуры модели и процедур обучения, включая аппроксимированную регуляризацию R1. Их модель Seaweed-APT способна генерировать 2-секундные видео высокого разрешения в реальном времени за один проход. Кроме того, модель может создавать изображения размером 1024px за один шаг, достигая качества, сравнимого с современными методами.'}, 'en': {'title': 'Fast and High-Quality Video Generation with Seaweed-APT', 'desc': 'This paper addresses the slow and costly iterative process of generating images and videos using diffusion models. The authors introduce Adversarial Post-Training (APT) to enhance one-step video generation while maintaining high quality. They implement architectural and procedural improvements, including an approximated R1 regularization, to stabilize training. Their model, Seaweed-APT, successfully generates high-quality 2-second videos and 1024px images in real time with a single forward evaluation step.'}, 'zh': {'title': '对抗后训练:快速高质量视频生成的新方法', 'desc': '扩散模型广泛应用于图像和视频生成,但其迭代生成过程较慢且成本高昂。现有的蒸馏方法在图像领域展示了单步生成的潜力,但仍存在显著的质量下降。本文提出了一种针对真实数据的对抗后训练(APT)方法,以实现单步视频生成。我们的实验表明,经过对抗后训练的模型Seaweed-APT能够实时生成1280x720、24fps的2秒视频,并且在单步生成1024px图像时,其质量可与最先进的方法相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.08187', 'title': 'A Multi-Modal AI Copilot for Single-Cell Analysis with Instruction Following', 'url': 'https://huggingface.co/papers/2501.08187', 'abstract': 'Large language models excel at interpreting complex natural language instructions, enabling them to perform a wide range of tasks. In the life sciences, single-cell RNA sequencing (scRNA-seq) data serves as the "language of cellular biology", capturing intricate gene expression patterns at the single-cell level. However, interacting with this "language" through conventional tools is often inefficient and unintuitive, posing challenges for researchers. To address these limitations, we present InstructCell, a multi-modal AI copilot that leverages natural language as a medium for more direct and flexible single-cell analysis. 
We construct a comprehensive multi-modal instruction dataset that pairs text-based instructions with scRNA-seq profiles from diverse tissues and species. Building on this, we develop a multi-modal cell language architecture capable of simultaneously interpreting and processing both modalities. InstructCell empowers researchers to accomplish critical tasks-such as cell type annotation, conditional pseudo-cell generation, and drug sensitivity prediction-using straightforward natural language commands. Extensive evaluations demonstrate that InstructCell consistently meets or exceeds the performance of existing single-cell foundation models, while adapting to diverse experimental conditions. More importantly, InstructCell provides an accessible and intuitive tool for exploring complex single-cell data, lowering technical barriers and enabling deeper biological insights.', 'score': 18, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'de984ce7cc62fa5e', 'authors': ['Yin Fang', 'Xinle Deng', 'Kangwei Liu', 'Ningyu Zhang', 'Jingyang Qian', 'Penghui Yang', 'Xiaohui Fan', 'Huajun Chen'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University, Hangzhou 310027, China', 'College of Pharmaceutical Sciences, Zhejiang University, Hangzhou 310058, China', 'Future Health Laboratory, Innovation Center of Yangtze River Delta, Zhejiang University, Jiaxing 314100, China', 'Innovation Center in Zhejiang University, State Key Laboratory of Component-Based Chinese Medicine, Hangzhou 310058, China', 'School of Software Technology, Zhejiang University, Ningbo 315048, China', 'ZJU-Hangzhou Global Scientific and Technological Innovation Center, Hangzhou 311200, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08187.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#dataset', '#science', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'Естественный язык как ключ к расшифровке клеточной биологии', 'desc': 'InstructCell - это мультимодальный ИИ-помощник для анализа данных одноклеточного РНК-секвенирования (scRNA-seq). Он использует архитектуру, способную интерпретировать как естественный язык, так и профили экспрессии генов. InstructCell позволяет исследователям выполнять такие задачи, как аннотация типов клеток и предсказание чувствительности к лекарствам, с помощью простых текстовых команд. Модель демонстрирует высокую производительность и адаптивность к различным экспериментальным условиям.'}, 'en': {'title': 'InstructCell: Bridging Language and Biology for Seamless Single-Cell Analysis', 'desc': 'This paper introduces InstructCell, an AI tool designed to simplify the analysis of single-cell RNA sequencing (scRNA-seq) data using natural language instructions. By creating a dataset that links text commands with scRNA-seq profiles, InstructCell allows researchers to perform complex tasks like cell type annotation and drug sensitivity prediction more intuitively. The model employs a multi-modal architecture that processes both text and biological data simultaneously, enhancing its usability. 
Evaluations show that InstructCell outperforms existing models, making single-cell analysis more accessible and efficient for researchers in the life sciences.'}, 'zh': {'title': '用自然语言解锁单细胞数据的潜力', 'desc': '这篇论文介绍了InstructCell,一个多模态的人工智能助手,旨在通过自然语言简化单细胞RNA测序(scRNA-seq)数据的分析。传统工具在处理细胞生物学的复杂数据时效率低下,而InstructCell通过将文本指令与scRNA-seq数据结合,提供了更直接和灵活的分析方式。该系统能够执行细胞类型注释、条件伪细胞生成和药物敏感性预测等关键任务,且使用简单的自然语言命令即可完成。评估结果表明,InstructCell在性能上优于现有的单细胞基础模型,同时适应多种实验条件,降低了技术门槛,促进了生物学的深入理解。'}}}, {'id': 'https://huggingface.co/papers/2501.08225', 'title': 'FramePainter: Endowing Interactive Image Editing with Video Diffusion Priors', 'url': 'https://huggingface.co/papers/2501.08225', 'abstract': 'Interactive image editing allows users to modify images through visual interaction operations such as drawing, clicking, and dragging. Existing methods construct such supervision signals from videos, as they capture how objects change with various physical interactions. However, these models are usually built upon text-to-image diffusion models, so they necessitate (i) massive training samples and (ii) an additional reference encoder to learn real-world dynamics and visual consistency. In this paper, we reformulate this task as an image-to-video generation problem, so that it inherits powerful video diffusion priors to reduce training costs and ensure temporal consistency. Specifically, we introduce FramePainter as an efficient instantiation of this formulation. Initialized with Stable Video Diffusion, it only uses a lightweight sparse control encoder to inject editing signals. Considering the limitations of temporal attention in handling large motion between two frames, we further propose matching attention to enlarge the receptive field while encouraging dense correspondence between edited and source image tokens. We highlight the effectiveness and efficiency of FramePainter across various editing signals: it dominantly outperforms previous state-of-the-art methods with far less training data, achieving highly seamless and coherent editing of images, e.g., automatically adjusting the reflection of the cup. Moreover, FramePainter also exhibits exceptional generalization in scenarios not present in real-world videos, e.g., transforming the clownfish into a shark-like shape. Our code will be available at https://github.com/YBYBZhang/FramePainter.', 'score': 12, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '811cfd0f18eb1e53', 'authors': ['Yabo Zhang', 'Xinpeng Zhou', 'Yihan Zeng', 'Hang Xu', 'Hui Li', 'Wangmeng Zuo'], 'affiliations': ['Harbin Institute of Technology', 'Huawei Noahs Ark Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08225.jpg', 'data': {'categories': ['#video', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'FramePainter: эффективное редактирование изображений через генерацию видео', 'desc': 'Статья представляет FramePainter - новый подход к интерактивному редактированию изображений, основанный на генерации видео. В отличие от существующих методов, использующих модели диффузии текст-изображение, FramePainter опирается на мощные видео-диффузионные модели для обеспечения временной согласованности и снижения затрат на обучение. Метод использует легковесный энкодер для внедрения сигналов редактирования и вводит механизм согласованного внимания для улучшения обработки крупных движений между кадрами. 
FramePainter превосходит современные методы, требуя значительно меньше обучающих данных и демонстрируя высокую обобщающую способность.'}, 'en': {'title': 'Revolutionizing Image Editing with Efficient Video Diffusion', 'desc': 'This paper presents FramePainter, a novel approach to interactive image editing that reformulates the task as image-to-video generation. By leveraging video diffusion models, FramePainter reduces the need for extensive training data while ensuring temporal consistency in edited images. It utilizes a lightweight sparse control encoder to effectively incorporate editing signals, and introduces matching attention to improve the handling of large motion between frames. The results demonstrate that FramePainter significantly outperforms existing methods, achieving seamless image edits and showcasing strong generalization capabilities.'}, 'zh': {'title': 'FramePainter:高效的图像编辑新方法', 'desc': '本文提出了一种交互式图像编辑的新方法,称为FramePainter。该方法将图像编辑任务重新定义为图像到视频的生成问题,从而利用强大的视频扩散先验,降低训练成本并确保时间一致性。FramePainter使用轻量级的稀疏控制编码器来注入编辑信号,并通过匹配注意力机制增强了对大运动的处理能力。实验结果表明,FramePainter在各种编辑信号下表现优异,能够实现无缝且连贯的图像编辑,且在未见过的场景中也展现出卓越的泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08326', 'title': 'Omni-RGPT: Unifying Image and Video Region-level Understanding via Token Marks', 'url': 'https://huggingface.co/papers/2501.08326', 'abstract': 'We present Omni-RGPT, a multimodal large language model designed to facilitate region-level comprehension for both images and videos. To achieve consistent region representation across spatio-temporal dimensions, we introduce Token Mark, a set of tokens highlighting the target regions within the visual feature space. These tokens are directly embedded into spatial regions using region prompts (e.g., boxes or masks) and simultaneously incorporated into the text prompt to specify the target, establishing a direct connection between visual and text tokens. To further support robust video understanding without requiring tracklets, we introduce an auxiliary task that guides Token Mark by leveraging the consistency of the tokens, enabling stable region interpretation across the video. Additionally, we introduce a large-scale region-level video instruction dataset (RegVID-300k). Omni-RGPT achieves state-of-the-art results on image and video-based commonsense reasoning benchmarks while showing strong performance in captioning and referring expression comprehension tasks.', 'score': 11, 'issue_id': 1678, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '463580cacfaa6789', 'authors': ['Miran Heo', 'Min-Hung Chen', 'De-An Huang', 'Sifei Liu', 'Subhashree Radhakrishnan', 'Seon Joo Kim', 'Yu-Chiang Frank Wang', 'Ryo Hachiuma'], 'affiliations': ['NVIDIA', 'Yonsei University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08326.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agi', '#cv', '#dataset', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Omni-RGPT: Новый уровень понимания изображений и видео искусственным интеллектом', 'desc': 'Omni-RGPT - это мультимодальная большая языковая модель, разработанная для понимания изображений и видео на уровне регионов. Модель использует технологию Token Mark для выделения целевых регионов в визуальном пространстве признаков. Для улучшения понимания видео без необходимости трекинга объектов введена вспомогательная задача, использующая согласованность токенов. 
Авторы также представили большой набор данных RegVID-300k для обучения на видео с инструкциями на уровне регионов.'}, 'en': {'title': 'Omni-RGPT: Bridging Visual and Textual Understanding with Token Mark', 'desc': 'Omni-RGPT is a multimodal large language model that enhances understanding of specific regions in images and videos. It uses a novel approach called Token Mark, which embeds tokens into visual features to highlight target areas, linking them with text prompts. This model also includes an auxiliary task that ensures consistent token representation across video frames, improving video comprehension. With the introduction of the RegVID-300k dataset, Omni-RGPT sets new benchmarks in commonsense reasoning, captioning, and referring expression tasks.'}, 'zh': {'title': 'Omni-RGPT:图像与视频的区域理解新突破', 'desc': '本文介绍了Omni-RGPT,这是一种多模态的大型语言模型,旨在促进图像和视频的区域级理解。为了在时空维度上实现一致的区域表示,我们引入了Token Mark,这是一组突出视觉特征空间中目标区域的标记。通过使用区域提示(如框或掩码),这些标记被直接嵌入到空间区域中,并同时与文本提示结合,以指定目标,从而建立视觉和文本标记之间的直接联系。此外,我们还引入了一个辅助任务,通过利用标记的一致性来指导Token Mark,从而支持稳健的视频理解。'}}}, {'id': 'https://huggingface.co/papers/2501.07730', 'title': 'Democratizing Text-to-Image Masked Generative Models with Compact Text-Aware One-Dimensional Tokens', 'url': 'https://huggingface.co/papers/2501.07730', 'abstract': 'Image tokenizers form the foundation of modern text-to-image generative models but are notoriously difficult to train. Furthermore, most existing text-to-image models rely on large-scale, high-quality private datasets, making them challenging to replicate. In this work, we introduce Text-Aware Transformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful image tokenizer that can utilize either discrete or continuous 1-dimensional tokens. TA-TiTok uniquely integrates textual information during the tokenizer decoding stage (i.e., de-tokenization), accelerating convergence and enhancing performance. TA-TiTok also benefits from a simplified, yet effective, one-stage training process, eliminating the need for the complex two-stage distillation used in previous 1-dimensional tokenizers. This design allows for seamless scalability to large datasets. Building on this, we introduce a family of text-to-image Masked Generative Models (MaskGen), trained exclusively on open data while achieving comparable performance to models trained on private data. We aim to release both the efficient, strong TA-TiTok tokenizers and the open-data, open-weight MaskGen models to promote broader access and democratize the field of text-to-image masked generative models.', 'score': 10, 'issue_id': 1673, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '80f40715084c602b', 'authors': ['Dongwon Kim', 'Ju He', 'Qihang Yu', 'Chenglin Yang', 'Xiaohui Shen', 'Suha Kwak', 'Liang-Chieh Chen'], 'affiliations': ['ByteDance Seed', 'POSTECH'], 'pdf_title_img': 'assets/pdf/title_img/2501.07730.jpg', 'data': {'categories': ['#dataset', '#data', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'Демократизация генерации изображений с помощью эффективной токенизации и открытых данных', 'desc': 'В этой статье представлен новый подход к токенизации изображений для генеративных моделей текст-в-изображение под названием TA-TiTok. Данный токенизатор использует одномерные токены и интегрирует текстовую информацию на этапе детокенизации, что ускоряет сходимость и улучшает производительность. На основе TA-TiTok авторы разработали семейство моделей MaskGen, обученных исключительно на открытых данных. 
Целью работы является демократизация области генеративных моделей текст-в-изображение путем публикации эффективных токенизаторов и моделей с открытыми весами.'}, 'en': {'title': 'Democratizing Text-to-Image Generation with TA-TiTok', 'desc': 'This paper presents TA-TiTok, a novel image tokenizer designed for text-to-image generative models, which simplifies the training process and improves performance. Unlike traditional models that require large private datasets, TA-TiTok can effectively utilize open data, making it more accessible for researchers. The tokenizer incorporates textual information during the decoding stage, which helps it learn faster and perform better. Additionally, the authors introduce MaskGen, a family of generative models that leverage TA-TiTok and are trained on publicly available datasets, aiming to democratize access to advanced text-to-image generation technology.'}, 'zh': {'title': '高效的文本到图像生成模型,推动开放数据的使用', 'desc': '本文介绍了一种新的图像标记器,称为TA-TiTok,它可以有效地处理文本到图像的生成任务。TA-TiTok在解码阶段整合了文本信息,从而加快了模型的收敛速度并提高了性能。与以往的标记器不同,TA-TiTok采用了一种简化的一阶段训练过程,避免了复杂的两阶段蒸馏过程。我们还提出了一系列基于开放数据训练的文本到图像生成模型MaskGen,旨在促进更广泛的访问和民主化。'}}}, {'id': 'https://huggingface.co/papers/2501.05131', 'title': '3DIS-FLUX: simple and efficient multi-instance generation with DiT rendering', 'url': 'https://huggingface.co/papers/2501.05131', 'abstract': "The growing demand for controllable outputs in text-to-image generation has driven significant advancements in multi-instance generation (MIG), enabling users to define both instance layouts and attributes. Currently, the state-of-the-art methods in MIG are primarily adapter-based. However, these methods necessitate retraining a new adapter each time a more advanced model is released, resulting in significant resource consumption. A methodology named Depth-Driven Decoupled Instance Synthesis (3DIS) has been introduced, which decouples MIG into two distinct phases: 1) depth-based scene construction and 2) detail rendering with widely pre-trained depth control models. The 3DIS method requires adapter training solely during the scene construction phase, while enabling various models to perform training-free detail rendering. Initially, 3DIS focused on rendering techniques utilizing U-Net architectures such as SD1.5, SD2, and SDXL, without exploring the potential of recent DiT-based models like FLUX. In this paper, we present 3DIS-FLUX, an extension of the 3DIS framework that integrates the FLUX model for enhanced rendering capabilities. Specifically, we employ the FLUX.1-Depth-dev model for depth map controlled image generation and introduce a detail renderer that manipulates the Attention Mask in FLUX's Joint Attention mechanism based on layout information. This approach allows for the precise rendering of fine-grained attributes of each instance. Our experimental results indicate that 3DIS-FLUX, leveraging the FLUX model, outperforms the original 3DIS method, which utilized SD2 and SDXL, and surpasses current state-of-the-art adapter-based methods in terms of both performance and image quality. 
Project Page: https://limuloo.github.io/3DIS/.", 'score': 9, 'issue_id': 1684, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ca5ad23cb146f3aa', 'authors': ['Dewei Zhou', 'Ji Xie', 'Zongxin Yang', 'Yi Yang'], 'affiliations': ['DBMI, HMS, Harvard University', 'RELER, CCAI, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05131.jpg', 'data': {'categories': ['#cv', '#games', '#architecture', '#multimodal', '#optimization'], 'emoji': '🎨', 'ru': {'title': '3DIS-FLUX: Новый уровень контролируемой генерации мульти-объектных изображений', 'desc': 'Статья представляет метод 3DIS-FLUX для управляемой генерации изображений с несколькими объектами. Этот подход разделяет процесс на создание сцены на основе глубины и детализированный рендеринг с использованием предобученных моделей контроля глубины. 3DIS-FLUX интегрирует модель FLUX для улучшенного рендеринга, манипулируя маской внимания в механизме совместного внимания FLUX. Эксперименты показывают, что 3DIS-FLUX превосходит предыдущие методы по производительности и качеству изображений.'}, 'en': {'title': 'Enhancing Text-to-Image Generation with 3DIS-FLUX', 'desc': 'This paper introduces a new method called 3DIS-FLUX for improving text-to-image generation by enhancing the multi-instance generation (MIG) process. The 3DIS framework separates the generation into two phases: constructing the scene based on depth and rendering details using pre-trained models. By integrating the FLUX model, the method allows for better control over the rendering of fine details while reducing the need for retraining adapters. Experimental results show that 3DIS-FLUX outperforms previous methods in both performance and image quality, making it a significant advancement in controllable image generation.'}, 'zh': {'title': '深度驱动解耦实例合成:提升图像生成的可控性与质量', 'desc': '随着对可控文本到图像生成输出的需求增加,多实例生成(MIG)技术得到了显著进展。现有的MIG方法主要基于适配器,但每次新模型发布时都需要重新训练适配器,消耗大量资源。本文提出了一种名为深度驱动解耦实例合成(3DIS)的方法,将MIG分为两个阶段:基于深度的场景构建和细节渲染。通过引入FLUX模型,3DIS-FLUX在细节渲染方面实现了更高的性能和图像质量。'}}}, {'id': 'https://huggingface.co/papers/2501.08328', 'title': 'PokerBench: Training Large Language Models to become Professional Poker Players', 'url': 'https://huggingface.co/papers/2501.08328', 'abstract': 'We introduce PokerBench - a benchmark for evaluating the poker-playing abilities of large language models (LLMs). As LLMs excel in traditional NLP tasks, their application to complex, strategic games like poker poses a new challenge. Poker, an incomplete information game, demands a multitude of skills such as mathematics, reasoning, planning, strategy, and a deep understanding of game theory and human psychology. This makes Poker the ideal next frontier for large language models. PokerBench consists of a comprehensive compilation of 11,000 most important scenarios, split between pre-flop and post-flop play, developed in collaboration with trained poker players. We evaluate prominent models including GPT-4, ChatGPT 3.5, and various Llama and Gemma series models, finding that all state-of-the-art LLMs underperform in playing optimal poker. However, after fine-tuning, these models show marked improvements. We validate PokerBench by having models with different scores compete with each other, demonstrating that higher scores on PokerBench lead to higher win rates in actual poker games. 
Through gameplay between our fine-tuned model and GPT-4, we also identify limitations of simple supervised fine-tuning for learning optimal playing strategy, suggesting the need for more advanced methodologies for effectively training language models to excel in games. PokerBench thus presents a unique benchmark for a quick and reliable evaluation of the poker-playing ability of LLMs as well as a comprehensive benchmark to study the progress of LLMs in complex game-playing scenarios. The dataset and code will be made available at: https://github.com/pokerllm/pokerbench.', 'score': 9, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '7b4dacedffdbfa15', 'authors': ['Richard Zhuang', 'Akshat Gupta', 'Richard Yang', 'Aniket Rahane', 'Zhengyu Li', 'Gopala Anumanchipalli'], 'affiliations': ['Georgia Institute of Technology', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.08328.jpg', 'data': {'categories': ['#training', '#reasoning', '#games', '#optimization', '#benchmark'], 'emoji': '🃏', 'ru': {'title': 'PokerBench: новый рубеж для оценки стратегических способностей языковых моделей', 'desc': 'PokerBench - это новый бенчмарк для оценки способностей больших языковых моделей (LLM) играть в покер. Он включает 11000 важнейших сценариев игры, разработанных совместно с профессиональными игроками. Авторы оценили производительность современных LLM, таких как GPT-4 и ChatGPT 3.5, обнаружив, что все модели показывают результаты ниже оптимальных. После дообучения модели демонстрируют значительное улучшение, но авторы отмечают ограничения простого обучения с учителем для освоения оптимальной стратегии игры.'}, 'en': {'title': 'PokerBench: Elevating LLMs to Master the Game of Poker', 'desc': 'PokerBench is a new benchmark designed to assess the poker-playing skills of large language models (LLMs). It focuses on the unique challenges of poker, which requires a blend of mathematical skills, strategic reasoning, and an understanding of human psychology. The benchmark includes 11,000 scenarios that cover various aspects of the game, and it has been tested on several leading models, revealing that they initially struggle with optimal poker play. However, after fine-tuning, these models show significant improvement, highlighting the need for advanced training techniques to enhance their performance in complex games.'}, 'zh': {'title': 'PokerBench:评估语言模型扑克能力的新基准', 'desc': '我们介绍了PokerBench,这是一个用于评估大型语言模型(LLMs)扑克游戏能力的基准。扑克是一种不完全信息游戏,需要数学、推理、规划、策略以及对博弈论和人类心理的深刻理解。PokerBench包含11,000个重要场景,分为翻牌前和翻牌后游戏,经过训练的扑克玩家共同开发。通过对不同模型的评估,我们发现尽管当前的LLMs在扑克游戏中表现不佳,但经过微调后,它们的表现有显著提升。'}}}, {'id': 'https://huggingface.co/papers/2501.08319', 'title': 'Enhancing Automated Interpretability with Output-Centric Feature Descriptions', 'url': 'https://huggingface.co/papers/2501.08319', 'abstract': 'Automated interpretability pipelines generate natural language descriptions for the concepts represented by features in large language models (LLMs), such as plants or the first word in a sentence. These descriptions are derived using inputs that activate the feature, which may be a dimension or a direction in the model\'s representation space. However, identifying activating inputs is costly, and the mechanistic role of a feature in model behavior is determined both by how inputs cause a feature to activate and by how feature activation affects outputs. 
Using steering evaluations, we reveal that current pipelines provide descriptions that fail to capture the causal effect of the feature on outputs. To fix this, we propose efficient, output-centric methods for automatically generating feature descriptions. These methods use the tokens weighted higher after feature stimulation or the highest weight tokens after applying the vocabulary "unembedding" head directly to the feature. Our output-centric descriptions better capture the causal effect of a feature on model outputs than input-centric descriptions, but combining the two leads to the best performance on both input and output evaluations. Lastly, we show that output-centric descriptions can be used to find inputs that activate features previously thought to be "dead".', 'score': 7, 'issue_id': 1677, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '22615e3bb16f93af', 'authors': ['Yoav Gur-Arieh', 'Roy Mayan', 'Chen Agassy', 'Atticus Geiger', 'Mor Geva'], 'affiliations': ['Blavatnik School of Computer Science and AI, Tel Aviv University', 'Pr(Ai)2R Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.08319.jpg', 'data': {'categories': ['#interpretability', '#inference', '#training', '#data'], 'emoji': '🔍', 'ru': {'title': 'Взгляд изнутри: новый метод интерпретации больших языковых моделей', 'desc': 'Статья описывает новый подход к автоматической интерпретации нейронных сетей, фокусируясь на выходных данных модели вместо входных. Авторы предлагают эффективные методы для генерации описаний признаков, основанные на токенах с наибольшим весом после стимуляции признака. Эксперименты показывают, что ориентированные на выход описания лучше отражают причинно-следственное влияние признака на результаты модели. Комбинация подходов, ориентированных на вход и выход, дает наилучшие результаты в оценке как входных, так и выходных данных.'}, 'en': {'title': 'Unlocking Feature Interpretability in Language Models', 'desc': 'This paper discusses how automated interpretability pipelines can create natural language descriptions for features in large language models (LLMs). It highlights the challenge of identifying inputs that activate these features, which is essential for understanding their role in model behavior. The authors propose new methods that focus on the output effects of features, leading to more accurate descriptions of their causal impact. By combining both input-centric and output-centric approaches, the proposed methods improve the overall interpretability of LLMs and can even identify previously overlooked features.'}, 'zh': {'title': '以输出为中心的特征描述生成方法', 'desc': '这篇论文讨论了自动化可解释性管道如何为大型语言模型中的特征生成自然语言描述。特征的描述是通过激活特征的输入生成的,但识别这些输入的过程成本高昂。研究表明,现有的描述方法未能有效捕捉特征对输出的因果影响。为此,作者提出了一种以输出为中心的方法,能够更好地生成特征描述,并结合输入和输出的评估来提高性能。'}}}, {'id': 'https://huggingface.co/papers/2501.08197', 'title': 'OpenCSG Chinese Corpus: A Series of High-quality Chinese Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08197', 'abstract': 'Large language models (LLMs) have demonstrated remarkable capabilities, but their success heavily relies on the quality of pretraining corpora. For Chinese LLMs, the scarcity of high-quality Chinese datasets presents a significant challenge, often limiting their performance. To address this issue, we propose the OpenCSG Chinese Corpus, a series of high-quality datasets specifically designed for LLM pretraining, post-training, and fine-tuning. 
This corpus includes Fineweb-edu-chinese, Fineweb-edu-chinese-v2, Cosmopedia-chinese, and Smoltalk-chinese, each with distinct characteristics: Fineweb-edu datasets focus on filtered, high-quality content derived from diverse Chinese web sources; Cosmopedia-chinese provides synthetic, textbook-style data for knowledge-intensive training; and Smoltalk-chinese emphasizes stylistic and diverse chat-format data. The OpenCSG Chinese Corpus is characterized by its high-quality text, diverse coverage across domains, and scalable, reproducible data curation processes. Additionally, we conducted extensive experimental analyses, including evaluations on smaller parameter models, which demonstrated significant performance improvements in tasks such as C-Eval, showcasing the effectiveness of the corpus for training Chinese LLMs.', 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '27267ae1a569051c', 'authors': ['Yijiong Yu', 'Ziyun Dai', 'Zekun Wang', 'Wei Wang', 'Ran Chen', 'Ji Pei'], 'affiliations': ['OpenCSG', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08197.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#synthetic', '#training', '#low_resource'], 'emoji': '🐉', 'ru': {'title': 'Прорыв в обучении китайских языковых моделей: OpenCSG Chinese Corpus', 'desc': 'Эта статья представляет OpenCSG Chinese Corpus - набор высококачественных китайских датасетов для предобучения, пост-обучения и тонкой настройки больших языковых моделей (LLM). Корпус включает в себя несколько датасетов, каждый с уникальными характеристиками: от отфильтрованного веб-контента до синтетических учебных данных и разговорных форматов. Авторы подчеркивают высокое качество текста, разнообразие тематик и масштабируемость процесса сбора данных. Эксперименты показали значительное улучшение производительности моделей на различных задачах, включая C-Eval.'}, 'en': {'title': 'Empowering Chinese LLMs with OpenCSG Corpus', 'desc': 'This paper introduces the OpenCSG Chinese Corpus, a collection of high-quality datasets aimed at improving the performance of Chinese large language models (LLMs). The corpus includes several datasets, each tailored for different training needs: Fineweb-edu datasets focus on high-quality web content, Cosmopedia-chinese offers synthetic textbook-style data, and Smoltalk-chinese provides diverse chat-format data. The authors highlight the importance of quality pretraining data for LLMs and demonstrate through experiments that using this corpus leads to significant performance gains in various evaluation tasks. Overall, the OpenCSG Chinese Corpus addresses the challenge of limited high-quality datasets for Chinese LLMs, promoting better training outcomes.'}, 'zh': {'title': '提升中文LLM性能的高质量语料库', 'desc': '大型语言模型(LLMs)在处理自然语言方面表现出色,但其成功依赖于高质量的预训练语料库。针对中文LLMs,优质中文数据集的稀缺性成为了一个重大挑战,限制了它们的性能。为了解决这个问题,我们提出了OpenCSG中文语料库,这是一系列专门为LLM预训练、后训练和微调设计的高质量数据集。该语料库包括Fineweb-edu-chinese、Fineweb-edu-chinese-v2、Cosmopedia-chinese和Smoltalk-chinese,涵盖了多样化的内容和风格,显著提升了中文LLMs的训练效果。'}}}, {'id': 'https://huggingface.co/papers/2501.08167', 'title': 'Potential and Perils of Large Language Models as Judges of Unstructured Textual Data', 'url': 'https://huggingface.co/papers/2501.08167', 'abstract': "Rapid advancements in large language models have unlocked remarkable capabilities when it comes to processing and summarizing unstructured text data. 
This has implications for the analysis of rich, open-ended datasets, such as survey responses, where LLMs hold the promise of efficiently distilling key themes and sentiments. However, as organizations increasingly turn to these powerful AI systems to make sense of textual feedback, a critical question arises: can we trust LLMs to accurately represent the perspectives contained within these text-based datasets? While LLMs excel at generating human-like summaries, there is a risk that their outputs may inadvertently diverge from the true substance of the original responses. Discrepancies between the LLM-generated outputs and the actual themes present in the data could lead to flawed decision-making, with far-reaching consequences for organizations. This research investigates the effectiveness of LLMs as judge models to evaluate the thematic alignment of summaries generated by other LLMs. We utilized an Anthropic Claude model to generate thematic summaries from open-ended survey responses, with Amazon's Titan Express, Nova Pro, and Meta's Llama serving as LLM judges. The LLM-as-judge approach was compared to human evaluations using Cohen's kappa, Spearman's rho, and Krippendorff's alpha, validating a scalable alternative to traditional human-centric evaluation methods. Our findings reveal that while LLMs as judges offer a scalable solution comparable to human raters, humans may still excel at detecting subtle, context-specific nuances. This research contributes to the growing body of knowledge on AI-assisted text analysis. We discuss limitations and provide recommendations for future research, emphasizing the need for careful consideration when generalizing LLM judge models across various contexts and use cases.", 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '866161709624c632', 'authors': ['Rewina Bedemariam', 'Natalie Perez', 'Sreyoshi Bhaduri', 'Satya Kapoor', 'Alex Gil', 'Elizabeth Conjar', 'Ikkei Itoku', 'David Theil', 'Aman Chadha', 'Naumaan Nayyar'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08167.jpg', 'data': {'categories': ['#data', '#dataset', '#science', '#ethics', '#multimodal', '#benchmark', '#interpretability'], 'emoji': '🤖', 'ru': {'title': 'LLM как судьи: масштабируемая альтернатива человеческим оценкам в анализе текста', 'desc': 'Исследование посвящено использованию больших языковых моделей (LLM) для анализа неструктурированных текстовых данных, таких как ответы на опросы. Авторы изучают эффективность применения LLM в качестве судей для оценки тематического соответствия сгенерированных другими LLM резюме. Результаты показывают, что LLM-судьи предлагают масштабируемое решение, сопоставимое с оценками людей, хотя люди все еще могут превосходить их в обнаружении тонких, контекстно-зависимых нюансов. Исследование вносит вклад в растущий объем знаний об анализе текста с помощью искусственного интеллекта.'}, 'en': {'title': 'Trusting AI: Evaluating LLMs for Accurate Text Analysis', 'desc': 'This paper explores the use of large language models (LLMs) for summarizing and analyzing unstructured text data, particularly from open-ended survey responses. It raises concerns about the trustworthiness of LLM-generated summaries, as they may not accurately reflect the original sentiments and themes present in the data. 
The research introduces an LLM-as-judge framework, where one LLM generates summaries while others evaluate their thematic alignment, comparing this method to human evaluations. The findings suggest that while LLMs can provide a scalable alternative to human raters, they may struggle with detecting subtle nuances that humans can identify, highlighting the importance of careful application in different contexts.'}, 'zh': {'title': '信任大型语言模型的总结能力吗?', 'desc': '这篇论文探讨了大型语言模型(LLMs)在处理和总结非结构化文本数据方面的能力,尤其是在分析开放式调查反馈时的应用。研究表明,虽然LLMs能够生成类似人类的总结,但它们的输出可能与原始文本的真实主题存在偏差,这可能导致错误的决策。为了评估LLMs生成的总结与实际主题的一致性,研究使用了LLMs作为评判模型,并与人类评估进行了比较。结果显示,LLMs作为评判者提供了一种可扩展的解决方案,但人类在捕捉细微的上下文特征方面仍然表现更佳。'}}}, {'id': 'https://huggingface.co/papers/2501.07888', 'title': 'Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding', 'url': 'https://huggingface.co/papers/2501.07888', 'abstract': 'We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) designed for generating detailed and accurate video descriptions, while also exhibiting superior general video understanding capabilities. Tarsier2 achieves significant advancements through three key upgrades: (1) Scaling pre-training data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) Performing fine-grained temporal alignment during supervised fine-tuning; (3) Using model-based sampling to automatically construct preference data and applying DPO training for optimization. Extensive experiments show that Tarsier2-7B consistently outperforms leading proprietary models, including GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K benchmark, Tarsier2-7B improves F1 by 2.8% over GPT-4o and 5.8% over Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6% performance advantage over GPT-4o and +24.9% over Gemini-1.5-Pro. Tarsier2-7B also sets new state-of-the-art results across 15 public benchmarks, spanning tasks such as video question-answering, video grounding, hallucination test, and embodied question-answering, demonstrating its versatility as a robust generalist vision-language model.', 'score': 5, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '54780a4b6f93fb10', 'authors': ['Liping Yuan', 'Jiawei Wang', 'Haomiao Sun', 'Yuchen Zhang', 'Yuan Lin'], 'affiliations': ['ByteDance Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.07888.jpg', 'data': {'categories': ['#dataset', '#training', '#cv', '#hallucinations', '#optimization', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Tarsier2: Революция в понимании видео искусственным интеллектом', 'desc': 'Tarsier2 - это современная крупномасштабная модель для понимания видео и языка (LVLM), разработанная для создания детальных и точных описаний видео. Модель достигает значительных улучшений благодаря увеличению объема обучающих данных, точной временной синхронизации при тонкой настройке и применению обучения с предпочтениями (DPO). Tarsier2-7B превосходит ведущие проприетарные модели, такие как GPT-4o и Gemini 1.5 Pro, в задачах детального описания видео. 
Модель также устанавливает новые рекорды в 15 публичных бенчмарках, демонстрируя свою универсальность как надежная модель общего назначения для понимания видео и языка.'}, 'en': {'title': 'Tarsier2: Redefining Video Understanding with Advanced LVLM Technology', 'desc': "Tarsier2 is a cutting-edge large vision-language model (LVLM) that excels in generating precise and detailed descriptions of videos while showcasing advanced video comprehension skills. The model's improvements stem from three main enhancements: increasing the pre-training dataset from 11 million to 40 million video-text pairs, implementing fine-grained temporal alignment during fine-tuning, and utilizing model-based sampling for preference data construction with DPO training for optimization. Extensive testing reveals that Tarsier2-7B surpasses top proprietary models like GPT-4o and Gemini 1.5 Pro in video description tasks, achieving notable F1 score improvements on the DREAM-1K benchmark. Additionally, Tarsier2-7B sets new records across 15 public benchmarks, proving its effectiveness in various tasks such as video question-answering and video grounding."}, 'zh': {'title': 'Tarsier2:视频描述的新标杆', 'desc': 'Tarsier2是一种先进的大型视觉语言模型,专门用于生成详细且准确的视频描述,同时具备出色的视频理解能力。该模型通过三个关键升级实现了显著进步:首先,预训练数据从1100万对视频文本扩展到4000万对,增加了数据的数量和多样性;其次,在监督微调过程中进行精细的时间对齐;最后,采用基于模型的采样自动构建偏好数据,并应用DPO训练进行优化。实验结果表明,Tarsier2-7B在视频描述任务中持续超越领先的专有模型,展现出其作为强大通用视觉语言模型的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.08292', 'title': 'HALoGEN: Fantastic LLM Hallucinations and Where to Find Them', 'url': 'https://huggingface.co/papers/2501.08292', 'abstract': 'Despite their impressive ability to generate high-quality and fluent text, generative large language models (LLMs) also produce hallucinations: statements that are misaligned with established world knowledge or provided input context. However, measuring hallucination can be challenging, as having humans verify model generations on-the-fly is both expensive and time-consuming. In this work, we release HALoGEN, a comprehensive hallucination benchmark consisting of: (1) 10,923 prompts for generative models spanning nine domains including programming, scientific attribution, and summarization, and (2) automatic high-precision verifiers for each use case that decompose LLM generations into atomic units, and verify each unit against a high-quality knowledge source. We use this framework to evaluate ~150,000 generations from 14 language models, finding that even the best-performing models are riddled with hallucinations (sometimes up to 86% of generated atomic facts depending on the domain). We further define a novel error classification for LLM hallucinations based on whether they likely stem from incorrect recollection of training data (Type A errors), or incorrect knowledge in training data (Type B errors), or are fabrication (Type C errors). 
We hope our framework provides a foundation to enable the principled study of why generative models hallucinate, and advances the development of trustworthy large language models.', 'score': 5, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f6751d682ff824ed', 'authors': ['Abhilasha Ravichander', 'Shrusti Ghela', 'David Wadden', 'Yejin Choi'], 'affiliations': ['Google', 'NVIDIA', 'University of Washington'], 'pdf_title_img': 'assets/pdf/title_img/2501.08292.jpg', 'data': {'categories': ['#dataset', '#hallucinations', '#benchmark'], 'emoji': '🔍', 'ru': {'title': 'HALoGEN: Автоматическая проверка галлюцинаций в языковых моделях', 'desc': 'Эта статья представляет HALoGEN - комплексный инструмент для оценки галлюцинаций в больших языковых моделях (LLM). Авторы создали набор из 10,923 промптов в девяти различных областях и автоматические верификаторы высокой точности для проверки генераций LLM. Исследование выявило, что даже лучшие модели страдают от галлюцинаций, иногда до 86% сгенерированных фактов оказываются неверными. Авторы также предложили новую классификацию ошибок LLM, разделив их на три типа в зависимости от источника галлюцинаций.'}, 'en': {'title': 'HALoGEN: A Benchmark for Measuring Hallucinations in Language Models', 'desc': 'This paper introduces HALoGEN, a new benchmark designed to measure hallucinations in generative large language models (LLMs). Hallucinations refer to incorrect statements generated by these models that do not align with known facts or the given context. The benchmark includes over 10,000 prompts across various domains and employs automatic verifiers to assess the accuracy of model outputs. The study reveals that even top-performing models exhibit significant hallucinations, prompting a classification system for different types of errors to better understand their origins and improve model reliability.'}, 'zh': {'title': '揭示生成模型的幻觉问题', 'desc': '尽管生成性大型语言模型(LLMs)能够生成高质量和流畅的文本,但它们也会产生幻觉,即与已知世界知识或输入上下文不一致的陈述。测量幻觉的难度在于,实时验证模型生成的内容既昂贵又耗时。为此,我们推出了HALoGEN,这是一个全面的幻觉基准,包含10,923个跨越九个领域的提示和自动高精度验证器。我们的研究发现,即使是表现最好的模型,其生成的原子事实中也有高达86%可能存在幻觉,这为理解生成模型的幻觉提供了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.08284', 'title': 'AfriHate: A Multilingual Collection of Hate Speech and Abusive Language Datasets for African Languages', 'url': 'https://huggingface.co/papers/2501.08284', 'abstract': 'Hate speech and abusive language are global phenomena that need socio-cultural background knowledge to be understood, identified, and moderated. However, in many regions of the Global South, there have been several documented occurrences of (1) absence of moderation and (2) censorship due to the reliance on keyword spotting out of context. Further, high-profile individuals have frequently been at the center of the moderation process, while large and targeted hate speech campaigns against minorities have been overlooked. These limitations are mainly due to the lack of high-quality data in the local languages and the failure to include local communities in the collection, annotation, and moderation processes. To address this issue, we present AfriHate: a multilingual collection of hate speech and abusive language datasets in 15 African languages. Each instance in AfriHate is annotated by native speakers familiar with the local culture. We report the challenges related to the construction of the datasets and present various classification baseline results with and without using LLMs. 
The datasets, individual annotations, and hate speech and offensive language lexicons are available on https://github.com/AfriHate/AfriHate', 'score': 3, 'issue_id': 1676, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '8c76dd102740009c', 'authors': ['Shamsuddeen Hassan Muhammad', 'Idris Abdulmumin', 'Abinew Ali Ayele', 'David Ifeoluwa Adelani', 'Ibrahim Said Ahmad', 'Saminu Mohammad Aliyu', 'Nelson Odhiambo Onyango', 'Lilian D. A. Wanzare', 'Samuel Rutunda', 'Lukman Jibril Aliyu', 'Esubalew Alemneh', 'Oumaima Hourrane', 'Hagos Tesfahun Gebremichael', 'Elyas Abdi Ismail', 'Meriem Beloucif', 'Ebrahim Chekol Jibril', 'Andiswa Bukula', 'Rooweither Mabuya', 'Salomey Osei', 'Abigail Oppong', 'Tadesse Destaw Belay', 'Tadesse Kebede Guge', 'Tesfa Tegegne Asfaw', 'Chiamaka Ijeoma Chukwuneke', 'Paul Röttger', 'Seid Muhie Yimam', 'Nedjma Ousidhoum'], 'affiliations': ['Addis Ababa University', 'Al Akhawayn University', 'Bahir Dar University', 'Bayero University Kano', 'Bocconi University', 'Cardiff University', 'DSFSI, University of Pretoria', 'Digital Umuganda', 'Haramaya University', 'HausaNLP', 'Imperial College London', 'Independent Researcher', 'Instituto Politécnico Nacional', 'Istanbul Technical University', 'Lancaster University', 'Maseno University', 'Mila, McGill University & Canada CIFAR AI Chair', 'Northeastern University', 'SADiLaR', 'University of Deusto', 'University of Hamburg', 'Uppsala University', 'Wollo University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08284.jpg', 'data': {'categories': ['#dataset', '#ethics', '#multilingual', '#data', '#low_resource', '#open_source'], 'emoji': '🌍', 'ru': {'title': 'AfriHate: борьба с языком вражды в Африке с помощью локальных данных и экспертизы', 'desc': 'Статья представляет AfriHate - многоязычный набор данных по языку вражды и оскорбительной лексике на 15 африканских языках. Датасет создан для решения проблемы недостатка качественных данных на местных языках и отсутствия вовлечения локальных сообществ в процессы сбора, разметки и модерации контента. Каждый пример в AfriHate размечен носителями языка, знакомыми с местной культурой. Авторы описывают трудности, связанные с созданием датасетов, и представляют результаты базовых классификационных моделей, в том числе с использованием языковых моделей.'}, 'en': {'title': 'Empowering Local Voices Against Hate Speech with AfriHate', 'desc': 'This paper addresses the challenges of identifying and moderating hate speech in the Global South, particularly in African languages. It highlights the limitations of existing moderation techniques that rely on keyword spotting without cultural context, leading to ineffective censorship and oversight of targeted hate campaigns. To combat this, the authors introduce AfriHate, a multilingual dataset of hate speech and abusive language in 15 African languages, annotated by native speakers. 
The paper also discusses the difficulties faced during dataset construction and presents baseline classification results, demonstrating the potential of using large language models (LLMs) for this task.'}, 'zh': {'title': '构建多语言仇恨言论数据集,助力社会文化理解', 'desc': '本论文介绍了AfriHate,这是一个包含15种非洲语言的仇恨言论和辱骂语言数据集。该数据集由熟悉当地文化的母语者进行标注,以解决全球南方地区在仇恨言论管理中的数据缺乏问题。研究还探讨了数据集构建过程中的挑战,并展示了使用和不使用大型语言模型(LLMs)进行分类的基线结果。所有数据集、标注和相关词汇表均可在指定网站上获取。'}}}, {'id': 'https://huggingface.co/papers/2501.08120', 'title': 'In-situ graph reasoning and knowledge expansion using Graph-PReFLexOR', 'url': 'https://huggingface.co/papers/2501.08120', 'abstract': "The pursuit of automated scientific discovery has fueled progress from symbolic logic to modern AI, forging new frontiers in reasoning and pattern recognition. Transformers function as potential systems, where every possible relationship remains latent potentiality until tasks impose constraints, akin to measurement. Yet, refining their sampling requires more than probabilistic selection: solutions must conform to specific structures or rules, ensuring consistency and the invocation of general principles. We present Graph-PReFLexOR (Graph-based Preference-based Recursive Language Modeling for Exploratory Optimization of Reasoning), a framework that combines graph reasoning with symbolic abstraction to dynamically expand domain knowledge. Inspired by reinforcement learning, Graph-PReFLexOR defines reasoning as a structured mapping, where tasks yield knowledge graphs, abstract patterns, and ultimately, final answers. Inspired by category theory, it encodes concepts as nodes and their relationships as edges, supporting hierarchical inference and adaptive learning through isomorphic representations. Demonstrations include hypothesis generation, materials design, and creative reasoning, such as discovering relationships between mythological concepts like 'thin places' with materials science. We propose a 'knowledge garden growth' strategy that integrates insights across domains, promoting interdisciplinary connections. Results with a 3-billion-parameter Graph-PReFLexOR model show superior reasoning depth and adaptability, underscoring the potential for transparent, multidisciplinary AI-driven discovery. It lays the groundwork for general autonomous reasoning solutions.", 'score': 1, 'issue_id': 1683, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f8f5360d1fb8fb75', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics, MIT, Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.08120.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agents', '#graphs', '#rl', '#science', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Graph-PReFLexOR: Новый горизонт в автономном научном открытии', 'desc': 'Статья представляет Graph-PReFLexOR - фреймворк, объединяющий графовые рассуждения с символьной абстракцией для динамического расширения предметных знаний. Вдохновленный теорией категорий, он кодирует концепции как узлы, а их отношения как ребра, поддерживая иерархический вывод и адаптивное обучение. Демонстрации включают генерацию гипотез, дизайн материалов и творческие рассуждения, такие как обнаружение связей между мифологическими концепциями и материаловедением. 
Результаты с 3-миллиардной моделью Graph-PReFLexOR показывают превосходную глубину рассуждений и адаптивность, подчеркивая потенциал для прозрачных, междисциплинарных решений на основе ИИ.'}, 'en': {'title': 'Empowering AI with Graph-Based Reasoning for Scientific Discovery', 'desc': 'This paper introduces Graph-PReFLexOR, a novel framework that enhances automated scientific discovery by integrating graph reasoning with symbolic abstraction. It utilizes a structured mapping approach inspired by reinforcement learning, allowing for the generation of knowledge graphs and abstract patterns from various tasks. The framework supports hierarchical inference and adaptive learning, enabling it to explore interdisciplinary connections effectively. Demonstrations of its capabilities include hypothesis generation and creative reasoning, showcasing its potential for deep and adaptable reasoning in AI-driven discovery.'}, 'zh': {'title': '知识花园的成长:跨领域的智能推理', 'desc': '这篇论文介绍了一种名为Graph-PReFLexOR的框架,它结合了图推理和符号抽象,以动态扩展领域知识。该框架通过结构化映射定义推理,利用知识图谱和抽象模式来生成最终答案。它的灵感来自强化学习和范畴理论,将概念编码为节点,关系编码为边,支持层次推理和自适应学习。实验结果表明,Graph-PReFLexOR在推理深度和适应性方面表现优越,为自动化推理解决方案奠定了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.07556', 'title': 'MatchAnything: Universal Cross-Modality Image Matching with Large-Scale Pre-Training', 'url': 'https://huggingface.co/papers/2501.07556', 'abstract': 'Image matching, which aims to identify corresponding pixel locations between images, is crucial in a wide range of scientific disciplines, aiding in image registration, fusion, and analysis. In recent years, deep learning-based image matching algorithms have dramatically outperformed humans in rapidly and accurately finding large amounts of correspondences. However, when dealing with images captured under different imaging modalities that result in significant appearance changes, the performance of these algorithms often deteriorates due to the scarcity of annotated cross-modal training data. This limitation hinders applications in various fields that rely on multiple image modalities to obtain complementary information. To address this challenge, we propose a large-scale pre-training framework that utilizes synthetic cross-modal training signals, incorporating diverse data from various sources, to train models to recognize and match fundamental structures across images. This capability is transferable to real-world, unseen cross-modality image matching tasks. Our key finding is that the matching model trained with our framework achieves remarkable generalizability across more than eight unseen cross-modality registration tasks using the same network weight, substantially outperforming existing methods, whether designed for generalization or tailored for specific tasks. 
This advancement significantly enhances the applicability of image matching technologies across various scientific disciplines and paves the way for new applications in multi-modality human and artificial intelligence analysis and beyond.', 'score': 0, 'issue_id': 1688, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': 'ad0c408491c545d5', 'authors': ['Xingyi He', 'Hao Yu', 'Sida Peng', 'Dongli Tan', 'Zehong Shen', 'Hujun Bao', 'Xiaowei Zhou'], 'affiliations': ['Shandong University', 'State Key Lab of CAD&CG, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07556.jpg', 'data': {'categories': ['#synthetic', '#dataset', '#multimodal', '#transfer_learning', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Универсальное сопоставление изображений разных модальностей с помощью глубокого обучения', 'desc': 'Статья представляет новый подход к сопоставлению изображений разных модальностей с использованием глубокого обучения. Авторы предлагают фреймворк для предварительного обучения на синтетических кросс-модальных данных, что позволяет модели распознавать фундаментальные структуры в изображениях. Обученная модель демонстрирует впечатляющую обобщаемость на более чем восемь новых задач кросс-модальной регистрации, значительно превосходя существующие методы. Это достижение открывает новые возможности для применения технологий сопоставления изображений в различных научных дисциплинах.'}, 'en': {'title': 'Enhancing Image Matching Across Modalities with Synthetic Training', 'desc': "This paper presents a new framework for image matching that helps identify corresponding pixel locations between images taken in different ways. Traditional deep learning methods struggle with this task due to a lack of annotated training data for different image types. The proposed solution uses synthetic training signals from diverse sources to improve the model's ability to recognize and match structures across various images. As a result, the model shows excellent performance in unseen cross-modal tasks, making it highly useful for applications in many scientific fields."}, 'zh': {'title': '跨模态图像匹配的新突破', 'desc': '本文提出了一种大规模预训练框架,用于解决图像匹配中的跨模态问题。该框架利用合成的跨模态训练信号,结合来自不同来源的多样化数据,训练模型识别和匹配图像中的基本结构。研究发现,使用该框架训练的匹配模型在超过八个未见的跨模态配准任务中表现出显著的泛化能力,远超现有方法。此进展大大增强了图像匹配技术在各科学领域的适用性,并为多模态人类和人工智能分析的新应用铺平了道路。'}}}, {'id': 'https://huggingface.co/papers/2501.01895', 'title': 'EnerVerse: Envisioning Embodied Future Space for Robotics Manipulation', 'url': 'https://huggingface.co/papers/2501.01895', 'abstract': "We introduce EnerVerse, a comprehensive framework for embodied future space generation specifically designed for robotic manipulation tasks. EnerVerse seamlessly integrates convolutional and bidirectional attention mechanisms for inner-chunk space modeling, ensuring low-level consistency and continuity. Recognizing the inherent redundancy in video data, we propose a sparse memory context combined with a chunkwise unidirectional generative paradigm to enable the generation of infinitely long sequences. To further augment robotic capabilities, we introduce the Free Anchor View (FAV) space, which provides flexible perspectives to enhance observation and analysis. The FAV space mitigates motion modeling ambiguity, removes physical constraints in confined environments, and significantly improves the robot's generalization and adaptability across various tasks and settings. 
To address the prohibitive costs and labor intensity of acquiring multi-camera observations, we present a data engine pipeline that integrates a generative model with 4D Gaussian Splatting (4DGS). This pipeline leverages the generative model's robust generalization capabilities and the spatial constraints provided by 4DGS, enabling an iterative enhancement of data quality and diversity, thus creating a data flywheel effect that effectively narrows the sim-to-real gap. Finally, our experiments demonstrate that the embodied future space generation prior substantially enhances policy predictive capabilities, resulting in improved overall performance, particularly in long-range robotic manipulation tasks.", 'score': 41, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'bae2a6e63f87958d', 'authors': ['Siyuan Huang', 'Liliang Chen', 'Pengfei Zhou', 'Shengcong Chen', 'Zhengkai Jiang', 'Yue Hu', 'Peng Gao', 'Hongsheng Li', 'Maoqing Yao', 'Guanghui Ren'], 'affiliations': ['AgiBot', 'CUHK', 'FDU', 'HIT', 'HKUST', 'SJTU', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.01895.jpg', 'data': {'categories': ['#3d', '#data', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'EnerVerse: Революция в пространственном моделировании для роботов-манипуляторов', 'desc': 'EnerVerse - это комплексная система для генерации пространства будущего в задачах роботизированной манипуляции. Она использует сверточные механизмы и двунаправленное внимание для моделирования внутренних фрагментов пространства, обеспечивая согласованность на низком уровне. Система вводит пространство Free Anchor View для гибких перспектив наблюдения и анализа, улучшая обобщение и адаптивность робота. EnerVerse также включает конвейер данных, интегрирующий генеративную модель с 4D Gaussian Splatting для сужения разрыва между симуляцией и реальностью.'}, 'en': {'title': 'Empowering Robots with EnerVerse: A New Era in Space Generation and Manipulation', 'desc': 'EnerVerse is a new framework designed to help robots better understand and manipulate their environments. It uses advanced techniques like convolutional and bidirectional attention mechanisms to create a consistent model of space. By recognizing that video data often has unnecessary information, EnerVerse employs a sparse memory context to generate long sequences efficiently. Additionally, the Free Anchor View (FAV) space allows robots to observe from different angles, improving their ability to adapt and perform tasks in various settings.'}, 'zh': {'title': 'EnerVerse:提升机器人操作的未来空间生成框架', 'desc': '本文介绍了EnerVerse,这是一个专为机器人操作任务设计的未来空间生成框架。EnerVerse结合了卷积和双向注意机制,以确保内部空间建模的一致性和连续性。我们提出了一种稀疏记忆上下文和单向生成范式的结合,能够生成无限长的序列,从而提高机器人的能力。通过引入自由锚视图空间(FAV),我们增强了观察和分析的灵活性,显著改善了机器人在各种任务和环境中的泛化能力和适应性。'}}}, {'id': 'https://huggingface.co/papers/2501.01957', 'title': 'VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction', 'url': 'https://huggingface.co/papers/2501.01957', 'abstract': 'Recent Multimodal Large Language Models (MLLMs) have typically focused on integrating visual and textual modalities, with less emphasis placed on the role of speech in enhancing interaction. However, speech plays a crucial role in multimodal dialogue systems, and implementing high-performance in both vision and speech tasks remains a significant challenge due to the fundamental modality differences. 
In this paper, we propose a carefully designed multi-stage training methodology that progressively trains LLM to understand both visual and speech information, ultimately enabling fluent vision and speech interaction. Our approach not only preserves strong vision-language capacity, but also enables efficient speech-to-speech dialogue capabilities without separate ASR and TTS modules, significantly accelerating multimodal end-to-end response speed. By comparing our method against state-of-the-art counterparts across benchmarks for image, video, and speech tasks, we demonstrate that our model is equipped with both strong visual and speech capabilities, making near real-time vision and speech interaction.', 'score': 19, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'b6690c7efedf5a39', 'authors': ['Chaoyou Fu', 'Haojia Lin', 'Xiong Wang', 'Yi-Fan Zhang', 'Yunhang Shen', 'Xiaoyu Liu', 'Yangze Li', 'Zuwei Long', 'Heting Gao', 'Ke Li', 'Xiawu Zheng', 'Rongrong Ji', 'Xing Sun', 'Caifeng Shan', 'Ran He'], 'affiliations': ['CASIA', 'NJU', 'Tencent Youtu Lab', 'XMU'], 'pdf_title_img': 'assets/pdf/title_img/2501.01957.jpg', 'data': {'categories': ['#training', '#cv', '#multimodal', '#benchmark', '#audio'], 'emoji': '🗣️', 'ru': {'title': 'Революция в мультимодальном взаимодействии: речь и зрение в одной модели', 'desc': 'В статье представлена новая методология обучения мультимодальных языковых моделей, объединяющая визуальную и речевую модальности. Авторы предлагают поэтапный подход к обучению, который позволяет модели эффективно понимать как визуальную, так и речевую информацию. Модель демонстрирует высокую производительность в задачах обработки изображений, видео и речи, превосходя современные аналоги. Этот подход обеспечивает возможность ведения диалога с использованием речи и изображений в режиме, близком к реальному времени.'}, 'en': {'title': 'Enhancing Multimodal Interaction with Speech and Vision Integration', 'desc': 'This paper introduces a novel training methodology for Multimodal Large Language Models (MLLMs) that enhances their ability to process both visual and speech data. The proposed multi-stage training approach allows the model to progressively learn and integrate information from images, videos, and spoken language, facilitating seamless interaction. By eliminating the need for separate Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) modules, the model achieves faster response times in multimodal dialogues. Experimental results show that this method not only maintains strong vision-language performance but also excels in speech tasks, enabling near real-time interactions.'}, 'zh': {'title': '实现流畅的视觉与语音交互', 'desc': '最近的多模态大型语言模型(MLLMs)主要集中在视觉和文本的整合上,而对语音在增强交互中的作用关注较少。然而,语音在多模态对话系统中起着至关重要的作用,如何在视觉和语音任务中实现高性能仍然是一个重大挑战。本文提出了一种精心设计的多阶段训练方法,逐步训练大型语言模型理解视觉和语音信息,从而实现流畅的视觉和语音交互。我们的方法不仅保持了强大的视觉-语言能力,还实现了高效的语音对话能力,显著加快了多模态端到端的响应速度。'}}}, {'id': 'https://huggingface.co/papers/2501.01904', 'title': 'Virgo: A Preliminary Exploration on Reproducing o1-like MLLM', 'url': 'https://huggingface.co/papers/2501.01904', 'abstract': 'Recently, slow-thinking reasoning systems, built upon large language models (LLMs), have garnered widespread attention by scaling the thinking time during inference. There is also growing interest in adapting this capability to multimodal large language models (MLLMs). 
Given that MLLMs handle more complex data semantics across different modalities, it is intuitively more challenging to implement multimodal slow-thinking systems. To address this issue, in this paper, we explore a straightforward approach by fine-tuning a capable MLLM with a small amount of textual long-form thought data, resulting in a multimodal slow-thinking system, Virgo (Visual reasoning with long thought). We find that these long-form reasoning processes, expressed in natural language, can be effectively transferred to MLLMs. Moreover, it seems that such textual reasoning data can be even more effective than visual reasoning data in eliciting the slow-thinking capacities of MLLMs. While this work is preliminary, it demonstrates that slow-thinking capacities are fundamentally associated with the language model component, which can be transferred across modalities or domains. This finding can be leveraged to guide the development of more powerful slow-thinking reasoning systems. We release our resources at https://github.com/RUCAIBox/Virgo.', 'score': 12, 'issue_id': 1505, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '576423a20b419d0f', 'authors': ['Yifan Du', 'Zikang Liu', 'Yifan Li', 'Wayne Xin Zhao', 'Yuqi Huo', 'Bingning Wang', 'Weipeng Chen', 'Zheng Liu', 'Zhongyuan Wang', 'Ji-Rong Wen'], 'affiliations': ['BAAI', 'Baichuan AI', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01904.jpg', 'data': {'categories': ['#reasoning', '#multimodal', '#transfer_learning', '#training'], 'emoji': '🧠', 'ru': {'title': 'Обучение мультимодальных ИИ длительным рассуждениям через текст', 'desc': 'Статья описывает исследование в области мультимодальных больших языковых моделей (MLLM) и их способности к медленному мышлению. Авторы предлагают метод Virgo, который позволяет обучить MLLM длительным рассуждениям с помощью небольшого количества текстовых данных. Результаты показывают, что текстовые данные для обучения рассуждениям могут быть даже эффективнее визуальных. Это исследование демонстрирует, что способности к медленному мышлению в основном связаны с языковым компонентом модели и могут переноситься между модальностями.'}, 'en': {'title': 'Unlocking Slow-Thinking in Multimodal Models with Textual Reasoning', 'desc': 'This paper discusses the development of a multimodal slow-thinking reasoning system called Virgo, which is based on fine-tuning a multimodal large language model (MLLM) using long-form textual reasoning data. The authors found that incorporating long-form reasoning in natural language significantly enhances the slow-thinking capabilities of MLLMs, even more so than using visual reasoning data. This suggests that the slow-thinking abilities are closely linked to the language model aspect, allowing for effective transfer across different data modalities. 
The research indicates a promising direction for creating advanced reasoning systems that can handle complex data semantics.'}, 'zh': {'title': '多模态慢思维推理的探索', 'desc': '最近,基于大型语言模型(LLMs)的慢思维推理系统引起了广泛关注,尤其是在推理过程中延长思考时间的能力。本文探讨了如何将这种能力应用于多模态大型语言模型(MLLMs),尽管处理不同模态的复杂数据语义更具挑战性。我们通过微调一个强大的MLLM,使用少量的长文本思维数据,成功构建了一个多模态慢思维系统,命名为Virgo(视觉推理与长思维)。研究表明,长文本推理过程可以有效转移到MLLMs,并且这种文本推理数据在激发MLLMs的慢思维能力方面,似乎比视觉推理数据更有效。'}}}, {'id': 'https://huggingface.co/papers/2412.21059', 'title': 'VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation', 'url': 'https://huggingface.co/papers/2412.21059', 'abstract': 'We present a general strategy to aligning visual generation models -- both image and video generation -- with human preference. To start with, we build VisionReward -- a fine-grained and multi-dimensional reward model. We decompose human preferences in images and videos into multiple dimensions, each represented by a series of judgment questions, linearly weighted and summed to an interpretable and accurate score. To address the challenges of video quality assessment, we systematically analyze various dynamic features of videos, which helps VisionReward surpass VideoScore by 17.2% and achieve top performance for video preference prediction. Based on VisionReward, we develop a multi-objective preference learning algorithm that effectively addresses the issue of confounding factors within preference data. Our approach significantly outperforms existing image and video scoring methods on both machine metrics and human evaluation. All code and datasets are provided at https://github.com/THUDM/VisionReward.', 'score': 11, 'issue_id': 1510, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '1f3bb267ffa751d9', 'authors': ['Jiazheng Xu', 'Yu Huang', 'Jiale Cheng', 'Yuanming Yang', 'Jiajun Xu', 'Yuan Wang', 'Wenbo Duan', 'Shen Yang', 'Qunlin Jin', 'Shurun Li', 'Jiayan Teng', 'Zhuoyi Yang', 'Wendi Zheng', 'Xiao Liu', 'Ming Ding', 'Xiaohan Zhang', 'Xiaotao Gu', 'Shiyu Huang', 'Minlie Huang', 'Jie Tang', 'Yuxiao Dong'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2412.21059.jpg', 'data': {'categories': ['#rag', '#training', '#open_source', '#cv', '#video', '#optimization', '#alignment'], 'emoji': '🎥', 'ru': {'title': 'VisionReward: многомерная оценка визуального контента с учетом человеческих предпочтений', 'desc': 'Исследователи представили стратегию для согласования моделей генерации визуального контента с человеческими предпочтениями. Они разработали VisionReward - многомерную модель вознаграждения, которая декомпозирует предпочтения в изображениях и видео на несколько измерений. Для оценки качества видео были проанализированы различные динамические характеристики, что позволило VisionReward превзойти существующие методы на 17.2%. На основе VisionReward был разработан алгоритм многоцелевого обучения предпочтениям, эффективно решающий проблему конфаундинг-факторов в данных о предпочтениях.'}, 'en': {'title': 'Aligning Visual Generation with Human Preferences', 'desc': 'This paper introduces a method for aligning visual generation models, such as those for images and videos, with human preferences. The authors create a reward model called VisionReward, which breaks down human preferences into multiple dimensions assessed through specific judgment questions. 
They enhance video quality assessment by analyzing dynamic features, leading to a 17.2% improvement over previous methods. Additionally, a multi-objective preference learning algorithm is developed to manage confounding factors in preference data, resulting in superior performance compared to existing scoring methods.'}, 'zh': {'title': '视觉生成模型与人类偏好的完美对齐', 'desc': '本文提出了一种通用策略,用于将视觉生成模型(包括图像和视频生成)与人类偏好对齐。我们构建了VisionReward,这是一个细粒度和多维度的奖励模型,能够将人类对图像和视频的偏好分解为多个维度。通过分析视频的动态特征,VisionReward在视频偏好预测中超越了现有方法,提升了17.2%的性能。基于VisionReward,我们开发了一种多目标偏好学习算法,有效解决了偏好数据中的混淆因素问题。'}}}, {'id': 'https://huggingface.co/papers/2501.01821', 'title': 'SDPO: Segment-Level Direct Preference Optimization for Social Agents', 'url': 'https://huggingface.co/papers/2501.01821', 'abstract': "Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level methods. The turn-level method is overly fine-grained, focusing exclusively on individual turns, while session-level methods are too coarse-grained, often introducing training noise. To address these limitations, we propose Segment-Level Direct Preference Optimization (SDPO), which focuses on specific key segments within interactions to optimize multi-turn agent behavior while minimizing training noise. Evaluations on the SOTOPIA benchmark demonstrate that SDPO-tuned agents consistently outperform both existing DPO-based methods and proprietary LLMs like GPT-4o, underscoring SDPO's potential to advance the social intelligence of LLM-based agents. We release our code and data at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/SDPO.", 'score': 10, 'issue_id': 1514, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '499b008b0bce4f74', 'authors': ['Aobo Kong', 'Wentao Ma', 'Shiwan Zhao', 'Yongbin Li', 'Yuchuan Wu', 'Ke Wang', 'Xiaoqian Liu', 'Qicheng Li', 'Yong Qin', 'Fei Huang'], 'affiliations': ['TMCC, CS, Nankai University', 'Tongyi Lab', 'alibaba-inc.com'], 'pdf_title_img': 'assets/pdf/title_img/2501.01821.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#rlhf', '#agents', '#alignment', '#training'], 'emoji': '🤖', 'ru': {'title': 'SDPO: Новый шаг к созданию социально интеллектуальных ИИ-агентов', 'desc': 'В статье представлен новый метод оптимизации поведения языковых моделей (LLM) в сложных многоходовых социальных диалогах - Segment-Level Direct Preference Optimization (SDPO). SDPO фокусируется на ключевых сегментах взаимодействия, что позволяет эффективнее оптимизировать поведение агентов по сравнению с существующими методами. Эксперименты на бенчмарке SOTOPIA показали, что агенты, настроенные с помощью SDPO, превосходят как другие методы на основе DPO, так и проприетарные модели вроде GPT-4. Это демонстрирует потенциал SDPO для повышения социального интеллекта агентов на основе LLM.'}, 'en': {'title': 'Enhancing Social Intelligence in LLMs with SDPO', 'desc': "This paper introduces Segment-Level Direct Preference Optimization (SDPO), a new method for improving the performance of social agents powered by large language models (LLMs) in complex dialogues. 
Unlike existing methods that either focus too narrowly on individual turns or too broadly on entire sessions, SDPO targets specific key segments of conversations to better align agent behavior with human preferences. The approach reduces training noise and enhances the agent's ability to engage in multi-turn interactions effectively. Evaluations show that agents trained with SDPO outperform both traditional DPO methods and advanced LLMs like GPT-4o, highlighting its effectiveness in enhancing social intelligence."}, 'zh': {'title': '提升社交智能的新方法：分段级直接偏好优化', 'desc': '本论文提出了一种新的方法,称为分段级直接偏好优化(SDPO),旨在提高大型语言模型(LLM)在多轮社交对话中的表现。现有的直接偏好优化(DPO)方法在处理多轮交互时存在细粒度和粗粒度的局限性,导致训练噪声。SDPO通过关注交互中的关键段落,优化代理的多轮行为,从而减少训练噪声。实验结果表明,SDPO调优的代理在SOTOPIA基准测试中表现优于现有的DPO方法和其他大型语言模型,显示出其在提升社交智能方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01073', 'title': 'Graph Generative Pre-trained Transformer', 'url': 'https://huggingface.co/papers/2501.01073', 'abstract': "Graph generation is a critical task in numerous domains, including molecular design and social network analysis, due to its ability to model complex relationships and structured data. While most modern graph generative models utilize adjacency matrix representations, this work revisits an alternative approach that represents graphs as sequences of node set and edge set. We advocate for this approach due to its efficient encoding of graphs and propose a novel representation. Based on this representation, we introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns graph structures via next-token prediction. To further exploit G2PT's capabilities as a general-purpose foundation model, we explore fine-tuning strategies for two downstream applications: goal-oriented generation and graph property prediction. We conduct extensive experiments across multiple datasets. Results indicate that G2PT achieves superior generative performance on both generic graph and molecule datasets. Furthermore, G2PT exhibits strong adaptability and versatility in downstream tasks from molecular design to property prediction.", 'score': 9, 'issue_id': 1508, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '596abc88d57e0650', 'authors': ['Xiaohui Chen', 'Yinkai Wang', 'Jiaxing He', 'Yuanqi Du', 'Soha Hassoun', 'Xiaolin Xu', 'Li-Ping Liu'], 'affiliations': ['Cornell University', 'Northeastern University', 'Tufts University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01073.jpg', 'data': {'categories': ['#dataset', '#optimization', '#training', '#architecture', '#data', '#graphs'], 'emoji': '🕸️', 'ru': {'title': 'G2PT: Универсальный трансформер для эффективной генерации графов', 'desc': 'В статье представлена новая модель генерации графов - Graph Generative Pre-trained Transformer (G2PT). G2PT использует альтернативный подход к представлению графов в виде последовательностей множеств узлов и рёбер вместо матриц смежности. Модель обучается предсказывать следующий токен авторегрессивным способом. G2PT показывает превосходные результаты в генерации как общих графов, так и молекул, а также демонстрирует хорошую адаптивность к различным задачам.'}, 'en': {'title': 'Revolutionizing Graph Generation with G2PT', 'desc': 'This paper focuses on improving graph generation, which is important for tasks like designing molecules and analyzing social networks. 
Instead of using the common adjacency matrix, it proposes a new way to represent graphs as sequences of node and edge sets, making the encoding more efficient. The authors introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns to generate graph structures by predicting the next token in a sequence. Through various experiments, they demonstrate that G2PT outperforms existing models in generating graphs and is effective in applications like molecular design and predicting graph properties.'}, 'zh': {'title': '图生成的创新:G2PT模型', 'desc': '图生成在许多领域中非常重要,比如分子设计和社交网络分析,因为它能够建模复杂的关系和结构化数据。本文提出了一种新的图表示方法,将图表示为节点集和边集的序列,而不是传统的邻接矩阵。基于这种表示,我们引入了图生成预训练变换器(G2PT),这是一种通过下一个标记预测学习图结构的自回归模型。实验结果表明,G2PT在通用图和分子数据集上表现出色,并且在分子设计和属性预测等下游任务中具有很强的适应性和多功能性。'}}}, {'id': 'https://huggingface.co/papers/2501.00874', 'title': 'LUSIFER: Language Universal Space Integration for Enhanced Multilingual Embeddings with Large Language Models', 'url': 'https://huggingface.co/papers/2501.00874', 'abstract': "Recent advancements in large language models (LLMs) based embedding models have established new state-of-the-art benchmarks for text embedding tasks, particularly in dense vector-based retrieval. However, these models predominantly focus on English, leaving multilingual embedding capabilities largely unexplored. To address this limitation, we present LUSIFER, a novel zero-shot approach that adapts LLM-based embedding models for multilingual tasks without requiring multilingual supervision. LUSIFER's architecture combines a multilingual encoder, serving as a language-universal learner, with an LLM-based embedding model optimized for embedding-specific tasks. These components are seamlessly integrated through a minimal set of trainable parameters that act as a connector, effectively transferring the multilingual encoder's language understanding capabilities to the specialized embedding model. Additionally, to comprehensively evaluate multilingual embedding performance, we introduce a new benchmark encompassing 5 primary embedding tasks, 123 diverse datasets, and coverage across 14 languages. Extensive experimental results demonstrate that LUSIFER significantly enhances the multilingual performance across various embedding tasks, particularly for medium and low-resource languages, without requiring explicit multilingual training data.", 'score': 7, 'issue_id': 1507, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': '5bdfec436923a2a6', 'authors': ['Hieu Man', 'Nghia Trung Ngo', 'Viet Dac Lai', 'Ryan A. Rossi', 'Franck Dernoncourt', 'Thien Huu Nguyen'], 'affiliations': ['Adobe Research, USA', 'Dept. of Computer Science, University of Oregon, OR, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.00874.jpg', 'data': {'categories': ['#transfer_learning', '#architecture', '#benchmark', '#multilingual', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Универсальные многоязычные эмбеддинги без многоязычного обучения', 'desc': 'LUSIFER - это новый подход к созданию многоязычных эмбеддингов без использования многоязычных обучающих данных. Он объединяет многоязычный энкодер и LLM-модель для эмбеддингов через набор обучаемых параметров. Авторы также представили новый бенчмарк для оценки качества многоязычных эмбеддингов, охватывающий 5 основных задач, 123 датасета и 14 языков. 
Эксперименты показали, что LUSIFER значительно улучшает многоязычную производительность, особенно для языков с ограниченными ресурсами.'}, 'en': {'title': 'LUSIFER: Bridging Multilingual Gaps in Text Embedding', 'desc': "This paper introduces LUSIFER, a new method that enhances large language models (LLMs) for multilingual text embedding tasks. Unlike existing models that mainly focus on English, LUSIFER uses a zero-shot approach to adapt LLMs for multiple languages without needing multilingual training data. It combines a multilingual encoder with an LLM-based embedding model, allowing for effective language understanding and embedding performance. The authors also present a comprehensive benchmark to evaluate LUSIFER's performance across various languages and tasks, showing significant improvements, especially for less-resourced languages."}, 'zh': {'title': 'LUSIFER:无监督多语言嵌入的新突破', 'desc': '最近,大型语言模型(LLMs)在文本嵌入任务中取得了新的突破,尤其是在基于密集向量的检索方面。然而,这些模型主要集中在英语上,导致多语言嵌入能力尚未得到充分探索。为了解决这个问题,我们提出了LUSIFER,这是一种新颖的零样本方法,可以在不需要多语言监督的情况下,将LLM嵌入模型适应于多语言任务。LUSIFER的架构结合了一个多语言编码器和一个针对嵌入特定任务优化的LLM嵌入模型,通过一组最小的可训练参数实现无缝连接,有效地将多语言编码器的语言理解能力转移到专门的嵌入模型上。'}}}, {'id': 'https://huggingface.co/papers/2501.01540', 'title': 'BoxingGym: Benchmarking Progress in Automated Experimental Design and Model Discovery', 'url': 'https://huggingface.co/papers/2501.01540', 'abstract': "Understanding the world and explaining it with scientific theories is a central aspiration of artificial intelligence research. Proposing theories, designing experiments to test them, and then revising them based on data are fundamental to scientific discovery. Despite the significant promise of LLM-based scientific agents, no benchmarks systematically test LLM's ability to propose scientific models, collect experimental data, and revise them in light of new data. We introduce BoxingGym, a benchmark with 10 environments for systematically evaluating both experimental design (e.g. collecting data to test a scientific theory) and model discovery (e.g. proposing and revising scientific theories). To enable tractable and quantitative evaluation, we implement each environment as a generative probabilistic model with which a scientific agent can run interactive experiments. These probabilistic models are drawn from various real-world scientific domains ranging from psychology to ecology. To quantitatively evaluate a scientific agent's ability to collect informative experimental data, we compute the expected information gain (EIG), an information-theoretic quantity which measures how much an experiment reduces uncertainty about the parameters of a generative model. A good scientific theory is a concise and predictive explanation. Therefore, to quantitatively evaluate model discovery, we ask a scientific agent to explain their model and then assess whether this explanation enables another scientific agent to make reliable predictions about this environment. In addition to this explanation-based evaluation, we compute standard model evaluation metrics such as prediction errors. We find that current LLMs, such as GPT-4o, struggle with both experimental design and model discovery. We find that augmenting the LLM-based agent with an explicit statistical model does not reliably improve these results.", 'score': 4, 'issue_id': 1510, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '0f853b1681ef29b5', 'authors': ['Kanishk Gandhi', 'Michael Y. 
Li', 'Lyle Goodyear', 'Louise Li', 'Aditi Bhaskar', 'Mohammed Zaman', 'Noah D. Goodman'], 'affiliations': ['Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01540.jpg', 'data': {'categories': ['#benchmark', '#data', '#science', '#agents'], 'emoji': '🧪', 'ru': {'title': 'BoxingGym: новый вызов для ИИ в научном моделировании', 'desc': 'Статья представляет новый бенчмарк BoxingGym для оценки способности языковых моделей (LLM) к научному открытию. Бенчмарк включает 10 сред, моделирующих различные научные области, и позволяет тестировать планирование экспериментов и построение теорий. Для оценки качества экспериментов используется ожидаемый прирост информации (EIG), а для оценки теорий - их способность объяснять и предсказывать. Результаты показывают, что современные LLM, включая GPT-4, пока слабо справляются с этими задачами.'}, 'en': {'title': 'BoxingGym: Evaluating LLMs in Scientific Discovery', 'desc': 'This paper introduces BoxingGym, a benchmark designed to evaluate the capabilities of large language models (LLMs) in scientific discovery tasks. It focuses on two main aspects: experimental design, which involves collecting data to test scientific theories, and model discovery, which includes proposing and revising these theories. The benchmark consists of 10 environments modeled as generative probabilistic models from various scientific fields, allowing for interactive experimentation. The study finds that current LLMs, like GPT-4o, face challenges in both areas, and adding a statistical model does not consistently enhance their performance.'}, 'zh': {'title': '评估人工智能在科学研究中的能力', 'desc': '这篇论文探讨了人工智能在科学研究中的应用,特别是大型语言模型(LLM)在提出科学理论和设计实验方面的能力。作者提出了一个名为BoxingGym的基准测试,包含10个环境,用于系统评估实验设计和模型发现的能力。通过计算期望信息增益(EIG),论文量化了科学代理收集实验数据的有效性,并评估其提出的模型是否能进行可靠预测。研究发现,当前的LLM在实验设计和模型发现方面表现不佳,且简单地增加统计模型并未显著改善结果。'}}}, {'id': 'https://huggingface.co/papers/2501.00958', 'title': '2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining', 'url': 'https://huggingface.co/papers/2501.00958', 'abstract': 'Compared to image-text pair data, interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. However, such existing datasets are crawled from webpage, facing challenges like low knowledge density, loose image-text relations, and poor logical coherence between images. On the other hand, the internet hosts vast instructional videos (e.g., online geometry courses) that are widely used by humans to learn foundational subjects, yet these valuable resources remain underexplored in VLM training. In this paper, we introduce a high-quality multimodal textbook corpus with richer foundational knowledge for VLM pretraining. It collects over 2.5 years of instructional videos, totaling 22,000 class hours. We first use an LLM-proposed taxonomy to systematically gather instructional videos. Then we progressively extract and refine visual (keyframes), audio (ASR), and textual knowledge (OCR) from the videos, and organize as an image-text interleaved corpus based on temporal order. Compared to its counterparts, our video-centric textbook offers more coherent context, richer knowledge, and better image-text alignment. Experiments demonstrate its superb pretraining performance, particularly in knowledge- and reasoning-intensive tasks like ScienceQA and MathVista. 
Moreover, VLMs pre-trained on our textbook exhibit outstanding interleaved context awareness, leveraging visual and textual cues in their few-shot context for task solving. Our code is available at https://github.com/DAMO-NLP-SG/multimodal_textbook.', 'score': 68, 'issue_id': 1475, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'b10f0cd62f6334fc', 'authors': ['Wenqi Zhang', 'Hang Zhang', 'Xin Li', 'Jiashuo Sun', 'Yongliang Shen', 'Weiming Lu', 'Deli Zhao', 'Yueting Zhuang', 'Lidong Bing'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University', 'DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.00958.jpg', 'data': {'categories': ['#science', '#dataset', '#reasoning', '#multimodal', '#cv', '#video'], 'emoji': '📚', 'ru': {'title': 'Мультимодальный учебник: новый стандарт для обучения VLM', 'desc': 'Эта статья представляет новый подход к обучению моделей компьютерного зрения и обработки естественного языка (VLM) с использованием мультимодального учебного корпуса. Авторы создали базу данных из 22 000 часов обучающих видео, систематически собранных с помощью таксономии, предложенной языковой моделью (LLM). Этот корпус отличается более высокой плотностью знаний, лучшей связью между изображениями и текстом, а также логической согласованностью по сравнению с существующими наборами данных. Эксперименты показывают превосходную производительность предобучения на этом корпусе, особенно в задачах, требующих глубоких знаний и рассуждений.'}, 'en': {'title': 'Harnessing Instructional Videos for Superior Vision-Language Model Training', 'desc': 'This paper presents a new approach to training Vision-Language Models (VLMs) using a multimodal textbook corpus derived from instructional videos. Unlike traditional datasets that often suffer from low knowledge density and weak image-text relationships, this corpus offers a richer and more coherent context for VLM pretraining. The authors systematically extract visual, audio, and textual information from over 22,000 hours of instructional content, enhancing the alignment between images and text. Experiments show that VLMs trained on this video-centric dataset perform significantly better on knowledge-intensive tasks, demonstrating improved reasoning and context awareness.'}, 'zh': {'title': '视频教材：提升视觉语言模型的知识与推理能力', 'desc': '本文提出了一种高质量的多模态教材语料库,旨在为视觉语言模型(VLM)提供更丰富的基础知识。该语料库收集了超过2.5年的教学视频,总计22,000小时,系统性地提取了视频中的视觉、音频和文本知识。与现有的数据集相比,这种视频中心的教材提供了更连贯的上下文、更丰富的知识和更好的图像-文本对齐。实验结果表明,基于该教材预训练的VLM在知识和推理密集型任务中表现优异,尤其在ScienceQA和MathVista等任务中。'}}}, {'id': 'https://huggingface.co/papers/2501.01427', 'title': 'VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control', 'url': 'https://huggingface.co/papers/2501.01427', 'abstract': 'Despite significant advancements in video generation, inserting a given object into videos remains a challenging task. The difficulty lies in preserving the appearance details of the reference object and accurately modeling coherent motions at the same time. In this paper, we propose VideoAnydoor, a zero-shot video object insertion framework with high-fidelity detail preservation and precise motion control. Starting from a text-to-video model, we utilize an ID extractor to inject the global identity and leverage a box sequence to control the overall motion. To preserve the detailed appearance and meanwhile support fine-grained motion control, we design a pixel warper. 
It takes the reference image with arbitrary key-points and the corresponding key-point trajectories as inputs. It warps the pixel details according to the trajectories and fuses the warped features with the diffusion U-Net, thus improving detail preservation and supporting users in manipulating the motion trajectories. In addition, we propose a training strategy involving both videos and static images with a reweight reconstruction loss to enhance insertion quality. VideoAnydoor demonstrates significant superiority over existing methods and naturally supports various downstream applications (e.g., talking head generation, video virtual try-on, multi-region editing) without task-specific fine-tuning.', 'score': 39, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '4c67f688775a3eca', 'authors': ['Yuanpeng Tu', 'Hao Luo', 'Xi Chen', 'Sihui Ji', 'Xiang Bai', 'Hengshuang Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group', 'HUST', 'Hupan Lab', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.01427.jpg', 'data': {'categories': ['#diffusion', '#games', '#video'], 'emoji': '🎬', 'ru': {'title': 'Точная вставка объектов в видео с сохранением деталей', 'desc': 'В этой статье представлен VideoAnydoor - фреймворк для вставки объектов в видео без предварительного обучения. Он использует экстрактор идентификаторов и последовательность ограничивающих рамок для контроля движения объекта. Ключевым компонентом является пиксельный варпер, который сохраняет детали внешнего вида и позволяет точно управлять движением. Предложенная стратегия обучения с использованием видео и статических изображений улучшает качество вставки объектов.'}, 'en': {'title': 'Seamless Object Insertion in Videos with VideoAnydoor', 'desc': 'This paper introduces VideoAnydoor, a novel framework for zero-shot video object insertion that excels in maintaining high-fidelity details and precise motion control. The approach begins with a text-to-video model and incorporates an ID extractor to ensure consistent object identity while using a box sequence for motion management. A key innovation is the pixel warper, which adjusts pixel details based on key-point trajectories, enhancing both detail preservation and user control over motion. The proposed training strategy, which combines videos and static images with a reweighted reconstruction loss, significantly improves the quality of object insertion, making VideoAnydoor versatile for various applications without needing specific fine-tuning.'}, 'zh': {'title': '高保真视频对象插入的新突破', 'desc': '尽管视频生成技术取得了显著进展,但将特定对象插入视频仍然是一项具有挑战性的任务。本文提出了VideoAnydoor,这是一个零-shot视频对象插入框架,能够高保真地保留细节并精确控制运动。我们设计了一种像素变形器,能够根据关键点轨迹扭曲像素细节,并与扩散U-Net融合,从而提高细节保留能力。VideoAnydoor在现有方法中表现出显著优势,并支持多种下游应用,无需特定任务的微调。'}}}, {'id': 'https://huggingface.co/papers/2501.01257', 'title': 'CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings', 'url': 'https://huggingface.co/papers/2501.01257', 'abstract': 'With the increasing code reasoning capabilities of existing large language models (LLMs) and breakthroughs in reasoning models like OpenAI o1 and o3, there is a growing need to develop more challenging and comprehensive benchmarks that effectively test their sophisticated competition-level coding abilities. Existing benchmarks, like LiveCodeBench and USACO, fall short due to the unavailability of private test cases, lack of support for special judges, and misaligned execution environments. 
To bridge this gap, we introduce CodeElo, a standardized competition-level code generation benchmark that effectively addresses all these challenges for the first time. CodeElo benchmark is mainly based on the official CodeForces platform and tries to align with the platform as much as possible. We compile the recent six months of contest problems on CodeForces with detailed information such as contest divisions, problem difficulty ratings, and problem algorithm tags. We introduce a unique judging method in which problems are submitted directly to the platform and develop a reliable Elo rating calculation system that aligns with the platform and is comparable with human participants but has lower variance. By testing on our CodeElo, we provide the Elo ratings of 30 existing popular open-source and 3 proprietary LLMs for the first time. The results show that o1-mini and QwQ-32B-Preview stand out significantly, achieving Elo ratings of 1578 and 1261, respectively, while other models struggle even with the easiest problems, placing in the lowest 20 percent among all human participants. Detailed analysis experiments are also conducted to provide insights into performance across algorithms and comparisons between using C++ and Python, which can suggest directions for future studies.', 'score': 36, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'e31430bb6ba5dfc8', 'authors': ['Shanghaoran Quan', 'Jiaxi Yang', 'Bowen Yu', 'Bo Zheng', 'Dayiheng Liu', 'An Yang', 'Xuancheng Ren', 'Bofei Gao', 'Yibo Miao', 'Yunlong Feng', 'Zekun Wang', 'Jian Yang', 'Zeyu Cui', 'Yang Fan', 'Yichang Zhang', 'Binyuan Hui', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.01257.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#reasoning', '#optimization', '#open_source'], 'emoji': '🏆', 'ru': {'title': 'CodeElo: новый стандарт оценки LLM в соревновательном программировании', 'desc': 'Статья представляет новый бенчмарк CodeElo для оценки способностей больших языковых моделей (LLM) в решении задач по программированию соревновательного уровня. CodeElo основан на платформе CodeForces и включает проблемы с детальной информацией о сложности и алгоритмических тегах. Авторы разработали систему расчета рейтинга Эло, сопоставимую с рейтингами человеческих участников. Результаты тестирования 33 LLM показали, что модели o1-mini и QwQ-32B-Preview значительно превосходят остальные, достигая рейтингов 1578 и 1261 соответственно.'}, 'en': {'title': 'CodeElo: Elevating Code Generation Benchmarks for LLMs', 'desc': 'This paper presents CodeElo, a new benchmark designed to evaluate the coding abilities of large language models (LLMs) in a competitive setting. Unlike existing benchmarks, CodeElo addresses limitations such as the lack of private test cases and misaligned execution environments by utilizing the CodeForces platform. The benchmark includes a unique judging method and an Elo rating system that allows for fair comparisons between LLMs and human participants. 
Results indicate that certain models, like o1-mini, perform significantly better than others, highlighting the varying capabilities of LLMs in code generation tasks.'}, 'zh': {'title': 'CodeElo:提升代码生成能力的标准化基准测试', 'desc': '随着大型语言模型(LLMs)在代码推理能力上的提升,开发更具挑战性和全面性的基准测试变得愈发重要。现有的基准测试如LiveCodeBench和USACO存在一些不足,例如缺乏私有测试用例和特殊评判支持。为了解决这些问题,我们提出了CodeElo,这是一个标准化的竞赛级代码生成基准,首次有效应对这些挑战。通过在CodeForces平台上编译最近六个月的竞赛问题,我们为30个流行的开源和3个专有LLMs提供了Elo评分,结果显示o1-mini和QwQ-32B-Preview表现突出。'}}}, {'id': 'https://huggingface.co/papers/2501.00599', 'title': 'VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM', 'url': 'https://huggingface.co/papers/2501.00599', 'abstract': 'Video Large Language Models (Video LLMs) have recently exhibited remarkable capabilities in general video understanding. However, they mainly focus on holistic comprehension and struggle with capturing fine-grained spatial and temporal details. Besides, the lack of high-quality object-level video instruction data and a comprehensive benchmark further hinders their advancements. To tackle these challenges, we introduce the VideoRefer Suite to empower Video LLM for finer-level spatial-temporal video understanding, i.e., enabling perception and reasoning on any objects throughout the video. Specially, we thoroughly develop VideoRefer Suite across three essential aspects: dataset, model, and benchmark. Firstly, we introduce a multi-agent data engine to meticulously curate a large-scale, high-quality object-level video instruction dataset, termed VideoRefer-700K. Next, we present the VideoRefer model, which equips a versatile spatial-temporal object encoder to capture precise regional and sequential representations. Finally, we meticulously create a VideoRefer-Bench to comprehensively assess the spatial-temporal understanding capability of a Video LLM, evaluating it across various aspects. Extensive experiments and analyses demonstrate that our VideoRefer model not only achieves promising performance on video referring benchmarks but also facilitates general video understanding capabilities.', 'score': 31, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'daee687ce36ef3db', 'authors': ['Yuqian Yuan', 'Hang Zhang', 'Wentong Li', 'Zesen Cheng', 'Boqiang Zhang', 'Long Li', 'Xin Li', 'Deli Zhao', 'Wenqiao Zhang', 'Yueting Zhuang', 'Jianke Zhu', 'Lidong Bing'], 'affiliations': ['DAMO Academy, Alibaba Group', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00599.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#dataset', '#optimization', '#video'], 'emoji': '🎥', 'ru': {'title': 'Точное пространственно-временное понимание видео с помощью VideoRefer Suite', 'desc': 'Статья представляет VideoRefer Suite - комплексный подход к улучшению пространственно-временного понимания видео большими языковыми моделями. Авторы разработали масштабный набор данных VideoRefer-700K с инструкциями на уровне объектов, созданный с помощью мультиагентного движка. Они также представили модель VideoRefer с универсальным пространственно-временным кодировщиком объектов. 
Для оценки возможностей видео-LLM был создан бенчмарк VideoRefer-Bench, охватывающий различные аспекты понимания видео.'}, 'en': {'title': 'Empowering Video LLMs for Fine-Grained Understanding', 'desc': 'This paper introduces the VideoRefer Suite, which enhances Video Large Language Models (Video LLMs) for better understanding of videos by focusing on fine-grained spatial and temporal details. It addresses the limitations of existing models that primarily focus on overall comprehension and lack high-quality object-level instruction data. The suite includes a new dataset called VideoRefer-700K, a specialized VideoRefer model with a spatial-temporal object encoder, and a benchmark for evaluating video understanding capabilities. Experimental results show that the VideoRefer model significantly improves performance on video referring tasks while also enhancing general video comprehension.'}, 'zh': {'title': '提升视频理解,细致捕捉空间与时间', 'desc': '视频大型语言模型(Video LLMs)在视频理解方面展现了出色的能力,但在捕捉细粒度的空间和时间细节上存在困难。为了应对这些挑战,我们提出了VideoRefer Suite,以增强视频LLM在空间-时间视频理解方面的能力。我们开发了一个多代理数据引擎,创建了一个高质量的对象级视频指令数据集VideoRefer-700K,并提出了VideoRefer模型,配备了多功能的空间-时间对象编码器。最后,我们创建了VideoRefer-Bench,以全面评估视频LLM的空间-时间理解能力,实验结果表明我们的模型在视频引用基准上表现优异。'}}}, {'id': 'https://huggingface.co/papers/2501.01423', 'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models', 'url': 'https://huggingface.co/papers/2501.01423', 'abstract': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. However, recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality, it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently, existing systems often settle for sub-optimal solutions, either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this, we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models, enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE, we build an enhanced DiT baseline with improved training strategies and architecture designs, termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. 
Models and codes are available at: https://github.com/hustvl/LightningDiT.', 'score': 30, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '173fa21b6e47d04c', 'authors': ['Jingfeng Yao', 'Xinggang Wang'], 'affiliations': ['Huazhong University of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.01423.jpg', 'data': {'categories': ['#training', '#optimization', '#cv', '#architecture', '#diffusion'], 'emoji': '⚡', 'ru': {'title': 'Революция в латентных диффузионных моделях: быстрее, лучше, эффективнее', 'desc': 'Статья представляет новый подход к улучшению латентных диффузионных моделей с архитектурой Трансформер для генерации изображений высокого качества. Авторы предлагают метод VA-VAE, который выравнивает латентное пространство с предобученными моделями компьютерного зрения. Это позволяет значительно расширить границы реконструкции-генерации и ускорить сходимость Диффузионных Трансформеров в высокоразмерных латентных пространствах. На основе VA-VAE авторы создали улучшенную модель LightningDiT, достигающую современного уровня производительности на задаче генерации изображений ImageNet 256x256.'}, 'en': {'title': 'Accelerating Image Generation with Aligned Latent Spaces', 'desc': 'This paper discusses the challenges faced by latent diffusion models, particularly when using Transformer architectures for image generation. It highlights an optimization issue where increasing the feature dimensions in visual tokenizers can lead to larger models and longer training times, often resulting in sub-optimal image quality. The authors propose a solution by aligning the latent space with pre-trained vision models, introducing a new framework called VA-VAE to enhance the training process. Their improved model, LightningDiT, achieves state-of-the-art performance in image generation while significantly speeding up the training process.'}, 'zh': {'title': '优化潜在扩散模型,提升图像生成效率', 'desc': '本论文探讨了潜在扩散模型与变换器架构在生成高质量图像时的优化困境。研究表明,虽然增加视觉标记器中的每个标记特征维度可以提高重建质量,但这也导致需要更大的扩散模型和更多的训练迭代。为了解决这一问题,作者提出将潜在空间与预训练的视觉基础模型对齐,从而提高训练效率。最终,提出的VA-VAE模型显著提升了潜在扩散模型的重建生成能力,并在ImageNet数据集上实现了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00103', 'title': 'LTX-Video: Realtime Video Latent Diffusion', 'url': 'https://huggingface.co/papers/2501.00103', 'abstract': "We introduce LTX-Video, a transformer-based latent diffusion model that adopts a holistic approach to video generation by seamlessly integrating the responsibilities of the Video-VAE and the denoising transformer. Unlike existing methods, which treat these components as independent, LTX-Video aims to optimize their interaction for improved efficiency and quality. At its core is a carefully designed Video-VAE that achieves a high compression ratio of 1:192, with spatiotemporal downscaling of 32 x 32 x 8 pixels per token, enabled by relocating the patchifying operation from the transformer's input to the VAE's input. Operating in this highly compressed latent space enables the transformer to efficiently perform full spatiotemporal self-attention, which is essential for generating high-resolution videos with temporal consistency. However, the high compression inherently limits the representation of fine details. To address this, our VAE decoder is tasked with both latent-to-pixel conversion and the final denoising step, producing the clean result directly in pixel space. 
This approach preserves the ability to generate fine details without incurring the runtime cost of a separate upsampling module. Our model supports diverse use cases, including text-to-video and image-to-video generation, with both capabilities trained simultaneously. It achieves faster-than-real-time generation, producing 5 seconds of 24 fps video at 768x512 resolution in just 2 seconds on an Nvidia H100 GPU, outperforming all existing models of similar scale. The source code and pre-trained models are publicly available, setting a new benchmark for accessible and scalable video generation.", 'score': 29, 'issue_id': 1484, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'a2358f7cf156ff08', 'authors': ['Yoav HaCohen', 'Nisan Chiprut', 'Benny Brazowski', 'Daniel Shalem', 'Dudu Moshe', 'Eitan Richardson', 'Eran Levin', 'Guy Shiran', 'Nir Zabari', 'Ori Gordon', 'Poriya Panet', 'Sapir Weissbuch', 'Victor Kulikov', 'Yaki Bitterman', 'Zeev Melumian', 'Ofir Bibi'], 'affiliations': ['Lightricks'], 'pdf_title_img': 'assets/pdf/title_img/2501.00103.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#video', '#diffusion'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: быстрее реального времени', 'desc': 'LTX-Video - это трансформерная модель латентной диффузии для генерации видео. Она объединяет функции Video-VAE и шумоподавляющего трансформера, оптимизируя их взаимодействие. Модель использует сильно сжатое латентное пространство, позволяя трансформеру эффективно выполнять полное пространственно-временное самовнимание. LTX-Video поддерживает генерацию видео из текста и изображений, превосходя существующие модели по скорости и качеству.'}, 'en': {'title': 'Revolutionizing Video Generation with LTX-Video', 'desc': "LTX-Video is a novel transformer-based latent diffusion model designed for efficient video generation by integrating the roles of Video-VAE and denoising transformers. It achieves a high compression ratio of 1:192, allowing the model to operate in a compressed latent space while maintaining spatiotemporal self-attention for generating high-resolution videos. The model's VAE decoder performs both latent-to-pixel conversion and denoising, enabling the generation of fine details without the need for a separate upsampling module. With capabilities for text-to-video and image-to-video generation, LTX-Video produces videos faster than real-time, setting a new standard in the field."}, 'zh': {'title': 'LTX-Video:高效视频生成的新标准', 'desc': 'LTX-Video是一种基于变换器的潜在扩散模型,旨在通过整合视频生成中的Video-VAE和去噪变换器的功能来提高效率和质量。该模型的核心是一个高压缩比的Video-VAE,能够在压缩的潜在空间中高效执行时空自注意力,从而生成高分辨率且具有时间一致性的视频。为了克服高压缩带来的细节损失,VAE解码器同时负责潜在到像素的转换和最终的去噪步骤,直接在像素空间中生成清晰的结果。LTX-Video支持多种应用场景,包括文本到视频和图像到视频的生成,并且在Nvidia H100 GPU上以超实时速度生成视频,设立了视频生成的新基准。'}}}, {'id': 'https://huggingface.co/papers/2501.01264', 'title': 'ProgCo: Program Helps Self-Correction of Large Language Models', 'url': 'https://huggingface.co/papers/2501.01264', 'abstract': 'Self-Correction aims to enable large language models (LLMs) to self-verify and self-refine their initial responses without external feedback. However, LLMs often fail to effectively self-verify and generate correct feedback, further misleading refinement and leading to the failure of self-correction, especially in complex reasoning tasks. In this paper, we propose Program-driven Self-Correction (ProgCo). 
First, program-driven verification (ProgVe) achieves complex verification logic and extensive validation through self-generated, self-executing verification pseudo-programs. Then, program-driven refinement (ProgRe) receives feedback from ProgVe, conducts dual reflection and refinement on both responses and verification programs to mitigate the misleading effect of incorrect feedback in complex reasoning tasks. Experiments on three instruction-following and mathematical benchmarks indicate that ProgCo achieves effective self-correction, and can further enhance performance when combined with real program tools.', 'score': 22, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'bda3f96e83319526', 'authors': ['Xiaoshuai Song', 'Yanan Wu', 'Weixun Wang', 'Jiaheng Liu', 'Wenbo Su', 'Bo Zheng'], 'affiliations': ['Taobao & Tmall Group of Alibaba'], 'pdf_title_img': 'assets/pdf/title_img/2501.01264.jpg', 'data': {'categories': ['#training', '#math', '#reasoning', '#interpretability', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'ProgCo: Самокоррекция языковых моделей через программно-управляемую верификацию и уточнение', 'desc': 'Эта статья представляет новый подход к самокоррекции больших языковых моделей (LLM) под названием Program-driven Self-Correction (ProgCo). Метод включает в себя программно-управляемую верификацию (ProgVe), которая использует самогенерируемые и самовыполняющиеся псевдопрограммы для сложной логики проверки. Затем программно-управляемое уточнение (ProgRe) проводит двойную рефлексию и улучшение как ответов, так и программ верификации. Эксперименты показали, что ProgCo эффективен в самокоррекции и может дополнительно улучшить производительность при комбинировании с реальными программными инструментами.'}, 'en': {'title': 'Empowering LLMs with Program-Driven Self-Correction', 'desc': 'This paper introduces Program-driven Self-Correction (ProgCo) to improve the self-verification and self-refinement capabilities of large language models (LLMs). It addresses the common issue where LLMs struggle to provide accurate feedback, which can lead to incorrect refinements, particularly in complex reasoning tasks. ProgCo utilizes program-driven verification (ProgVe) to create self-executing verification pseudo-programs that enhance the verification process. Additionally, program-driven refinement (ProgRe) allows the model to reflect on and refine both its responses and the verification programs, leading to more reliable self-correction outcomes.'}, 'zh': {'title': '基于程序的自我纠正：提升语言模型的自我验证能力', 'desc': '自我纠正旨在使大型语言模型(LLMs)能够在没有外部反馈的情况下自我验证和自我完善其初始响应。然而,LLMs往往无法有效自我验证并生成正确的反馈,这会进一步误导其完善过程,尤其是在复杂推理任务中。本文提出了基于程序的自我纠正(ProgCo),通过自生成、自执行的验证伪程序实现复杂的验证逻辑和广泛的验证。实验结果表明,ProgCo在三个指令遵循和数学基准测试中实现了有效的自我纠正,并且与真实程序工具结合时可以进一步提升性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00316', 'title': 'MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models', 'url': 'https://huggingface.co/papers/2501.00316', 'abstract': "Recent advancements in foundation models have enhanced AI systems' capabilities in autonomous tool usage and reasoning. However, their ability in location or map-based reasoning - which improves daily life by optimizing navigation, facilitating resource discovery, and streamlining logistics - has not been systematically studied. To bridge this gap, we introduce MapEval, a benchmark designed to assess diverse and complex map-based user queries with geo-spatial reasoning. 
MapEval features three task types (textual, API-based, and visual) that require collecting world information via map tools, processing heterogeneous geo-spatial contexts (e.g., named entities, travel distances, user reviews or ratings, images), and compositional reasoning, which all state-of-the-art foundation models find challenging. Comprising 700 unique multiple-choice questions about locations across 180 cities and 54 countries, MapEval evaluates foundation models' ability to handle spatial relationships, map infographics, travel planning, and navigation challenges. Using MapEval, we conducted a comprehensive evaluation of 28 prominent foundation models. While no single model excelled across all tasks, Claude-3.5-Sonnet, GPT-4o, and Gemini-1.5-Pro achieved competitive performance overall. However, substantial performance gaps emerged, particularly in MapEval, where agents with Claude-3.5-Sonnet outperformed GPT-4o and Gemini-1.5-Pro by 16% and 21%, respectively, and the gaps became even more amplified when compared to open-source LLMs. Our detailed analyses provide insights into the strengths and weaknesses of current models, though all models still fall short of human performance by more than 20% on average, struggling with complex map images and rigorous geo-spatial reasoning. This gap highlights MapEval's critical role in advancing general-purpose foundation models with stronger geo-spatial understanding.", 'score': 20, 'issue_id': 1477, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'a4e45c6bd9d30ff4', 'authors': ['Mahir Labib Dihan', 'Md Tanvir Hassan', 'Md Tanvir Parvez', 'Md Hasebul Hasan', 'Md Almash Alam', 'Muhammad Aamir Cheema', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Bangladesh Computer Council (BCC)', 'Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Monash University', 'Qatar Computing Research Institute (QCRI)', 'Statistics, Islamic University Bangladesh'], 'pdf_title_img': 'assets/pdf/title_img/2501.00316.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#multimodal', '#survey'], 'emoji': '🗺️', 'ru': {'title': 'MapEval: Новый рубеж в геопространственном ИИ', 'desc': 'Статья представляет MapEval - новый бенчмарк для оценки способностей моделей искусственного интеллекта в области пространственных рассуждений и работы с картами. MapEval включает 700 вопросов с множественным выбором, охватывающих 180 городов и 54 страны, и оценивает навыки моделей в понимании пространственных отношений, инфографики карт, планирования путешествий и навигации. Авторы провели оценку 28 ведущих фундаментальных моделей, выявив значительные различия в производительности, при этом все модели все еще отстают от человеческого уровня более чем на 20%. Результаты исследования подчеркивают важность MapEval для развития моделей с более сильным геопространственным пониманием.'}, 'en': {'title': "Enhancing AI's Geo-Spatial Reasoning with MapEval", 'desc': 'This paper introduces MapEval, a benchmark designed to evaluate the performance of foundation models in map-based reasoning tasks. It focuses on assessing how well these models can handle complex geo-spatial queries, which are essential for navigation and resource discovery. The benchmark includes various task types that require models to process diverse information, such as travel distances and user reviews, and perform compositional reasoning. 
The evaluation reveals that while some models perform competitively, they still lag behind human capabilities, indicating a need for further advancements in geo-spatial understanding within AI systems.'}, 'zh': {'title': '提升地图推理能力的基准评估', 'desc': '最近基础模型的进展提升了人工智能系统在自主工具使用和推理方面的能力。然而,它们在基于位置或地图的推理能力上尚未得到系统研究,这对于优化导航、资源发现和物流管理至关重要。为了解决这个问题,我们引入了MapEval,一个旨在评估复杂地图用户查询的基准,涉及地理空间推理。MapEval包含700个关于180个城市和54个国家的独特多项选择题,评估基础模型在处理空间关系、地图信息、旅行规划和导航挑战方面的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.01149', 'title': 'A3: Android Agent Arena for Mobile GUI Agents', 'url': 'https://huggingface.co/papers/2501.01149', 'abstract': 'AI agents have become increasingly prevalent in recent years, driven by significant advancements in the field of large language models (LLMs). Mobile GUI agents, a subset of AI agents, are designed to autonomously perform tasks on mobile devices. While numerous studies have introduced agents, datasets, and benchmarks to advance mobile GUI agent research, many existing datasets focus on static frame evaluations and fail to provide a comprehensive platform for assessing performance on real-world, in-the-wild tasks. To address this gap, we present Android Agent Arena (A3), a novel evaluation platform. Unlike existing in-the-wild systems, A3 offers: (1) meaningful and practical tasks, such as real-time online information retrieval and operational instructions; (2) a larger, more flexible action space, enabling compatibility with agents trained on any dataset; and (3) automated business-level LLM-based evaluation process. A3 includes 21 widely used general third-party apps and 201 tasks representative of common user scenarios, providing a robust foundation for evaluating mobile GUI agents in real-world situations and a new autonomous evaluation process for less human labor and coding expertise. The project is available at https://yuxiangchai.github.io/Android-Agent-Arena/.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '050f155aa526c100', 'authors': ['Yuxiang Chai', 'Hanhao Li', 'Jiayu Zhang', 'Liang Liu', 'Guozhi Wang', 'Shuai Ren', 'Siyuan Huang', 'Hongsheng Li'], 'affiliations': ['EE department @ CUHK', 'MMLab @ CUHK'], 'pdf_title_img': 'assets/pdf/title_img/2501.01149.jpg', 'data': {'categories': ['#benchmark', '#dataset', '#agents'], 'emoji': '🤖', 'ru': {'title': 'A3: Арена для тестирования мобильных AI-агентов в реальном мире', 'desc': 'Статья представляет новую платформу для оценки мобильных GUI-агентов под названием Android Agent Arena (A3). A3 предлагает реалистичные задачи, широкое пространство действий и автоматизированную оценку на основе больших языковых моделей. Платформа включает 21 популярное стороннее приложение и 201 задачу, отражающую типичные пользовательские сценарии. A3 позволяет оценивать производительность агентов в реальных условиях, что отличает её от существующих статических наборов данных.'}, 'en': {'title': 'Revolutionizing Mobile GUI Agent Evaluation with A3', 'desc': 'This paper introduces the Android Agent Arena (A3), a new evaluation platform for mobile GUI agents that addresses limitations in existing datasets. A3 focuses on real-world tasks, providing a larger action space that accommodates agents trained on various datasets. It features 21 popular third-party apps and 201 tasks that reflect common user scenarios, enhancing the assessment of agent performance. 
Additionally, A3 incorporates an automated evaluation process using large language models, reducing the need for extensive human involvement and coding skills.'}, 'zh': {'title': 'Android Agent Arena:移动GUI代理的新评估平台', 'desc': '近年来,人工智能代理的应用越来越广泛,尤其是在大型语言模型(LLMs)领域的进步推动下。移动图形用户界面(GUI)代理是人工智能代理的一种,旨在自主执行移动设备上的任务。现有的研究虽然提出了许多代理、数据集和基准,但大多数数据集仅关注静态框架评估,无法全面评估真实世界中的任务表现。为了解决这一问题,我们提出了Android Agent Arena(A3),这是一个新颖的评估平台,提供了实际的任务和更灵活的操作空间,支持基于LLM的自动化评估过程。'}}}, {'id': 'https://huggingface.co/papers/2501.00192', 'title': 'MLLM-as-a-Judge for Image Safety without Human Labeling', 'url': 'https://huggingface.co/papers/2501.00192', 'abstract': 'Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which however brings a series of drawbacks. First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose a MLLM-based method includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experiment results demonstrate that our method is highly effective for zero-shot image safety judgment tasks.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '2a62bcbb87c1b7a5', 'authors': ['Zhenting Wang', 'Shuming Hu', 'Shiyu Zhao', 'Xiaowen Lin', 'Felix Juefei-Xu', 'Zhuowei Li', 'Ligong Han', 'Harihar Subramanyam', 'Li Chen', 'Jianfa Chen', 'Nan Jiang', 'Lingjuan Lyu', 'Shiqing Ma', 'Dimitris N. Metaxas', 'Ankit Jain'], 'affiliations': ['GenAI @ Meta', 'Rutgers University', 'UMass Amherst', 'Westlake University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00192.jpg', 'data': {'categories': ['#reasoning', '#training', '#ethics', '#cv', '#multimodal'], 'emoji': '🛡️', 'ru': {'title': 'Интеллектуальная защита: Zero-shot оценка безопасности изображений с помощью MLLM', 'desc': 'Статья представляет метод определения безопасности изображений с использованием мультимодальных больших языковых моделей (MLLM) в режиме zero-shot. 
Авторы предлагают подход, включающий объективизацию правил безопасности, оценку релевантности между правилами и изображениями, и быстрое принятие решений на основе дебиасированных вероятностей токенов. Метод также включает каскадные цепочки рассуждений для более глубокого анализа при необходимости. Эксперименты показывают высокую эффективность предложенного метода для задач оценки безопасности изображений без предварительного обучения.'}, 'en': {'title': 'Zero-Shot Image Safety Detection with MLLMs', 'desc': 'This paper addresses the challenge of identifying unsafe images in the context of AI-generated content using Multimodal Large Language Models (MLLMs). The authors propose a novel approach that allows for zero-shot detection of harmful images by utilizing predefined safety rules without the need for extensive human labeling. They highlight the limitations of traditional methods, such as the subjectivity of safety rules and the biases present in models. The proposed method enhances safety judgment by objectifying rules, assessing their relevance to images, and employing a reasoning process that simplifies complex safety guidelines.'}, 'zh': {'title': '利用MLLMs实现零样本图像安全判断', 'desc': '随着在线平台视觉媒体的兴起,图像内容安全成为一个重要挑战。许多图像生成模型能够产生有害内容,因此识别不安全图像变得至关重要。我们提出了一种基于预训练多模态大语言模型(MLLMs)的方法,通过查询这些模型来检测不安全图像,而无需依赖人工标注。实验结果表明,我们的方法在零样本图像安全判断任务中非常有效。'}}}, {'id': 'https://huggingface.co/papers/2501.01426', 'title': 'Unifying Specialized Visual Encoders for Video Language Models', 'url': 'https://huggingface.co/papers/2501.01426', 'abstract': 'The recent advent of Large Language Models (LLMs) has ushered sophisticated reasoning capabilities into the realm of video through Video Large Language Models (VideoLLMs). However, VideoLLMs currently rely on a single vision encoder for all of their visual processing, which limits the amount and type of visual information that can be conveyed to the LLM. Our method, MERV, Multi-Encoder Representation of Videos, instead leverages multiple frozen visual encoders to create a unified representation of a video, providing the VideoLLM with a comprehensive set of specialized visual knowledge. Spatio-temporally aligning the features from each encoder allows us to tackle a wider range of open-ended and multiple-choice video understanding questions and outperform prior state-of-the-art works. MERV is up to 3.7% better in accuracy than Video-LLaVA across the standard suite video understanding benchmarks, while also having a better Video-ChatGPT score. We also improve upon SeViLA, the previous best on zero-shot Perception Test accuracy, by 2.2%. MERV introduces minimal extra parameters and trains faster than equivalent single-encoder methods while parallelizing the visual processing. Finally, we provide qualitative evidence that MERV successfully captures domain knowledge from each of its encoders. 
Our results offer promising directions in utilizing multiple vision encoders for comprehensive video understanding.', 'score': 19, 'issue_id': 1488, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'c868a7ebcbafa704', 'authors': ['Jihoon Chung', 'Tyler Zhu', 'Max Gonzalez Saez-Diez', 'Juan Carlos Niebles', 'Honglu Zhou', 'Olga Russakovsky'], 'affiliations': ['Princeton University', 'Salesforce Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.01426.jpg', 'data': {'categories': ['#architecture', '#reasoning', '#video', '#benchmark', '#multimodal', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'MERV: Многоэнкодерное представление видео для улучшенного машинного понимания', 'desc': 'Статья представляет MERV - новый метод для улучшения понимания видео с помощью больших языковых моделей. MERV использует несколько замороженных визуальных энкодеров для создания единого представления видео, что позволяет охватить больший объем визуальной информации. Этот подход превосходит предыдущие методы в точности на стандартных тестах понимания видео. MERV вводит минимальное количество дополнительных параметров и обучается быстрее, чем эквивалентные методы с одним энкодером.'}, 'en': {'title': 'Unlocking Video Understanding with Multi-Encoder Magic!', 'desc': 'This paper introduces MERV, a method that enhances Video Large Language Models (VideoLLMs) by using multiple visual encoders instead of just one. By combining the outputs of these encoders, MERV creates a richer representation of videos, which helps the model understand complex video content better. The approach allows for improved performance on various video understanding tasks, achieving higher accuracy than previous models. Additionally, MERV is efficient, requiring fewer parameters and training time while effectively leveraging the strengths of each encoder.'}, 'zh': {'title': '多编码器提升视频理解能力', 'desc': '本文介绍了一种名为MERV(多编码器视频表示)的方法,旨在提升视频理解的能力。MERV通过使用多个冻结的视觉编码器,创建视频的统一表示,从而为视频大型语言模型(VideoLLM)提供更全面的视觉知识。通过时空对齐每个编码器的特征,MERV能够更好地处理开放式和多选的视频理解问题,且在准确性上超越了之前的最佳模型。该方法不仅提高了性能,还在参数和训练速度上优于单编码器方法,展示了多视觉编码器在视频理解中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01054', 'title': 'Dynamic Scaling of Unit Tests for Code Reward Modeling', 'url': 'https://huggingface.co/papers/2501.01054', 'abstract': 'Current large language models (LLMs) often struggle to produce accurate responses on the first attempt for complex reasoning tasks like code generation. Prior research tackles this challenge by generating multiple candidate solutions and validating them with LLM-generated unit tests. The execution results of unit tests serve as reward signals to identify correct solutions. As LLMs always confidently make mistakes, these unit tests are not reliable, thereby diminishing the quality of reward signals. Motivated by the observation that scaling the number of solutions improves LLM performance, we explore the impact of scaling unit tests to enhance reward signal quality. Our pioneer experiment reveals a positive correlation between the number of unit tests and reward signal quality, with greater benefits observed in more challenging problems. Based on these insights, we propose CodeRM-8B, a lightweight yet effective unit test generator that enables efficient and high-quality unit test scaling. Additionally, we implement a dynamic scaling mechanism that adapts the number of unit tests based on problem difficulty, further improving efficiency. 
Experimental results show that our approach significantly improves performance across various models on three benchmarks (e.g., with gains of 18.43% for Llama3-8B and 3.42% for GPT-4o-mini on HumanEval Plus).', 'score': 15, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '33b9590f2acb0e48', 'authors': ['Zeyao Ma', 'Xiaokang Zhang', 'Jing Zhang', 'Jifan Yu', 'Sijia Luo', 'Jie Tang'], 'affiliations': ['Key Laboratory of Data Engineering and Knowledge Engineering, Beijing, China', 'School of Information, Renmin University of China', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01054.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#training', '#small_models', '#rlhf', '#optimization'], 'emoji': '🧪', 'ru': {'title': 'Масштабирование юнит-тестов для повышения точности LLM в сложных задачах', 'desc': 'Эта статья посвящена улучшению точности больших языковых моделей (LLM) в задачах сложного мышления, таких как генерация кода. Авторы предлагают метод масштабирования юнит-тестов для повышения качества сигналов вознаграждения при оценке решений. Они разработали легковесный генератор юнит-тестов CodeRM-8B и механизм динамического масштабирования, адаптирующийся к сложности задачи. Эксперименты показали значительное улучшение производительности различных моделей на нескольких тестовых наборах.'}, 'en': {'title': 'Enhancing LLM Performance through Scaled Unit Testing', 'desc': 'This paper addresses the limitations of large language models (LLMs) in generating accurate responses for complex tasks like code generation. It highlights the issue of unreliable reward signals from LLM-generated unit tests, which can lead to incorrect solutions. The authors propose a novel approach, CodeRM-8B, which generates a larger number of unit tests to improve the quality of these reward signals. Their experiments demonstrate that scaling unit tests enhances LLM performance, particularly for more challenging problems, leading to significant improvements across various models.'}, 'zh': {'title': '提升单元测试质量,增强模型性能', 'desc': '当前的大型语言模型(LLMs)在复杂推理任务(如代码生成)中,往往难以在第一次尝试时产生准确的响应。以往的研究通过生成多个候选解决方案并使用LLM生成的单元测试进行验证来应对这一挑战。单元测试的执行结果作为奖励信号,用于识别正确的解决方案。然而,由于LLMs常常自信地犯错,这些单元测试的可靠性不足,从而降低了奖励信号的质量。我们提出了CodeRM-8B,一个轻量级且有效的单元测试生成器,能够高效地扩展单元测试,并根据问题的难度动态调整单元测试的数量,从而进一步提高效率。'}}}, {'id': 'https://huggingface.co/papers/2501.01320', 'title': 'SeedVR: Seeding Infinity in Diffusion Transformer Towards Generic Video Restoration', 'url': 'https://huggingface.co/papers/2501.01320', 'abstract': "Video restoration poses non-trivial challenges in maintaining fidelity while recovering temporally consistent details from unknown degradations in the wild. Despite recent advances in diffusion-based restoration, these methods often face limitations in generation capability and sampling efficiency. In this work, we present SeedVR, a diffusion transformer designed to handle real-world video restoration with arbitrary length and resolution. The core design of SeedVR lies in the shifted window attention that facilitates effective restoration on long video sequences. SeedVR further supports variable-sized windows near the boundary of both spatial and temporal dimensions, overcoming the resolution constraints of traditional window attention. 
Equipped with contemporary practices, including causal video autoencoder, mixed image and video training, and progressive training, SeedVR achieves highly-competitive performance on both synthetic and real-world benchmarks, as well as AI-generated videos. Extensive experiments demonstrate SeedVR's superiority over existing methods for generic video restoration.", 'score': 8, 'issue_id': 1479, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'fa277e5baed864a4', 'authors': ['Jianyi Wang', 'Zhijie Lin', 'Meng Wei', 'Yang Zhao', 'Ceyuan Yang', 'Chen Change Loy', 'Lu Jiang'], 'affiliations': ['ByteDance', 'Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01320.jpg', 'data': {'categories': ['#architecture', '#benchmark', '#long_context', '#video', '#training', '#diffusion', '#synthetic'], 'emoji': '🎥', 'ru': {'title': 'SeedVR: Восстановление видео нового поколения с помощью диффузионных трансформеров', 'desc': 'SeedVR - это диффузионный трансформер для восстановления видео в реальных условиях. Он использует сдвинутое оконное внимание для эффективной обработки длинных видеопоследовательностей. SeedVR поддерживает окна переменного размера на границах пространственных и временных измерений, преодолевая ограничения традиционного оконного внимания. Благодаря современным практикам, таким как каузальный видеоавтоэнкодер и прогрессивное обучение, SeedVR достигает высоких результатов на синтетических и реальных тестовых наборах.'}, 'en': {'title': 'SeedVR: Revolutionizing Video Restoration with Diffusion Transformers', 'desc': 'This paper introduces SeedVR, a novel diffusion transformer aimed at improving video restoration by effectively managing long sequences and varying resolutions. It utilizes shifted window attention to enhance the restoration process, allowing for better handling of temporal consistency and fidelity in videos. SeedVR incorporates advanced techniques such as causal video autoencoders and mixed training strategies to boost its performance on both synthetic and real-world datasets. The results show that SeedVR outperforms existing video restoration methods, making it a significant advancement in the field.'}, 'zh': {'title': 'SeedVR:高效的视频修复新方法', 'desc': '视频修复面临着在恢复未知退化的同时保持细节一致性的挑战。尽管基于扩散的修复方法有所进展,但它们在生成能力和采样效率上仍存在局限性。本文提出了SeedVR,这是一种专为处理任意长度和分辨率的真实视频修复而设计的扩散变换器。SeedVR通过移动窗口注意力机制,有效地处理长视频序列,并在空间和时间维度的边界附近支持可变大小的窗口,克服了传统窗口注意力的分辨率限制。'}}}, {'id': 'https://huggingface.co/papers/2412.21015', 'title': 'MapQaTor: A System for Efficient Annotation of Map Query Datasets', 'url': 'https://huggingface.co/papers/2412.21015', 'abstract': 'Mapping and navigation services like Google Maps, Apple Maps, Openstreet Maps, are essential for accessing various location-based data, yet they often struggle to handle natural language geospatial queries. Recent advancements in Large Language Models (LLMs) show promise in question answering (QA), but creating reliable geospatial QA datasets from map services remains challenging. We introduce MapQaTor, a web application that streamlines the creation of reproducible, traceable map-based QA datasets. With its plug-and-play architecture, MapQaTor enables seamless integration with any maps API, allowing users to gather and visualize data from diverse sources with minimal setup. By caching API responses, the platform ensures consistent ground truth, enhancing the reliability of the data even as real-world information evolves. 
MapQaTor centralizes data retrieval, annotation, and visualization within a single platform, offering a unique opportunity to evaluate the current state of LLM-based geospatial reasoning while advancing their capabilities for improved geospatial understanding. Evaluation metrics show that, MapQaTor speeds up the annotation process by at least 30 times compared to manual methods, underscoring its potential for developing geospatial resources, such as complex map reasoning datasets. The website is live at: https://mapqator.github.io/ and a demo video is available at: https://youtu.be/7_aV9Wmhs6Q.', 'score': 8, 'issue_id': 1477, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '0d1081756b5bc4f7', 'authors': ['Mahir Labib Dihan', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Qatar Computing Research Institute (QCRI)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21015.jpg', 'data': {'categories': ['#dataset', '#science', '#reasoning', '#data', '#benchmark'], 'emoji': '🗺️', 'ru': {'title': 'MapQaTor: Революция в создании геопространственных данных для ИИ', 'desc': 'MapQaTor - это веб-приложение, которое упрощает создание воспроизводимых наборов данных для вопросно-ответных систем на основе карт. Оно интегрируется с любым картографическим API и позволяет собирать и визуализировать данные из различных источников. MapQaTor кэширует ответы API, обеспечивая согласованность данных, и централизует процессы сбора, аннотации и визуализации. Приложение ускоряет процесс аннотации в 30 раз по сравнению с ручными методами, что делает его полезным инструментом для развития геопространственных ресурсов и оценки возможностей больших языковых моделей в области геопространственных рассуждений.'}, 'en': {'title': 'Streamlining Geospatial QA with MapQaTor', 'desc': 'This paper presents MapQaTor, a web application designed to facilitate the creation of geospatial question answering (QA) datasets using map services. It leverages recent advancements in Large Language Models (LLMs) to improve the handling of natural language queries related to locations. The platform features a plug-and-play architecture that integrates with various maps APIs, allowing users to efficiently gather, annotate, and visualize geospatial data. By caching API responses, MapQaTor ensures consistent and reliable data, significantly speeding up the annotation process and enhancing the evaluation of LLM-based geospatial reasoning capabilities.'}, 'zh': {'title': 'MapQaTor:提升地图问答数据集创建效率的利器', 'desc': '本文介绍了MapQaTor,一个用于创建地图问答数据集的网络应用程序。它利用大型语言模型的优势,简化了从地图服务生成可重复和可追溯的数据集的过程。MapQaTor支持与任何地图API的无缝集成,并通过缓存API响应来确保数据的一致性。该平台显著提高了数据标注的效率,展示了在地理空间推理方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01407', 'title': 'Nested Attention: Semantic-aware Attention Values for Concept Personalization', 'url': 'https://huggingface.co/papers/2501.01407', 'abstract': "Personalizing text-to-image models to generate images of specific subjects across diverse scenes and styles is a rapidly advancing field. Current approaches often face challenges in maintaining a balance between identity preservation and alignment with the input text prompt. Some methods rely on a single textual token to represent a subject, which limits expressiveness, while others employ richer representations but disrupt the model's prior, diminishing prompt alignment. 
In this work, we introduce Nested Attention, a novel mechanism that injects a rich and expressive image representation into the model's existing cross-attention layers. Our key idea is to generate query-dependent subject values, derived from nested attention layers that learn to select relevant subject features for each region in the generated image. We integrate these nested layers into an encoder-based personalization method, and show that they enable high identity preservation while adhering to input text prompts. Our approach is general and can be trained on various domains. Additionally, its prior preservation allows us to combine multiple personalized subjects from different domains in a single image.", 'score': 7, 'issue_id': 1487, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '537e7bcc16fb17f5', 'authors': ['Or Patashnik', 'Rinon Gal', 'Daniil Ostashev', 'Sergey Tulyakov', 'Kfir Aberman', 'Daniel Cohen-Or'], 'affiliations': ['Snap Research', 'Tel Aviv University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01407.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#cv'], 'emoji': '🎨', 'ru': {'title': 'Nested Attention: новый подход к персонализации генерации изображений', 'desc': "Статья представляет новый метод под названием 'Nested Attention' для персонализации моделей text-to-image. Этот механизм внедряет богатое и выразительное представление изображения в существующие слои кросс-внимания модели. Ключевая идея заключается в генерации зависимых от запроса значений субъекта, полученных из вложенных слоев внимания. Метод позволяет достичь высокого сохранения идентичности при соблюдении входных текстовых подсказок."}, 'en': {'title': 'Nested Attention: Balancing Identity and Text Alignment in Image Generation', 'desc': 'This paper presents a new method called Nested Attention for personalizing text-to-image models. The method addresses the challenge of balancing identity preservation of subjects with the alignment to text prompts. By using query-dependent subject values from nested attention layers, the model can effectively select relevant features for each part of the generated image. This approach not only maintains high identity fidelity but also allows for the integration of multiple personalized subjects from different domains into a single image.'}, 'zh': {'title': '嵌套注意力:个性化图像生成的新方法', 'desc': '本文介绍了一种新的机制,称为嵌套注意力,用于个性化文本到图像模型。该方法通过在模型的交叉注意力层中注入丰富的图像表示,解决了身份保留与文本提示对齐之间的平衡问题。嵌套注意力层能够为生成图像的每个区域选择相关的主题特征,从而实现高效的个性化。我们的研究表明,这种方法可以在多个领域进行训练,并允许在单个图像中结合来自不同领域的多个个性化主题。'}}}, {'id': 'https://huggingface.co/papers/2501.00658', 'title': 'Understanding and Mitigating Bottlenecks of State Space Models through the Lens of Recency and Over-smoothing', 'url': 'https://huggingface.co/papers/2501.00658', 'abstract': "Structured State Space Models (SSMs) have emerged as alternatives to transformers. While SSMs are often regarded as effective in capturing long-sequence dependencies, we rigorously demonstrate that they are inherently limited by strong recency bias. Our empirical studies also reveal that this bias impairs the models' ability to recall distant information and introduces robustness issues. Our scaling experiments then discovered that deeper structures in SSMs can facilitate the learning of long contexts. However, subsequent theoretical analysis reveals that as SSMs increase in depth, they exhibit another inevitable tendency toward over-smoothing, e.g., token representations becoming increasingly indistinguishable. 
This fundamental dilemma between recency and over-smoothing hinders the scalability of existing SSMs. Inspired by our theoretical findings, we propose to polarize two channels of the state transition matrices in SSMs, setting them to zero and one, respectively, simultaneously addressing recency bias and over-smoothing. Experiments demonstrate that our polarization technique consistently enhances the associative recall accuracy of long-range tokens and unlocks SSMs to benefit further from deeper architectures. All source codes are released at https://github.com/VITA-Group/SSM-Bottleneck.", 'score': 6, 'issue_id': 1476, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '253304ea64defbe0', 'authors': ['Peihao Wang', 'Ruisi Cai', 'Yuehao Wang', 'Jiajun Zhu', 'Pragya Srivastava', 'Zhangyang Wang', 'Pan Li'], 'affiliations': ['Georgia Tech', 'Google DeepMind', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00658.jpg', 'data': {'categories': ['#training', '#open_source', '#long_context', '#optimization', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Преодоление ограничений SSM: баланс между недавностью и сглаживанием', 'desc': 'Структурированные модели пространства состояний (SSM) рассматриваются как альтернатива трансформерам в обработке длинных последовательностей. Исследование показало, что SSM имеют существенное ограничение в виде сильного смещения к недавним данным, что затрудняет запоминание отдаленной информации. Увеличение глубины SSM улучшает обработку длинных контекстов, но приводит к проблеме чрезмерного сглаживания. Авторы предлагают метод поляризации каналов матриц перехода состояний для решения этих проблем, что улучшает точность ассоциативного извлечения дальних токенов.'}, 'en': {'title': 'Balancing Recency and Over-Smoothing in SSMs', 'desc': "This paper discusses Structured State Space Models (SSMs) as alternatives to transformers, highlighting their limitations due to strong recency bias. This bias affects the models' ability to remember distant information and creates robustness issues. The authors propose a solution by polarizing the state transition matrices, which helps mitigate both recency bias and over-smoothing that occurs with deeper architectures. Their experiments show that this new approach improves the accuracy of recalling long-range tokens, allowing SSMs to effectively utilize deeper structures."}, 'zh': {'title': '解决近期偏见与过平滑的双重挑战', 'desc': '结构状态空间模型(SSMs)作为变换器的替代方案,虽然在捕捉长序列依赖性方面表现出色,但存在强烈的近期偏见限制。我们的实证研究表明,这种偏见影响了模型对远程信息的回忆能力,并引入了鲁棒性问题。通过扩展实验,我们发现SSMs的深层结构可以促进长上下文的学习,但理论分析显示,随着深度增加,模型会出现过平滑的趋势,使得标记表示变得难以区分。我们提出的极化技术通过将状态转移矩阵的两个通道设置为零和一,解决了近期偏见和过平滑的问题,显著提高了长距离标记的关联回忆准确性。'}}}, {'id': 'https://huggingface.co/papers/2501.01245', 'title': 'SeFAR: Semi-supervised Fine-grained Action Recognition with Temporal Perturbation and Learning Stabilization', 'url': 'https://huggingface.co/papers/2501.01245', 'abstract': 'Human action understanding is crucial for the advancement of multimodal systems. While recent developments, driven by powerful large language models (LLMs), aim to be general enough to cover a wide range of categories, they often overlook the need for more specific capabilities. In this work, we address the more challenging task of Fine-grained Action Recognition (FAR), which focuses on detailed semantic labels within shorter temporal duration (e.g., "salto backward tucked with 1 turn"). 
Given the high costs of annotating fine-grained labels and the substantial data needed for fine-tuning LLMs, we propose to adopt semi-supervised learning (SSL). Our framework, SeFAR, incorporates several innovative designs to tackle these challenges. Specifically, to capture sufficient visual details, we construct Dual-level temporal elements as more effective representations, based on which we design a new strong augmentation strategy for the Teacher-Student learning paradigm through involving moderate temporal perturbation. Furthermore, to handle the high uncertainty within the teacher model\'s predictions for FAR, we propose the Adaptive Regulation to stabilize the learning process. Experiments show that SeFAR achieves state-of-the-art performance on two FAR datasets, FineGym and FineDiving, across various data scopes. It also outperforms other semi-supervised methods on two classical coarse-grained datasets, UCF101 and HMDB51. Further analysis and ablation studies validate the effectiveness of our designs. Additionally, we show that the features extracted by our SeFAR could largely promote the ability of multimodal foundation models to understand fine-grained and domain-specific semantics.', 'score': 5, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '30d94590a5c78569', 'authors': ['Yongle Huang', 'Haodong Chen', 'Zhenbang Xu', 'Zihan Jia', 'Haozhou Sun', 'Dian Shao'], 'affiliations': ['School of Automation, Northwestern Polytechnical University, Xian, China', 'School of Computer Science, Northwestern Polytechnical University, Xian, China', 'School of Software, Northwestern Polytechnical University, Xian, China', 'Unmanned System Research Institute, Northwestern Polytechnical University, Xian, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01245.jpg', 'data': {'categories': ['#dataset', '#transfer_learning', '#multimodal', '#optimization', '#training'], 'emoji': '🤸', 'ru': {'title': 'SeFAR: Прорыв в распознавании детализированных действий с помощью полу-контролируемого обучения', 'desc': 'Статья представляет новый подход к задаче распознавания детализированных действий (Fine-grained Action Recognition, FAR) с использованием полу-контролируемого обучения. Авторы предлагают фреймворк SeFAR, который включает в себя двухуровневые временные элементы для более эффективного представления действий и новую стратегию аугментации данных. SeFAR также использует адаптивную регуляцию для стабилизации процесса обучения при работе с неопределенностью в предсказаниях модели-учителя. Эксперименты показывают, что SeFAR достигает лучших результатов на нескольких наборах данных FAR и классических наборах данных для распознавания действий.'}, 'en': {'title': 'SeFAR: Elevating Fine-grained Action Recognition with Semi-supervised Learning', 'desc': "This paper focuses on improving Fine-grained Action Recognition (FAR), which identifies specific actions in short time frames. The authors introduce a semi-supervised learning framework called SeFAR, which uses innovative techniques to enhance the learning process despite the challenges of limited labeled data. They develop Dual-level temporal elements for better visual representation and implement a strong augmentation strategy within a Teacher-Student learning setup. 
The results demonstrate that SeFAR achieves top performance on FAR datasets and enhances multimodal models' understanding of detailed actions."}, 'zh': {'title': '细粒度动作识别的新突破', 'desc': '人类动作理解对多模态系统的发展至关重要。本文提出了一种新的框架SeFAR,专注于细粒度动作识别(FAR),旨在处理短时间内的详细语义标签。我们采用半监督学习(SSL)来减少对大量标注数据的需求,并通过构建双层时间元素和新的强增强策略来提高模型的表现。实验结果表明,SeFAR在多个数据集上达到了最先进的性能,证明了我们设计的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.00910', 'title': 'Population Aware Diffusion for Time Series Generation', 'url': 'https://huggingface.co/papers/2501.00910', 'abstract': 'Diffusion models have shown promising ability in generating high-quality time series (TS) data. Despite the initial success, existing works mostly focus on the authenticity of data at the individual level, but pay less attention to preserving the population-level properties on the entire dataset. Such population-level properties include value distributions for each dimension and distributions of certain functional dependencies (e.g., cross-correlation, CC) between different dimensions. For instance, when generating house energy consumption TS data, the value distributions of the outside temperature and the kitchen temperature should be preserved, as well as the distribution of CC between them. Preserving such TS population-level properties is critical in maintaining the statistical insights of the datasets, mitigating model bias, and augmenting downstream tasks like TS prediction. Yet, it is often overlooked by existing models. Hence, data generated by existing models often bear distribution shifts from the original data. We propose Population-aware Diffusion for Time Series (PaD-TS), a new TS generation model that better preserves the population-level properties. The key novelties of PaD-TS include 1) a new training method explicitly incorporating TS population-level property preservation, and 2) a new dual-channel encoder model architecture that better captures the TS data structure. Empirical results in major benchmark datasets show that PaD-TS can improve the average CC distribution shift score between real and synthetic data by 5.9x while maintaining a performance comparable to state-of-the-art models on individual-level authenticity.', 'score': 4, 'issue_id': 1486, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'cd3f9282d55e15f2', 'authors': ['Yang Li', 'Han Meng', 'Zhenyu Bi', 'Ingolv T. Urnes', 'Haipeng Chen'], 'affiliations': ['Generated Health', 'Virginia Tech', 'William & Mary'], 'pdf_title_img': 'assets/pdf/title_img/2501.00910.jpg', 'data': {'categories': ['#synthetic', '#benchmark', '#dataset', '#data', '#training', '#architecture', '#diffusion'], 'emoji': '📊', 'ru': {'title': 'Генерация временных рядов с сохранением свойств популяции', 'desc': 'Статья представляет новую модель генерации временных рядов под названием PaD-TS (Population-aware Diffusion for Time Series). Модель нацелена на сохранение свойств на уровне популяции, таких как распределения значений и функциональные зависимости между измерениями. PaD-TS использует новый метод обучения, явно включающий сохранение свойств временных рядов на уровне популяции, и новую архитектуру модели с двухканальным энкодером. 
Эмпирические результаты показывают значительное улучшение в сохранении распределения кросс-корреляций при сравнимой аутентичности на индивидуальном уровне.'}, 'en': {'title': 'Preserving Population Insights in Time Series Generation', 'desc': 'This paper introduces a new model called Population-aware Diffusion for Time Series (PaD-TS) that focuses on generating time series data while preserving important population-level properties. Unlike previous models that mainly ensure individual data authenticity, PaD-TS emphasizes maintaining the overall statistical characteristics of the dataset, such as value distributions and cross-correlations between different dimensions. The model employs a novel training method and a dual-channel encoder architecture to effectively capture the structure of time series data. Experimental results demonstrate that PaD-TS significantly reduces distribution shifts in generated data while achieving comparable performance in individual-level authenticity to existing state-of-the-art models.'}, 'zh': {'title': '保留人口级特性,提升时间序列生成质量', 'desc': '扩散模型在生成高质量时间序列数据方面表现出色。然而,现有研究主要关注个体数据的真实性,而忽视了整个数据集的人口级特性。我们提出了一种新的时间序列生成模型PaD-TS,旨在更好地保留这些人口级特性,包括值分布和不同维度之间的交叉相关性。实验结果表明,PaD-TS在保持个体级真实性的同时,显著改善了真实数据与合成数据之间的分布差异。'}}}, {'id': 'https://huggingface.co/papers/2501.00712', 'title': 'Rethinking Addressing in Language Models via Contexualized Equivariant Positional Encoding', 'url': 'https://huggingface.co/papers/2501.00712', 'abstract': 'Transformers rely on both content-based and position-based addressing mechanisms to make predictions, but existing positional encoding techniques often diminish the effectiveness of position-based addressing. Many current methods enforce rigid patterns in attention maps, limiting the ability to model long-range dependencies and adapt to diverse tasks. Additionally, most positional encodings are learned as general biases, lacking the specialization required for different instances within a dataset. To address this, we propose conTextualized equivariAnt Position Embedding (TAPE), a novel framework that enhances positional embeddings by incorporating sequence content across layers. TAPE introduces dynamic, context-aware positional encodings, overcoming the constraints of traditional fixed patterns. By enforcing permutation and orthogonal equivariance, TAPE ensures the stability of positional encodings during updates, improving robustness and adaptability. Our method can be easily integrated into pre-trained transformers, offering parameter-efficient fine-tuning with minimal overhead. Extensive experiments shows that TAPE achieves superior performance in language modeling, arithmetic reasoning, and long-context retrieval tasks compared to existing positional embedding techniques.', 'score': 4, 'issue_id': 1485, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'e5119d0e83ce2af2', 'authors': ['Jiajun Zhu', 'Peihao Wang', 'Ruisi Cai', 'Jason D. Lee', 'Pan Li', 'Zhangyang Wang'], 'affiliations': ['Georgia Tech', 'Princeton University', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00712.jpg', 'data': {'categories': ['#long_context', '#optimization', '#training', '#architecture', '#reasoning'], 'emoji': '🔀', 'ru': {'title': 'Динамические позиционные эмбеддинги для улучшения работы трансформеров', 'desc': 'Авторы предлагают новый метод позиционного кодирования для трансформеров под названием TAPE. 
Этот подход учитывает контекст последовательности и создает динамические позиционные эмбеддинги, адаптированные к конкретным задачам. TAPE обеспечивает стабильность кодирования благодаря свойствам перестановочной и ортогональной эквивариантности. Метод легко интегрируется в предобученные модели и показывает превосходные результаты в задачах языкового моделирования, арифметических рассуждений и поиска в длинных контекстах.'}, 'en': {'title': 'Enhancing Transformers with Context-Aware Positional Embeddings', 'desc': "This paper introduces a new method called conTextualized equivariAnt Position Embedding (TAPE) to improve how transformers use positional information. Traditional positional encodings often restrict the model's ability to understand long-range relationships in data. TAPE enhances these encodings by making them dynamic and context-aware, allowing them to adapt to different sequences and tasks. The method shows better performance in various applications, such as language modeling and reasoning, while being easy to integrate into existing transformer models."}, 'zh': {'title': '提升变换器模型的位置信息处理能力', 'desc': '本文提出了一种新的位置编码方法,称为TAPE(conTextualized equivariAnt Position Embedding),旨在提高变换器模型的预测能力。传统的位置编码方法往往限制了模型对长距离依赖关系的建模能力,而TAPE通过引入动态的、上下文感知的位置编码来克服这一问题。该方法确保了位置编码在更新过程中的稳定性,从而提高了模型的鲁棒性和适应性。实验结果表明,TAPE在语言建模、算术推理和长上下文检索任务中表现优于现有的位置编码技术。'}}}, {'id': 'https://huggingface.co/papers/2412.19723', 'title': 'OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis', 'url': 'https://huggingface.co/papers/2412.19723', 'abstract': "Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-defined tasks, which are either resource-intensive or unable to guarantee data quality. Moreover, these methods suffer from limited data diversity and significant gaps between synthetic data and real-world environments. To address these challenges, we propose OS-Genesis, a novel GUI data synthesis pipeline that reverses the conventional trajectory collection process. Instead of relying on pre-defined tasks, OS-Genesis enables agents first to perceive environments and perform step-wise interactions, then retrospectively derive high-quality tasks to enable trajectory-level exploration. A trajectory reward model is then employed to ensure the quality of the generated trajectories. We demonstrate that training GUI agents with OS-Genesis significantly improves their performance on highly challenging online benchmarks. In-depth analysis further validates OS-Genesis's efficiency and its superior data quality and diversity compared to existing synthesis methods. 
Our codes, data, and checkpoints are available at https://qiushisun.github.io/OS-Genesis-Home/.", 'score': 50, 'issue_id': 1455, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'b331198d09aa8650', 'authors': ['Qiushi Sun', 'Kanzhi Cheng', 'Zichen Ding', 'Chuanyang Jin', 'Yian Wang', 'Fangzhi Xu', 'Zhenyu Wu', 'Chengyou Jia', 'Liheng Chen', 'Zhoumianze Liu', 'Ben Kao', 'Guohao Li', 'Junxian He', 'Yu Qiao', 'Zhiyong Wu'], 'affiliations': ['Hong Kong University of Science and Technology', 'Johns Hopkins University', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The University of Hong Kong', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2412.19723.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#dataset', '#optimization', '#training', '#data', '#agents'], 'emoji': '🖥️', 'ru': {'title': 'Революция в обучении ИИ-агентов: от заданий к исследованию', 'desc': 'Статья представляет OS-Genesis - новый метод синтеза данных для обучения ИИ-агентов взаимодействию с графическим интерфейсом. Вместо предопределенных задач, агенты сначала исследуют среду и выполняют пошаговые действия, а затем ретроспективно формируют качественные траектории. Используется модель вознаграждения для обеспечения качества сгенерированных траекторий. Результаты показывают значительное улучшение производительности агентов на сложных онлайн-бенчмарках по сравнению с существующими методами.'}, 'en': {'title': 'Revolutionizing GUI Agent Training with OS-Genesis', 'desc': 'This paper introduces OS-Genesis, a new method for generating high-quality trajectory data for training GUI agents using Vision-Language Models (VLMs). Unlike traditional methods that rely on human supervision or predefined tasks, OS-Genesis allows agents to first interact with their environment and then derive tasks retrospectively. This approach enhances data diversity and quality by enabling agents to explore and learn from real-world interactions. The results show that GUI agents trained with OS-Genesis perform significantly better on challenging benchmarks, demonstrating the effectiveness of this novel data synthesis pipeline.'}, 'zh': {'title': 'OS-Genesis:提升GUI代理性能的新方法', 'desc': '本论文提出了一种名为OS-Genesis的新型图形用户界面(GUI)数据合成管道,旨在解决高质量轨迹数据收集的瓶颈。传统方法依赖于人类监督或合成数据生成,往往资源消耗大且数据质量难以保证。OS-Genesis通过让代理先感知环境并进行逐步交互,随后回溯生成高质量任务,从而实现轨迹级探索。实验结果表明,使用OS-Genesis训练的GUI代理在复杂的在线基准测试中表现显著提升,且其数据质量和多样性优于现有合成方法。'}}}, {'id': 'https://huggingface.co/papers/2412.19638', 'title': 'Xmodel-2 Technical Report', 'url': 'https://huggingface.co/papers/2412.19638', 'abstract': 'Xmodel-2 is a 1.2-billion-parameter large language model designed specifically for reasoning tasks. Its architecture enables different model scales to share a unified set of hyperparameters, allowing for extensive experimentation on smaller models and seamless transfer of optimal configurations to larger models. To maximize training efficiency and stability, Xmodel-2 employs the WSD learning rate scheduler from MiniCPM. Pretrained on 1.5 trillion tokens from diverse sources, Xmodel-2 achieves state-of-the-art performance in complex reasoning and agent-based tasks, while maintaining low training costs. These results highlight the potential of efficient model design and training strategies in advancing reasoning capabilities. 
Model checkpoints and code are publicly available on GitHub at https://github.com/XiaoduoAILab/Xmodel-2', 'score': 11, 'issue_id': 1453, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': '4707dc8ac5a87e66', 'authors': ['Wang Qun', 'Liu Yang', 'Lin Qingquan', 'Qu Zhijiu', 'Jiang Ling'], 'affiliations': ['AI Lab, Xiaodu Technology'], 'pdf_title_img': 'assets/pdf/title_img/2412.19638.jpg', 'data': {'categories': ['#optimization', '#training', '#small_models', '#reasoning', '#open_source', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное рассуждение с Xmodel-2: мощь в компактности', 'desc': 'Xmodel-2 - это языковая модель с 1,2 миллиардами параметров, специализирующаяся на задачах рассуждения. Её архитектура позволяет разным масштабам модели использовать единый набор гиперпараметров, что облегчает эксперименты и перенос оптимальных конфигураций. Модель использует планировщик скорости обучения WSD из MiniCPM для повышения эффективности и стабильности. Предобученная на 1,5 триллионах токенов, Xmodel-2 достигает передовых результатов в сложных задачах рассуждения, сохраняя низкие затраты на обучение.'}, 'en': {'title': 'Unlocking Reasoning Power with Efficient Model Design', 'desc': 'Xmodel-2 is a large language model with 1.2 billion parameters, specifically built for reasoning tasks. It features a flexible architecture that allows different model sizes to use the same hyperparameters, facilitating experimentation and optimization across scales. The model utilizes the WSD learning rate scheduler to enhance training efficiency and stability. With pretraining on 1.5 trillion tokens, Xmodel-2 demonstrates superior performance in complex reasoning tasks while keeping training costs low, showcasing the benefits of efficient model design.'}, 'zh': {'title': '高效推理能力的模型设计与训练策略', 'desc': 'Xmodel-2 是一个拥有 12 亿参数的大型语言模型,专门设计用于推理任务。它的架构允许不同规模的模型共享统一的超参数,从而可以在较小的模型上进行广泛实验,并将最佳配置无缝转移到更大的模型上。为了最大化训练效率和稳定性,Xmodel-2 采用了 MiniCPM 的 WSD 学习率调度器。经过在 1.5 万亿个来自多样化来源的标记上进行预训练,Xmodel-2 在复杂推理和基于代理的任务中达到了最先进的性能,同时保持了较低的训练成本。'}}}, {'id': 'https://huggingface.co/papers/2412.20735', 'title': 'HUNYUANPROVER: A Scalable Data Synthesis Framework and Guided Tree Search for Automated Theorem Proving', 'url': 'https://huggingface.co/papers/2412.20735', 'abstract': 'We introduce HunyuanProver, an language model finetuned from the Hunyuan 7B for interactive automatic theorem proving with LEAN4. To alleviate the data sparsity issue, we design a scalable framework to iterative synthesize data with low cost. Besides, guided tree search algorithms are designed to enable effective "system 2 thinking" of the prover. HunyuanProver achieves state-of-the-art (SOTA) performances on major benchmarks. Specifically, it achieves a pass of 68.4% on the miniF2F-test compared to 65.9%, the current SOTA results. It proves 4 IMO statements (imo_1960_p2, imo_1962_p2, imo_1964_p2 and imo_1983_p6) in miniF2F-test. 
To benefit the community, we will open-source a dataset of 30k synthesized instances, where each instance contains the original question in natural language, the converted statement by autoformalization, and the proof by HunyuanProver.', 'score': 3, 'issue_id': 1464, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '18d70581e862bf86', 'authors': ['Yang Li', 'Dong Du', 'Linfeng Song', 'Chen Li', 'Weikang Wang', 'Tao Yang', 'Haitao Mi'], 'affiliations': ['Tencent', 'Tencent Hunyuan Teams'], 'pdf_title_img': 'assets/pdf/title_img/2412.20735.jpg', 'data': {'categories': ['#dataset', '#synthetic', '#data', '#benchmark', '#reasoning', '#open_source', '#training', '#math'], 'emoji': '🧠', 'ru': {'title': 'Прорыв в автоматическом доказательстве теорем с помощью ИИ', 'desc': "HunyuanProver - это языковая модель, настроенная для автоматического доказательства теорем с использованием LEAN4. Модель использует масштабируемую структуру для итеративного синтеза данных и алгоритмы направленного поиска по дереву для эффективного 'системного мышления'. HunyuanProver достигает лучших результатов на основных бенчмарках, включая 68.4% прохождения на miniF2F-test. Авторы планируют открыть доступ к набору данных из 30 тысяч синтезированных примеров для пользы сообщества."}, 'en': {'title': 'HunyuanProver: Advancing Theorem Proving with AI', 'desc': 'HunyuanProver is a language model specifically fine-tuned for interactive automatic theorem proving using LEAN4. To address the challenge of data sparsity, the authors developed a scalable framework that allows for the iterative synthesis of data at a low cost. They also implemented guided tree search algorithms to enhance the reasoning capabilities of the prover, enabling it to perform complex logical deductions. HunyuanProver has achieved state-of-the-art performance on key benchmarks, including a notable pass rate of 68.4% on the miniF2F-test, surpassing previous results and proving several significant mathematical statements.'}, 'zh': {'title': 'HunyuanProver:自动定理证明的新突破', 'desc': '本文介绍了HunyuanProver,这是一个基于Hunyuan 7B微调的语言模型,旨在与LEAN4进行交互式自动定理证明。为了缓解数据稀疏问题,我们设计了一个可扩展的框架,以低成本迭代合成数据。此外,我们还设计了引导树搜索算法,以实现证明者的有效“系统2思维”。HunyuanProver在主要基准测试中达到了最先进的性能,特别是在miniF2F-test中取得了68.4%的通过率,超越了当前的65.9%最先进结果。'}}}, {'id': 'https://huggingface.co/papers/2501.05441', 'title': 'The GAN is dead; long live the GAN! A Modern GAN Baseline', 'url': 'https://huggingface.co/papers/2501.05441', 'abstract': 'There is a widely-spread claim that GANs are difficult to train, and GAN architectures in the literature are littered with empirical tricks. We provide evidence against this claim and build a modern GAN baseline in a more principled manner. First, we derive a well-behaved regularized relativistic GAN loss that addresses issues of mode dropping and non-convergence that were previously tackled via a bag of ad-hoc tricks. We analyze our loss mathematically and prove that it admits local convergence guarantees, unlike most existing relativistic losses. Second, our new loss allows us to discard all ad-hoc tricks and replace outdated backbones used in common GANs with modern architectures. Using StyleGAN2 as an example, we present a roadmap of simplification and modernization that results in a new minimalist baseline -- R3GAN. 
Despite being simple, our approach surpasses StyleGAN2 on FFHQ, ImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against state-of-the-art GANs and diffusion models.', 'score': 51, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'eb1cd90c4d5cb0ef', 'authors': ['Yiwen Huang', 'Aaron Gokaslan', 'Volodymyr Kuleshov', 'James Tompkin'], 'affiliations': ['Brown University', 'Cornell University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05441.jpg', 'data': {'categories': ['#training', '#architecture', '#diffusion', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Упрощение и модернизация GAN: новый взгляд на обучение генеративных моделей', 'desc': 'Исследователи опровергают распространенное мнение о сложности обучения генеративно-состязательных сетей (GAN). Они разработали новый регуляризованный релятивистский GAN-лосс, который решает проблемы потери мод и отсутствия сходимости. Авторы математически доказывают, что их лосс обеспечивает локальную сходимость, в отличие от большинства существующих релятивистских лоссов. На основе этого подхода они создали минималистичную базовую модель R3GAN, которая превосходит StyleGAN2 и другие современные GAN на нескольких наборах данных.'}, 'en': {'title': 'Simplifying GAN Training with R3GAN: A New Era of Efficiency', 'desc': 'This paper challenges the common belief that Generative Adversarial Networks (GANs) are inherently difficult to train. It introduces a new GAN loss function called the regularized relativistic GAN loss, which effectively addresses issues like mode dropping and non-convergence without relying on numerous empirical tricks. The authors provide mathematical analysis showing that their loss function guarantees local convergence, which is a significant improvement over existing methods. By applying this new loss to modern architectures like StyleGAN2, they create a simplified and efficient GAN model named R3GAN, which outperforms previous models on several benchmark datasets.'}, 'zh': {'title': '简化GAN训练,超越传统架构', 'desc': '这篇论文探讨了生成对抗网络(GAN)训练的难点,并提出了一种新的方法来简化这一过程。作者提出了一种正则化的相对GAN损失函数,解决了模式丢失和非收敛的问题。通过数学分析,证明了这种损失函数具有局部收敛的保证,优于现有的相对损失函数。最终,作者展示了一个新的简约基线R3GAN,其在多个数据集上的表现超过了StyleGAN2,并与最先进的GAN和扩散模型相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.05032', 'title': 'Enhancing Human-Like Responses in Large Language Models', 'url': 'https://huggingface.co/papers/2501.05032', 'abstract': 'This paper explores the advancements in making large language models (LLMs) more human-like. We focus on techniques that enhance natural language understanding, conversational coherence, and emotional intelligence in AI systems. The study evaluates various approaches, including fine-tuning with diverse datasets, incorporating psychological principles, and designing models that better mimic human reasoning patterns. Our findings demonstrate that these enhancements not only improve user interactions but also open new possibilities for AI applications across different domains. 
Future work will address the ethical implications and potential biases introduced by these human-like attributes.', 'score': 28, 'issue_id': 1609, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '64e14687fd1e5dab', 'authors': ['Ethem Yağız Çalık', 'Talha Rüzgar Akkuş'], 'affiliations': ['Hugging Face'], 'pdf_title_img': 'assets/pdf/title_img/2501.05032.jpg', 'data': {'categories': ['#training', '#alignment', '#rlhf', '#ethics', '#multimodal'], 'emoji': '🤖', 'ru': {'title': 'Путь к человекоподобному ИИ: улучшение больших языковых моделей', 'desc': 'Статья исследует методы повышения человекоподобности больших языковых моделей (LLM). Авторы рассматривают техники улучшения понимания естественного языка, связности диалогов и эмоционального интеллекта в системах искусственного интеллекта. Исследование оценивает различные подходы, включая дообучение на разнообразных датасетах, внедрение психологических принципов и разработку моделей, лучше имитирующих человеческие паттерны мышления. Результаты показывают, что эти улучшения не только совершенствуют взаимодействие с пользователем, но и открывают новые возможности для применения ИИ в различных областях.'}, 'en': {'title': 'Enhancing AI: Making Language Models More Human-Like', 'desc': 'This paper investigates how to make large language models (LLMs) behave more like humans. It emphasizes improving natural language understanding, making conversations more coherent, and increasing emotional intelligence in AI. The research assesses methods such as fine-tuning models with varied datasets and applying psychological principles to enhance human-like reasoning. The results show that these improvements lead to better user experiences and expand the potential uses of AI, while also highlighting the need to consider ethical issues and biases that may arise.'}, 'zh': {'title': '让人工智能更像人类的未来', 'desc': '本文探讨了使大型语言模型(LLMs)更具人性化的进展。我们重点关注增强自然语言理解、对话连贯性和情感智能的技术。研究评估了多种方法,包括使用多样化数据集进行微调、融入心理学原理,以及设计更好模拟人类推理模式的模型。我们的发现表明,这些增强不仅改善了用户互动,还为不同领域的人工智能应用开辟了新可能。'}}}, {'id': 'https://huggingface.co/papers/2501.05453', 'title': 'An Empirical Study of Autoregressive Pre-training from Videos', 'url': 'https://huggingface.co/papers/2501.05453', 'abstract': 'We empirically study autoregressive pre-training from videos. To perform our study, we construct a series of autoregressive video models, called Toto. We treat videos as sequences of visual tokens and train transformer models to autoregressively predict future tokens. Our models are pre-trained on a diverse dataset of videos and images comprising over 1 trillion visual tokens. We explore different architectural, training, and inference design choices. We evaluate the learned visual representations on a range of downstream tasks including image recognition, video classification, object tracking, and robotics. Our results demonstrate that, despite minimal inductive biases, autoregressive pre-training leads to competitive performance across all benchmarks. Finally, we find that scaling our video models results in similar scaling curves to those seen in language models, albeit with a different rate. 
More details at https://brjathu.github.io/toto/', 'score': 28, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '3846ea8507d046be', 'authors': ['Jathushan Rajasegaran', 'Ilija Radosavovic', 'Rahul Ravishankar', 'Yossi Gandelsman', 'Christoph Feichtenhofer', 'Jitendra Malik'], 'affiliations': ['Meta FAIR', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.05453.jpg', 'data': {'categories': ['#training', '#dataset', '#benchmark', '#architecture', '#robotics', '#video', '#cv'], 'emoji': '🎬', 'ru': {'title': 'Авторегрессионное предобучение видео: путь к универсальному компьютерному зрению', 'desc': 'В статье исследуется авторегрессионное предобучение на видеоданных с использованием модели Toto. Авторы рассматривают видео как последовательности визуальных токенов и обучают трансформеры предсказывать будущие токены. Модели предобучаются на разнообразном наборе данных из более чем триллиона визуальных токенов. Результаты показывают, что такой подход дает конкурентоспособную производительность на различных задачах компьютерного зрения.'}, 'en': {'title': 'Unlocking Video Understanding with Autoregressive Models', 'desc': 'This paper investigates the use of autoregressive pre-training for video data through a series of models named Toto. The authors treat videos as sequences of visual tokens and employ transformer architectures to predict future tokens in these sequences. They pre-train their models on a massive dataset containing over 1 trillion visual tokens, exploring various design choices in architecture and training. The results show that these autoregressive models achieve strong performance on tasks like image recognition and video classification, indicating that scaling video models can yield similar benefits as seen in language models.'}, 'zh': {'title': '自回归预训练:视频模型的新突破', 'desc': '本文研究了视频的自回归预训练。我们构建了一系列名为Toto的自回归视频模型,将视频视为视觉标记的序列,并训练变换器模型以自回归方式预测未来的标记。我们的模型在一个包含超过1万亿视觉标记的多样化视频和图像数据集上进行预训练,并在多个下游任务上评估学习到的视觉表示。结果表明,尽管诱导偏差较小,自回归预训练在所有基准测试中表现出竞争力的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04003', 'title': 'Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives', 'url': 'https://huggingface.co/papers/2501.04003', 'abstract': "Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. To address this gap, we introduce DriveBench, a benchmark dataset designed to evaluate VLM reliability across 17 settings (clean, corrupted, and text-only inputs), encompassing 19,200 frames, 20,498 question-answer pairs, three question types, four mainstream driving tasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often generate plausible responses derived from general knowledge or textual cues rather than true visual grounding, especially under degraded or missing visual inputs. This behavior, concealed by dataset imbalances and insufficient evaluation metrics, poses significant risks in safety-critical scenarios like autonomous driving. We further observe that VLMs struggle with multi-modal reasoning and display heightened sensitivity to input corruptions, leading to inconsistencies in performance. 
To address these challenges, we propose refined evaluation metrics that prioritize robust visual grounding and multi-modal understanding. Additionally, we highlight the potential of leveraging VLMs' awareness of corruptions to enhance their reliability, offering a roadmap for developing more trustworthy and interpretable decision-making systems in real-world autonomous driving contexts. The benchmark toolkit is publicly accessible.", 'score': 20, 'issue_id': 1599, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '720b493a608f478a', 'authors': ['Shaoyuan Xie', 'Lingdong Kong', 'Yuhao Dong', 'Chonghao Sima', 'Wenwei Zhang', 'Qi Alfred Chen', 'Ziwei Liu', 'Liang Pan'], 'affiliations': ['National University of Singapore', 'S-Lab, Nanyang Technological University', 'Shanghai AI Laboratory', 'The University of Hong Kong', 'University of California, Irvine'], 'pdf_title_img': 'assets/pdf/title_img/2501.04003.jpg', 'data': {'categories': ['#security', '#interpretability', '#dataset', '#multimodal', '#reasoning', '#benchmark', '#cv'], 'emoji': '🚗', 'ru': {'title': 'Проверка надёжности VLM для безопасного автономного вождения', 'desc': 'Статья представляет DriveBench - набор данных для оценки надёжности мультимодальных языковых моделей (VLM) в контексте автономного вождения. Исследование выявило, что VLM часто генерируют правдоподобные ответы на основе общих знаний, а не визуальной информации, что опасно в критически важных сценариях. Авторы предлагают усовершенствованные метрики оценки, ориентированные на надёжную визуальную привязку и мультимодальное понимание. Также отмечается потенциал использования осведомленности VLM о искажениях для повышения их надёжности.'}, 'en': {'title': 'Enhancing Trust in Vision-Language Models for Safer Autonomous Driving', 'desc': 'This paper discusses the limitations of Vision-Language Models (VLMs) in the context of autonomous driving, particularly their ability to provide reliable and interpretable driving decisions. The authors introduce DriveBench, a comprehensive benchmark dataset that tests VLM performance across various conditions, including clean and corrupted inputs. Their research shows that VLMs often rely on general knowledge rather than true visual understanding, especially when visual data is compromised. To improve VLM reliability, the paper suggests new evaluation metrics focused on visual grounding and multi-modal reasoning, aiming to enhance the safety of autonomous driving systems.'}, 'zh': {'title': '提升自动驾驶决策的可靠性与可解释性', 'desc': '本文介绍了DriveBench,一个用于评估视觉语言模型(VLMs)在自动驾驶中可靠性的基准数据集。该数据集包含19200帧图像和20498个问答对,涵盖了多种驾驶任务和输入类型。研究发现,VLMs在处理受损或缺失的视觉输入时,往往依赖于一般知识而非真实的视觉信息,导致安全隐患。为了解决这些问题,本文提出了改进的评估指标,强调视觉基础和多模态理解的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.05122', 'title': 'Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.05122', 'abstract': 'Most Large Vision-Language Models (LVLMs) to date are trained predominantly on English data, which makes them struggle to understand non-English input and fail to generate output in the desired target language. Existing efforts mitigate these issues by adding multilingual training data, but do so in a largely ad-hoc manner, lacking insight into how different training mixes tip the scale for different groups of languages. In this work, we present a comprehensive investigation into the training strategies for massively multilingual LVLMs. 
First, we conduct a series of multi-stage experiments spanning 13 downstream vision-language tasks and 43 languages, systematically examining: (1) the number of training languages that can be included without degrading English performance and (2) optimal language distributions of pre-training as well as (3) instruction-tuning data. Further, we (4) investigate how to improve multilingual text-in-image understanding, and introduce a new benchmark for the task. Surprisingly, our analysis reveals that one can (i) include as many as 100 training languages simultaneously (ii) with as little as 25-50\\% of non-English data, to greatly improve multilingual performance while retaining strong English performance. We further find that (iii) including non-English OCR data in pre-training and instruction-tuning is paramount for improving multilingual text-in-image understanding. Finally, we put all our findings together and train Centurio, a 100-language LVLM, offering state-of-the-art performance in an evaluation covering 14 tasks and 56 languages.', 'score': 13, 'issue_id': 1604, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '92d74f3bbeb4a400', 'authors': ['Gregor Geigle', 'Florian Schneider', 'Carolin Holtermann', 'Chris Biemann', 'Radu Timofte', 'Anne Lauscher', 'Goran Glavaš'], 'affiliations': ['Data Science Group, University of Hamburg', 'Language Technology Group', 'WüNLP, Computer Vision Lab, CAIDAS, University of Würzburg'], 'pdf_title_img': 'assets/pdf/title_img/2501.05122.jpg', 'data': {'categories': ['#machine_translation', '#multilingual', '#benchmark', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Centurio: Прорыв в многоязычном визуально-языковом ИИ', 'desc': 'В статье описывается исследование стратегий обучения многоязычных крупномасштабных визуально-языковых моделей (LVLMs). Авторы проводят эксперименты на 13 задачах и 43 языках, изучая оптимальное распределение языков в данных для предобучения и инструктивной настройки. Они обнаруживают, что можно включить до 100 языков обучения, используя всего 25-50% неанглийских данных, значительно улучшив многоязычную производительность при сохранении высокой эффективности на английском. На основе полученных результатов авторы обучают Centurio - 100-язычную LVLM, демонстрирующую передовые результаты на 14 задачах и 56 языках.'}, 'en': {'title': 'Unlocking Multilingual Mastery in Vision-Language Models', 'desc': 'This paper investigates how to effectively train Large Vision-Language Models (LVLMs) on multiple languages, particularly focusing on improving their performance in non-English languages. The authors conduct experiments across various tasks and languages to determine the best strategies for including multilingual data without harming English performance. They discover that including up to 100 languages and using a smaller proportion of non-English data can enhance multilingual capabilities while maintaining strong English results. 
Additionally, they emphasize the importance of incorporating non-English OCR data to boost understanding of text within images, culminating in the development of Centurio, a 100-language LVLM with state-of-the-art performance.'}, 'zh': {'title': '提升多语言理解,Centurio引领新潮流', 'desc': '本文研究了大规模多语言视觉-语言模型(LVLM)的训练策略,特别关注如何提高模型对非英语输入的理解和输出能力。我们通过多阶段实验,分析了包含多种语言的训练数据对英语性能的影响,并探索了最佳的语言分布策略。研究发现,最多可以同时包含100种语言的训练数据,并且只需25-50%的非英语数据即可显著提升多语言性能。最后,我们结合所有发现,训练了Centurio,一个支持100种语言的LVLM,在14个任务和56种语言的评估中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.03489', 'title': 'Entropy-Guided Attention for Private LLMs', 'url': 'https://huggingface.co/papers/2501.03489', 'abstract': "The pervasiveness of proprietary language models has raised critical privacy concerns, necessitating advancements in private inference (PI), where computations are performed directly on encrypted data without revealing users' sensitive information. While PI offers a promising solution, its practical deployment is hindered by substantial communication and latency overheads, primarily stemming from nonlinear operations. To address this, we introduce an information-theoretic framework to characterize the role of nonlinearities in decoder-only language models, laying a principled foundation for optimizing transformer-architectures tailored to the demands of PI. By leveraging Shannon's entropy as a quantitative measure, we uncover the previously unexplored dual significance of nonlinearities: beyond ensuring training stability, they are crucial for maintaining attention head diversity. Specifically, we find that their removal triggers two critical failure modes: entropy collapse in deeper layers that destabilizes training, and entropic overload in earlier layers that leads to under-utilization of Multi-Head Attention's (MHA) representational capacity. We propose an entropy-guided attention mechanism paired with a novel entropy regularization technique to mitigate entropic overload. Additionally, we explore PI-friendly alternatives to layer normalization for preventing entropy collapse and stabilizing the training of LLMs with reduced-nonlinearities. Our study bridges the gap between information theory and architectural design, establishing entropy dynamics as a principled guide for developing efficient PI architectures. The code and implementation are available at https://github.com/Nandan91/entropy-guided-attention-llm.", 'score': 11, 'issue_id': 1597, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '18abcfb3fe1b209b', 'authors': ['Nandan Kumar Jha', 'Brandon Reagen'], 'affiliations': ['New York University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03489.jpg', 'data': {'categories': ['#security', '#inference', '#optimization', '#architecture', '#training', '#open_source'], 'emoji': '🔐', 'ru': {'title': 'Энтропия как ключ к конфиденциальным языковым моделям', 'desc': 'Статья рассматривает проблему конфиденциальности при использовании языковых моделей и предлагает решение через частное вычисление (PI). Авторы представляют информационно-теоретическую основу для оптимизации архитектур трансформеров под задачи PI, используя энтропию Шеннона как количественную меру. Исследование выявляет двойную роль нелинейностей в моделях: обеспечение стабильности обучения и поддержание разнообразия в механизме внимания.
Предложен энтропийно-управляемый механизм внимания и новая техника регуляризации энтропии для улучшения эффективности PI-архитектур.'}, 'en': {'title': 'Optimizing Language Models for Privacy with Entropy Dynamics', 'desc': 'This paper addresses privacy concerns related to proprietary language models by focusing on private inference (PI), which allows computations on encrypted data. The authors introduce an information-theoretic framework to analyze the impact of nonlinearities in decoder-only language models, which are essential for optimizing transformer architectures for PI. They identify two critical issues caused by the removal of nonlinearities: entropy collapse in deeper layers and entropic overload in earlier layers, both of which affect training stability and attention mechanisms. To resolve these issues, the paper proposes an entropy-guided attention mechanism and explores alternatives to layer normalization, aiming to enhance the efficiency of PI architectures while maintaining model performance.'}, 'zh': {'title': '优化私密推理架构的熵动态', 'desc': '本论文探讨了在加密数据上进行私密推理(PI)时,非线性操作对解码器语言模型的影响。我们提出了一种信息论框架,帮助优化适合PI需求的变换器架构。研究发现,非线性不仅确保了训练的稳定性,还对注意力头的多样性至关重要。为了解决熵崩溃和熵过载问题,我们提出了一种基于熵的注意力机制和新的熵正则化技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05040', 'title': 'SWE-Fixer: Training Open-Source LLMs for Effective and Efficient GitHub Issue Resolution', 'url': 'https://huggingface.co/papers/2501.05040', 'abstract': 'Large Language Models (LLMs) have demonstrated remarkable proficiency across a variety of complex tasks. One significant application of LLMs is in tackling software engineering challenges, particularly in resolving real-world tasks on GitHub by fixing code based on the issues reported by the users. However, many current approaches rely on proprietary LLMs, which limits reproducibility, accessibility, and transparency. The critical components of LLMs for addressing software engineering issues and how their capabilities can be effectively enhanced remain unclear. To address these challenges, we introduce SWE-Fixer, a novel open-source LLM designed to effectively and efficiently resolve GitHub issues. SWE-Fixer comprises two essential modules: a code file retrieval module and a code editing module. The retrieval module employs BM25 along with a lightweight LLM model to achieve coarse-to-fine file retrieval. Subsequently, the code editing module utilizes the other LLM model to generate patches for the identified files. Then, to mitigate the lack of publicly available datasets, we compile an extensive dataset that includes 110K GitHub issues along with their corresponding patches, and train the two modules of SWE-Fixer separately. We assess our approach on the SWE-Bench Lite and Verified benchmarks, achieving state-of-the-art performance among open-source models with scores of 23.3% and 30.2%, respectively. These outcomes highlight the efficacy of our approach. 
We will make our model, dataset, and code publicly available at https://github.com/InternLM/SWE-Fixer.', 'score': 8, 'issue_id': 1608, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '54d8f8a0fe5436c6', 'authors': ['Chengxing Xie', 'Bowen Li', 'Chang Gao', 'He Du', 'Wai Lam', 'Difan Zou', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong', 'The University of Hong Kong', 'Xidian University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05040.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#architecture', '#benchmark', '#training', '#science'], 'emoji': '🛠️', 'ru': {'title': 'Открытая языковая модель для эффективного решения проблем на GitHub', 'desc': 'SWE-Fixer - это новая модель с открытым исходным кодом для решения проблем на GitHub. Она состоит из модуля поиска файлов кода и модуля редактирования кода, использующих легковесные языковые модели. Авторы создали обширный датасет из 110 тысяч GitHub-issues с патчами для обучения модели. SWE-Fixer достигла лучших результатов среди моделей с открытым кодом на бенчмарках SWE-Bench Lite и Verified.'}, 'en': {'title': 'SWE-Fixer: Open-Source Solutions for GitHub Issues', 'desc': 'This paper presents SWE-Fixer, an open-source Large Language Model (LLM) specifically designed to address software engineering challenges on GitHub. It features two main components: a code file retrieval module that uses BM25 and a lightweight LLM for efficient file identification, and a code editing module that generates code patches using another LLM. The authors also created a comprehensive dataset of 110,000 GitHub issues and their corresponding patches to train the model effectively. SWE-Fixer achieves state-of-the-art performance on benchmark tests, demonstrating its potential to enhance accessibility and transparency in software engineering solutions.'}, 'zh': {'title': '开源LLM助力软件工程问题解决', 'desc': '大型语言模型(LLMs)在处理复杂任务方面表现出色,尤其是在软件工程领域。本文介绍了一种新颖的开源LLM,名为SWE-Fixer,旨在有效解决GitHub上的问题。SWE-Fixer包含两个主要模块:代码文件检索模块和代码编辑模块,前者使用BM25和轻量级LLM进行文件检索,后者生成代码补丁。通过构建包含11万个GitHub问题及其补丁的数据集,SWE-Fixer在开源模型中实现了领先的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04377', 'title': 'On Computational Limits and Provably Efficient Criteria of Visual Autoregressive Models: A Fine-Grained Complexity Analysis', 'url': 'https://huggingface.co/papers/2501.04377', 'abstract': 'Recently, Visual Autoregressive (VAR) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine "next-scale prediction" paradigm. However, the state-of-the-art algorithm of VAR models in [Tian, Jiang, Yuan, Peng and Wang, NeurIPS 2024] takes O(n^4) time, which is computationally inefficient. In this work, we analyze the computational limits and efficiency criteria of VAR Models through a fine-grained complexity lens. Our key contribution is identifying the conditions under which VAR computations can achieve sub-quadratic time complexity. Specifically, we establish a critical threshold for the norm of input matrices used in VAR attention mechanisms. Above this threshold, assuming the Strong Exponential Time Hypothesis (SETH) from fine-grained complexity theory, a sub-quartic time algorithm for VAR models is impossible. To substantiate our theoretical findings, we present efficient constructions leveraging low-rank approximations that align with the derived criteria. 
This work initiates the study of the computational efficiency of the VAR model from a theoretical perspective. Our technique will shed light on advancing scalable and efficient image generation in VAR frameworks.', 'score': 8, 'issue_id': 1597, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'be8a0f20db676680', 'authors': ['Yekun Ke', 'Xiaoyu Li', 'Yingyu Liang', 'Zhizhou Sha', 'Zhenmei Shi', 'Zhao Song'], 'affiliations': ['The Simons Institute for the Theory of Computing at UC Berkeley', 'The University of Hong Kong', 'Tsinghua University', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.04377.jpg', 'data': {'categories': ['#math', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Преодоление вычислительных барьеров в VAR моделях', 'desc': 'Статья исследует вычислительные ограничения и критерии эффективности Визуальных Авторегрессионных (VAR) моделей с точки зрения тонкой теории сложности. Авторы определяют условия, при которых вычисления VAR могут достичь субквадратичной временной сложности. Они устанавливают критический порог для нормы входных матриц, используемых в механизмах внимания VAR, выше которого невозможен субкварцевый алгоритм времени для моделей VAR. Представлены эффективные конструкции, использующие аппроксимации низкого ранга, которые соответствуют выведенным критериям.'}, 'en': {'title': 'Unlocking Efficiency in Image Generation with VAR Models', 'desc': 'This paper explores the computational efficiency of Visual Autoregressive (VAR) Models, which are used for generating images. The authors identify that the current state-of-the-art VAR algorithm is computationally expensive, operating in O(n^4) time complexity. They establish conditions under which VAR computations can be optimized to achieve sub-quadratic time complexity, particularly focusing on the input matrix norms in the attention mechanisms. By applying low-rank approximations, the authors provide practical constructions that meet their theoretical criteria, paving the way for more efficient image generation techniques in VAR frameworks.'}, 'zh': {'title': '提升VAR模型的计算效率', 'desc': '最近,视觉自回归(VAR)模型在图像生成领域取得了突破性进展,采用粗到细的“下一个尺度预测”范式。然而,VAR模型的最新算法在计算上效率低下,时间复杂度为O(n^4)。本文通过细粒度复杂性分析,探讨了VAR模型的计算限制和效率标准。我们确定了VAR计算可以实现亚二次时间复杂度的条件,并提出了利用低秩近似的高效构造,以支持我们的理论发现。'}}}, {'id': 'https://huggingface.co/papers/2501.04828', 'title': 'Building Foundations for Natural Language Processing of Historical Turkish: Resources and Models', 'url': 'https://huggingface.co/papers/2501.04828', 'abstract': 'This paper introduces foundational resources and models for natural language processing (NLP) of historical Turkish, a domain that has remained underexplored in computational linguistics. We present the first named entity recognition (NER) dataset, HisTR and the first Universal Dependencies treebank, OTA-BOUN for a historical form of the Turkish language along with transformer-based models trained using these datasets for named entity recognition, dependency parsing, and part-of-speech tagging tasks. Additionally, we introduce Ottoman Text Corpus (OTC), a clean corpus of transliterated historical Turkish texts that spans a wide range of historical periods. Our experimental results show significant improvements in the computational analysis of historical Turkish, achieving promising results in tasks that require understanding of historical linguistic structures. 
They also highlight existing challenges, such as domain adaptation and language variations across time periods. All of the presented resources and models are made available at https://huggingface.co/bucolin to serve as a benchmark for future progress in historical Turkish NLP.', 'score': 6, 'issue_id': 1603, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '40fe69c40d907fc4', 'authors': ['Şaziye Betül Özateş', 'Tarık Emre Tıraş', 'Ece Elif Adak', 'Berat Doğan', 'Fatih Burak Karagöz', 'Efe Eren Genç', 'Esma F. Bilgin Taşdemir'], 'affiliations': ['Bogaziçi University', 'Medeniyet University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04828.jpg', 'data': {'categories': ['#dataset', '#data', '#low_resource', '#science', '#multilingual', '#benchmark'], 'emoji': '🏛️', 'ru': {'title': 'Прорыв в NLP для исторического турецкого языка', 'desc': 'Статья представляет первые ресурсы и модели для обработки естественного языка (NLP) исторического турецкого языка. Авторы создали первый датасет для распознавания именованных сущностей (NER) HisTR и первый Universal Dependencies тривбанк OTA-BOUN для исторической формы турецкого языка. Также были разработаны трансформерные модели для задач NER, синтаксического анализа и морфологической разметки. Дополнительно представлен Османский текстовый корпус (OTC) - очищенный корпус транслитерированных исторических турецких текстов разных периодов.'}, 'en': {'title': 'Unlocking Historical Turkish: New Resources for NLP', 'desc': 'This paper provides essential resources and models for processing historical Turkish language using natural language processing (NLP) techniques. It introduces the first named entity recognition (NER) dataset, HisTR, and the first Universal Dependencies treebank, OTA-BOUN, specifically for historical Turkish. The authors also present the Ottoman Text Corpus (OTC), a comprehensive collection of transliterated texts from various historical periods. The results demonstrate advancements in analyzing historical Turkish, while also addressing challenges like domain adaptation and linguistic variations over time.'}, 'zh': {'title': '推动历史土耳其语NLP的进步', 'desc': '本文介绍了历史土耳其语自然语言处理(NLP)的基础资源和模型,这是一个在计算语言学中尚未深入研究的领域。我们首次发布了命名实体识别(NER)数据集HisTR和历史土耳其语的Universal Dependencies树库OTA-BOUN,并基于这些数据集训练了用于命名实体识别、依存句法分析和词性标注任务的变换器模型。此外,我们还推出了奥斯曼文本语料库(OTC),这是一个涵盖多个历史时期的清晰转写历史土耳其语文本的语料库。实验结果显示,在历史土耳其语的计算分析中取得了显著进展,但也突显了领域适应和语言随时间变化等挑战。'}}}];
+ const articlesData = [{'id': 'https://huggingface.co/papers/2501.02976', 'title': 'STAR: Spatial-Temporal Augmentation with Text-to-Video Models for Real-World Video Super-Resolution', 'url': 'https://huggingface.co/papers/2501.02976', 'abstract': 'Image diffusion models have been adapted for real-world video super-resolution to tackle over-smoothing issues in GAN-based methods. However, these models struggle to maintain temporal consistency, as they are trained on static images, limiting their ability to capture temporal dynamics effectively. Integrating text-to-video (T2V) models into video super-resolution for improved temporal modeling is straightforward. However, two key challenges remain: artifacts introduced by complex degradations in real-world scenarios, and compromised fidelity due to the strong generative capacity of powerful T2V models (e.g., CogVideoX-5B). To enhance the spatio-temporal quality of restored videos, we introduce STAR (Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution), a novel approach that leverages T2V models for real-world video super-resolution, achieving realistic spatial details and robust temporal consistency. Specifically, we introduce a Local Information Enhancement Module (LIEM) before the global attention block to enrich local details and mitigate degradation artifacts. Moreover, we propose a Dynamic Frequency (DF) Loss to reinforce fidelity, guiding the model to focus on different frequency components across diffusion steps. Extensive experiments demonstrate STAR outperforms state-of-the-art methods on both synthetic and real-world datasets.', 'score': 36, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '13ac412646c508f5', 'authors': ['Rui Xie', 'Yinhong Liu', 'Penghao Zhou', 'Chen Zhao', 'Jun Zhou', 'Kai Zhang', 'Zhenyu Zhang', 'Jian Yang', 'Zhenheng Yang', 'Ying Tai'], 'affiliations': ['ByteDance', 'Nanjing University', 'Southwest University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02976.jpg', 'data': {'categories': ['#cv', '#optimization', '#diffusion', '#multimodal', '#video'], 'emoji': '🎥', 'ru': {'title': 'Качественное суперразрешение видео с помощью T2V моделей', 'desc': 'Представлена новая методика STAR для суперразрешения видео в реальных условиях с использованием моделей text-to-video. Предложен модуль LIEM для улучшения локальных деталей и устранения артефактов деградации. Введена функция потерь Dynamic Frequency для усиления точности восстановления на разных частотах. Эксперименты показывают превосходство STAR над современными методами на синтетических и реальных датасетах.'}, 'en': {'title': 'Enhancing Video Quality with T2V Models for Real-World Super-Resolution', 'desc': 'This paper presents a new method called Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution, which aims to improve video quality by addressing issues of over-smoothing and temporal consistency. Traditional image diffusion models struggle with video because they are designed for static images, leading to challenges in capturing motion dynamics. The proposed approach incorporates a Local Information Enhancement Module to enhance local details and reduce artifacts, along with a Dynamic Frequency Loss to maintain fidelity across different frequency components.
Experimental results show that this method outperforms existing techniques in both synthetic and real-world scenarios, providing better spatial and temporal quality in restored videos.'}, 'zh': {'title': '提升视频超分辨率的时空一致性', 'desc': '本文提出了一种新方法,名为~\\name~,用于提高真实世界视频超分辨率的时空质量。该方法结合了文本到视频(T2V)模型,以解决传统生成对抗网络(GAN)方法中的过平滑问题。通过引入局部信息增强模块(LIEM)和动态频率损失(DF Loss),该方法能够有效改善视频的局部细节和时间一致性。实验结果表明,~\\name~在合成和真实世界数据集上均优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.03226', 'title': 'BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning', 'url': 'https://huggingface.co/papers/2501.03226', 'abstract': "Cutting-edge large language models (LLMs) demonstrate promising performance in solving complex math problems with a divide-and-conquer pipeline and the assistance of in-context learning (ICL) examples. However, their potential for improvement is limited by two critical problems within their ICL examples: granularity-mismatch and the ensuing negative-effect noise problem. Specifically, the LLMs are capable of the dividing process yet mostly failed by inaccurate reasoning within a few conquer steps, while the ICL examples retrieved in question-grained sometimes lack relevant steps for a specific challenging reasoning step. Further, this disconnect may hinder the correct reasoning due to its irrelevance. To this end, we focus on improving the reasoning quality within each step and present BoostStep. BoostStep aligns the granularity between the retrieving and reasoning on step grained, and provides highly related ICL examples for each reasoning step with a novel `first-try' strategy. BoostStep provides more relevant examples than the coarse question-grained strategy, enhancing the model reasoning quality within each step steadily. BoostStep is a general and robust reasoning-enhancing method that not only improves standalone reasoning performance but also integrates seamlessly with Monte Carlo Tree Search methods (MCTS) to refine both candidate generation and decision-making. Quantitatively, it improves GPT-4o and Qwen2.5-Math-72B by 3.6\\% and 2.0\\% respectively on various mathematical benchmarks, and 7.5\\% gain combined with MCTS.", 'score': 21, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '94a01c7d4516c725', 'authors': ['Beichen Zhang', 'Yuhong Liu', 'Xiaoyi Dong', 'Yuhang Zang', 'Pan Zhang', 'Haodong Duan', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03226.jpg', 'data': {'categories': ['#training', '#optimization', '#math', '#reasoning'], 'emoji': '🧮', 'ru': {'title': 'BoostStep: Повышение точности рассуждений ИИ в решении математических задач', 'desc': 'Статья представляет метод BoostStep для улучшения решения сложных математических задач большими языковыми моделями. BoostStep решает проблемы несоответствия детализации и негативного шума в примерах обучения в контексте. Метод выравнивает гранулярность между извлечением и рассуждением на уровне шагов, предоставляя релевантные примеры для каждого шага рассуждения. 
BoostStep повышает качество рассуждений модели и может интегрироваться с методами поиска по дереву Монте-Карло для улучшения генерации кандидатов и принятия решений.'}, 'en': {'title': 'Boosting Reasoning Quality in Large Language Models with BoostStep', 'desc': "This paper introduces BoostStep, a method designed to enhance the reasoning quality of large language models (LLMs) when solving complex math problems. It addresses two main issues: granularity-mismatch and negative-effect noise in in-context learning (ICL) examples, which can lead to inaccurate reasoning. By aligning the granularity of retrieved examples with the specific reasoning steps required, BoostStep provides more relevant ICL examples, improving the model's performance. The method not only boosts standalone reasoning but also integrates effectively with Monte Carlo Tree Search (MCTS) to enhance decision-making processes."}, 'zh': {'title': '提升推理质量的BoostStep方法', 'desc': '这篇论文探讨了大型语言模型(LLMs)在解决复杂数学问题时的表现,特别是通过分而治之的策略和上下文学习(ICL)示例的辅助。研究发现,ICL示例中的粒度不匹配和负面噪声问题限制了模型的改进潜力。为了解决这些问题,论文提出了BoostStep方法,它通过对每个推理步骤的粒度进行对齐,提供更相关的ICL示例,从而提高推理质量。BoostStep不仅提升了独立推理的性能,还能与蒙特卡洛树搜索(MCTS)方法无缝集成,进一步优化候选生成和决策过程。'}}}, {'id': 'https://huggingface.co/papers/2501.03218', 'title': 'Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction', 'url': 'https://huggingface.co/papers/2501.03218', 'abstract': 'Active Real-time interaction with video LLMs introduces a new paradigm for human-computer interaction, where the model not only understands user intent but also responds while continuously processing streaming video on the fly. Unlike offline video LLMs, which analyze the entire video before answering questions, active real-time interaction requires three capabilities: 1) Perception: real-time video monitoring and interaction capturing. 2) Decision: raising proactive interaction in proper situations, 3) Reaction: continuous interaction with users. However, inherent conflicts exist among the desired capabilities. The Decision and Reaction require a contrary Perception scale and grain, and the autoregressive decoding blocks the real-time Perception and Decision during the Reaction. To unify the conflicted capabilities within a harmonious system, we present Dispider, a system that disentangles Perception, Decision, and Reaction. Dispider features a lightweight proactive streaming video processing module that tracks the video stream and identifies optimal moments for interaction. Once the interaction is triggered, an asynchronous interaction module provides detailed responses, while the processing module continues to monitor the video in the meantime. Our disentangled and asynchronous design ensures timely, contextually accurate, and computationally efficient responses, making Dispider ideal for active real-time interaction for long-duration video streams. Experiments show that Dispider not only maintains strong performance in conventional video QA tasks, but also significantly surpasses previous online models in streaming scenario responses, thereby validating the effectiveness of our architecture. 
The code and model are released at https://github.com/Mark12Ding/Dispider.', 'score': 20, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '1e9974be2d206516', 'authors': ['Rui Qian', 'Shuangrui Ding', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03218.jpg', 'data': {'categories': ['#long_context', '#video', '#optimization', '#architecture', '#interpretability'], 'emoji': '🎥', 'ru': {'title': 'Dispider: Интеллектуальное взаимодействие с видео в реальном времени', 'desc': 'Статья представляет систему Dispider для активного взаимодействия с видео в реальном времени с использованием языковых моделей. Система разделяет процессы восприятия, принятия решений и реакции, что позволяет эффективно обрабатывать потоковое видео и взаимодействовать с пользователем. Dispider использует легковесный модуль обработки видео для отслеживания потока и определения оптимальных моментов для взаимодействия. Асинхронная архитектура обеспечивает своевременные и точные ответы при длительной обработке видеопотоков.'}, 'en': {'title': 'Dispider: Real-time Interaction Redefined for Video LLMs', 'desc': 'This paper introduces Dispider, a system designed for active real-time interaction with video using large language models (LLMs). Unlike traditional offline models, Dispider can process video streams continuously while engaging with users, requiring three key capabilities: Perception, Decision, and Reaction. The system addresses conflicts between these capabilities by disentangling them, allowing for efficient monitoring and interaction without lag. Experimental results demonstrate that Dispider outperforms previous models in streaming scenarios, providing timely and contextually relevant responses during long-duration video interactions.'}, 'zh': {'title': '主动实时交互的新范式', 'desc': '本论文介绍了一种名为Dispider的系统,旨在实现视频大语言模型的主动实时交互。该系统通过分离感知、决策和反应三个能力,解决了实时交互中的固有冲突。Dispider具备轻量级的流媒体处理模块,能够实时监控视频流并识别最佳交互时机。实验结果表明,Dispider在传统视频问答任务中表现优异,并在流媒体场景响应上显著超越了之前的在线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.02157', 'title': 'Personalized Graph-Based Retrieval for Large Language Models', 'url': 'https://huggingface.co/papers/2501.02157', 'abstract': 'As large language models (LLMs) evolve, their ability to deliver personalized and context-aware responses offers transformative potential for improving user experiences. Existing personalization approaches, however, often rely solely on user history to augment the prompt, limiting their effectiveness in generating tailored outputs, especially in cold-start scenarios with sparse data. To address these limitations, we propose Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG), a framework that leverages user-centric knowledge graphs to enrich personalization. By directly integrating structured user knowledge into the retrieval process and augmenting prompts with user-relevant context, PGraphRAG enhances contextual understanding and output quality. We also introduce the Personalized Graph-based Benchmark for Text Generation, designed to evaluate personalized text generation tasks in real-world settings where user history is sparse or unavailable. 
Experimental results show that PGraphRAG significantly outperforms state-of-the-art personalization methods across diverse tasks, demonstrating the unique advantages of graph-based retrieval for personalization.', 'score': 16, 'issue_id': 1527, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '65e3736cfc1e3295', 'authors': ['Steven Au', 'Cameron J. Dimacali', 'Ojasmitha Pedirappagari', 'Namyong Park', 'Franck Dernoncourt', 'Yu Wang', 'Nikos Kanakaris', 'Hanieh Deilamsalehy', 'Ryan A. Rossi', 'Nesreen K. Ahmed'], 'affiliations': ['Adobe Research', 'Cisco AI Research', 'Meta AI', 'University of California Santa Cruz', 'University of Oregon', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02157.jpg', 'data': {'categories': ['#rag', '#optimization', '#graphs', '#multimodal', '#benchmark', '#games'], 'emoji': '🕸️', 'ru': {'title': 'Графы знаний на службе персонализации языковых моделей', 'desc': 'Статья представляет новый подход к персонализации ответов больших языковых моделей (LLM) под названием PGraphRAG. В отличие от существующих методов, полагающихся на историю пользователя, PGraphRAG использует ориентированные на пользователя графы знаний для обогащения контекста. Этот метод улучшает понимание контекста и качество генерируемых ответов, особенно в сценариях с ограниченными данными о пользователе. Экспериментальные результаты показывают, что PGraphRAG превосходит современные методы персонализации в различных задачах.'}, 'en': {'title': 'Revolutionizing Personalization with Graph-based Retrieval', 'desc': "This paper introduces a new framework called Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG) that enhances the personalization of large language models (LLMs). Unlike traditional methods that depend only on user history, PGraphRAG utilizes user-centric knowledge graphs to provide richer context for generating responses. By integrating structured user information into the retrieval process, it improves the model's understanding and the quality of its outputs, especially in situations where user data is limited. The authors also present a benchmark for evaluating personalized text generation, showing that PGraphRAG outperforms existing methods in various tasks."}, 'zh': {'title': '个性化图谱提升生成质量', 'desc': '随着大型语言模型的发展,它们在提供个性化和上下文感知的响应方面展现出巨大的潜力。现有的个性化方法通常仅依赖用户历史数据来增强提示,这在数据稀疏的冷启动场景中效果有限。为了解决这些问题,我们提出了个性化图谱检索增强生成(PGraphRAG)框架,利用以用户为中心的知识图谱来丰富个性化。实验结果表明,PGraphRAG在多种任务中显著优于现有的个性化方法,展示了基于图谱的检索在个性化中的独特优势。'}}}, {'id': 'https://huggingface.co/papers/2501.02497', 'title': 'Test-time Computing: from System-1 Thinking to System-2 Thinking', 'url': 'https://huggingface.co/papers/2501.02497', 'abstract': "The remarkable performance of the o1 model in complex reasoning demonstrates that test-time computing scaling can further unlock the model's potential, enabling powerful System-2 thinking. However, there is still a lack of comprehensive surveys for test-time computing scaling. We trace the concept of test-time computing back to System-1 models. In System-1 models, test-time computing addresses distribution shifts and improves robustness and generalization through parameter updating, input modification, representation editing, and output calibration. In System-2 models, it enhances the model's reasoning ability to solve complex problems through repeated sampling, self-correction, and tree search. 
We organize this survey according to the trend of System-1 to System-2 thinking, highlighting the key role of test-time computing in the transition from System-1 models to weak System-2 models, and then to strong System-2 models. We also point out a few possible future directions.", 'score': 15, 'issue_id': 1528, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '7d9414c60fe7701d', 'authors': ['Yixin Ji', 'Juntao Li', 'Hai Ye', 'Kaixin Wu', 'Jia Xu', 'Linjian Mo', 'Min Zhang'], 'affiliations': ['Ant Group', 'Department of Computer Science, National University of Singapore', 'School of Computer Science and Technology, Soochow University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02497.jpg', 'data': {'categories': ['#reasoning', '#math', '#survey', '#training'], 'emoji': '🧠', 'ru': {'title': 'Масштабирование вычислений: путь к мышлению System-2', 'desc': 'Эта статья рассматривает масштабирование вычислений во время тестирования для улучшения производительности моделей машинного обучения. Авторы прослеживают эволюцию этой концепции от моделей System-1 до моделей System-2. В работе описываются различные методы, такие как обновление параметров, модификация входных данных и древовидный поиск. Исследование подчеркивает ключевую роль вычислений во время тестирования в переходе от моделей System-1 к сильным моделям System-2.'}, 'en': {'title': 'Unlocking Model Potential: The Power of Test-Time Computing', 'desc': 'This paper explores the concept of test-time computing scaling and its impact on machine learning models, particularly in enhancing reasoning capabilities. It distinguishes between System-1 models, which focus on improving robustness and generalization through techniques like parameter updating and output calibration, and System-2 models, which utilize methods such as repeated sampling and self-correction for complex problem-solving. The authors trace the evolution from System-1 to System-2 thinking, emphasizing how test-time computing plays a crucial role in this transition. Additionally, the paper identifies potential future research directions in this area.'}, 'zh': {'title': '测试时计算:从系统-1到强系统-2的关键转变', 'desc': '这篇论文探讨了测试时计算扩展对机器学习模型的影响,特别是在复杂推理中的应用。作者指出,测试时计算可以通过参数更新、输入修改、表示编辑和输出校准来提高模型的鲁棒性和泛化能力。对于系统-2模型,测试时计算通过重复采样、自我修正和树搜索来增强模型的推理能力。论文还强调了测试时计算在从系统-1模型向弱系统-2模型再到强系统-2模型转变中的关键作用,并提出了一些未来的研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.02045', 'title': 'METAGENE-1: Metagenomic Foundation Model for Pandemic Monitoring', 'url': 'https://huggingface.co/papers/2501.02045', 'abstract': 'We pretrain METAGENE-1, a 7-billion-parameter autoregressive transformer model, which we refer to as a metagenomic foundation model, on a novel corpus of diverse metagenomic DNA and RNA sequences comprising over 1.5 trillion base pairs. This dataset is sourced from a large collection of human wastewater samples, processed and sequenced using deep metagenomic (next-generation) sequencing methods. Unlike genomic models that focus on individual genomes or curated sets of specific species, the aim of METAGENE-1 is to capture the full distribution of genomic information present within this wastewater, to aid in tasks relevant to pandemic monitoring and pathogen detection. We carry out byte-pair encoding (BPE) tokenization on our dataset, tailored for metagenomic sequences, and then pretrain our model. 
In this paper, we first detail the pretraining dataset, tokenization strategy, and model architecture, highlighting the considerations and design choices that enable the effective modeling of metagenomic data. We then show results of pretraining this model on our metagenomic dataset, providing details about our losses, system metrics, and training stability over the course of pretraining. Finally, we demonstrate the performance of METAGENE-1, which achieves state-of-the-art results on a set of genomic benchmarks and new evaluations focused on human-pathogen detection and genomic sequence embedding, showcasing its potential for public health applications in pandemic monitoring, biosurveillance, and early detection of emerging health threats.', 'score': 12, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '60a3568f555ed60f', 'authors': ['Ollie Liu', 'Sami Jaghouar', 'Johannes Hagemann', 'Shangshang Wang', 'Jason Wiemels', 'Jeff Kaufman', 'Willie Neiswanger'], 'affiliations': ['Nucleic Acid Observatory', 'Prime Intellect', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02045.jpg', 'data': {'categories': ['#benchmark', '#data', '#training', '#architecture', '#science', '#dataset', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'METAGENE-1: Метагеномная модель для мониторинга здоровья населения', 'desc': 'METAGENE-1 - это автореграссивная трансформерная модель с 7 миллиардами параметров, обученная на разнообразных метагеномных последовательностях ДНК и РНК. Модель создана для анализа геномной информации из образцов сточных вод с целью мониторинга пандемий и обнаружения патогенов. Авторы описывают процесс предобучения, включая токенизацию и архитектуру модели, а также демонстрируют результаты на различных геномных задачах. METAGENE-1 показывает высокую эффективность в обнаружении патогенов человека и встраивании геномных последовательностей, что открывает перспективы для применения в общественном здравоохранении.'}, 'en': {'title': 'Unlocking Metagenomics: METAGENE-1 for Pandemic Preparedness', 'desc': 'The paper introduces METAGENE-1, a large autoregressive transformer model designed for metagenomic data analysis. It is pretrained on a vast dataset of metagenomic DNA and RNA sequences derived from human wastewater, totaling over 1.5 trillion base pairs. The model aims to enhance pandemic monitoring and pathogen detection by capturing the diverse genomic information present in wastewater samples. The authors detail their tokenization strategy and model architecture, demonstrating that METAGENE-1 achieves state-of-the-art performance in genomic benchmarks and applications related to public health.'}, 'zh': {'title': 'METAGENE-1:元基因组基础模型助力公共卫生监测', 'desc': '我们预训练了METAGENE-1,这是一个拥有70亿参数的自回归变换器模型,称为元基因组基础模型。该模型在一个包含超过1.5万亿碱基对的多样化元基因组DNA和RNA序列的新数据集上进行训练,这些数据来自大量人类废水样本。METAGENE-1的目标是捕捉废水中存在的基因组信息的完整分布,以帮助进行疫情监测和病原体检测。我们展示了该模型在元基因组数据集上的预训练结果,证明其在公共卫生应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.02690', 'title': 'GS-DiT: Advancing Video Generation with Pseudo 4D Gaussian Fields through Efficient Dense 3D Point Tracking', 'url': 'https://huggingface.co/papers/2501.02690', 'abstract': '4D video control is essential in video generation as it enables the use of sophisticated lens techniques, such as multi-camera shooting and dolly zoom, which are currently unsupported by existing methods. 
Training a video Diffusion Transformer (DiT) directly to control 4D content requires expensive multi-view videos. Inspired by Monocular Dynamic novel View Synthesis (MDVS) that optimizes a 4D representation and renders videos according to different 4D elements, such as camera pose and object motion editing, we bring pseudo 4D Gaussian fields to video generation. Specifically, we propose a novel framework that constructs a pseudo 4D Gaussian field with dense 3D point tracking and renders the Gaussian field for all video frames. Then we finetune a pretrained DiT to generate videos following the guidance of the rendered video, dubbed as GS-DiT. To boost the training of the GS-DiT, we also propose an efficient Dense 3D Point Tracking (D3D-PT) method for the pseudo 4D Gaussian field construction. Our D3D-PT outperforms SpatialTracker, the state-of-the-art sparse 3D point tracking method, in accuracy and accelerates the inference speed by two orders of magnitude. During the inference stage, GS-DiT can generate videos with the same dynamic content while adhering to different camera parameters, addressing a significant limitation of current video generation models. GS-DiT demonstrates strong generalization capabilities and extends the 4D controllability of Gaussian splatting to video generation beyond just camera poses. It supports advanced cinematic effects through the manipulation of the Gaussian field and camera intrinsics, making it a powerful tool for creative video production. Demos are available at https://wkbian.github.io/Projects/GS-DiT/.', 'score': 11, 'issue_id': 1530, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'b4c147a2637166a8', 'authors': ['Weikang Bian', 'Zhaoyang Huang', 'Xiaoyu Shi', 'Yijin Li', 'Fu-Yun Wang', 'Hongsheng Li'], 'affiliations': ['Avolution AI', 'Centre for Perceptual and Interactive Intelligence', 'Multimedia Laboratory, The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.02690.jpg', 'data': {'categories': ['#video', '#games', '#diffusion', '#3d'], 'emoji': '🎥', 'ru': {'title': 'Революция в генерации видео: 4D-контроль с помощью гауссовых полей', 'desc': 'Эта статья представляет инновационный подход к генерации видео с 4D-контролем, используя псевдо-4D гауссовы поля и модель Diffusion Transformer (DiT). Авторы предлагают метод Dense 3D Point Tracking (D3D-PT) для эффективного построения гауссовых полей, превосходящий существующие решения по точности и скорости. Разработанная система GS-DiT позволяет генерировать видео с одинаковым динамическим содержанием, но с разными параметрами камеры, что открывает новые возможности для создания кинематографических эффектов. Метод демонстрирует сильные обобщающие способности и расширяет возможности 4D-контроля в генерации видео.'}, 'en': {'title': 'Revolutionizing Video Generation with 4D Control', 'desc': 'This paper introduces a new method for generating videos that can be controlled in four dimensions (4D), which includes both camera movement and object motion. The authors propose a framework called GS-DiT that utilizes pseudo 4D Gaussian fields to enhance video generation, allowing for advanced cinematic effects. They also present a Dense 3D Point Tracking (D3D-PT) technique that improves the accuracy and speed of tracking 3D points compared to existing methods. 
Overall, GS-DiT enables the creation of dynamic videos with flexible camera parameters, significantly advancing the capabilities of video generation models.'}, 'zh': {'title': '伪4D高斯场:视频生成的新突破', 'desc': '本论文提出了一种新颖的框架,利用伪4D高斯场进行视频生成,以支持复杂的镜头技术。我们通过密集的3D点跟踪构建伪4D高斯场,并为所有视频帧渲染该高斯场。为了提升GS-DiT的训练效果,我们还提出了一种高效的密集3D点跟踪方法,显著提高了准确性和推理速度。GS-DiT能够在不同的相机参数下生成具有相同动态内容的视频,扩展了视频生成的4D可控性,成为创意视频制作的强大工具。'}}}, {'id': 'https://huggingface.co/papers/2501.03059', 'title': 'Through-The-Mask: Mask-based Motion Trajectories for Image-to-Video Generation', 'url': 'https://huggingface.co/papers/2501.03059', 'abstract': "We consider the task of Image-to-Video (I2V) generation, which involves transforming static images into realistic video sequences based on a textual description. While recent advancements produce photorealistic outputs, they frequently struggle to create videos with accurate and consistent object motion, especially in multi-object scenarios. To address these limitations, we propose a two-stage compositional framework that decomposes I2V generation into: (i) An explicit intermediate representation generation stage, followed by (ii) A video generation stage that is conditioned on this representation. Our key innovation is the introduction of a mask-based motion trajectory as an intermediate representation, that captures both semantic object information and motion, enabling an expressive but compact representation of motion and semantics. To incorporate the learned representation in the second stage, we utilize object-level attention objectives. Specifically, we consider a spatial, per-object, masked-cross attention objective, integrating object-specific prompts into corresponding latent space regions and a masked spatio-temporal self-attention objective, ensuring frame-to-frame consistency for each object. We evaluate our method on challenging benchmarks with multi-object and high-motion scenarios and empirically demonstrate that the proposed method achieves state-of-the-art results in temporal coherence, motion realism, and text-prompt faithfulness. Additionally, we introduce \\benchmark, a new challenging benchmark for single-object and multi-object I2V generation, and demonstrate our method's superiority on this benchmark. Project page is available at https://guyyariv.github.io/TTM/.", 'score': 10, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '4f24667b663efb7d', 'authors': ['Guy Yariv', 'Yuval Kirstain', 'Amit Zohar', 'Shelly Sheynin', 'Yaniv Taigman', 'Yossi Adi', 'Sagie Benaim', 'Adam Polyak'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'The Hebrew University of Jerusalem'], 'pdf_title_img': 'assets/pdf/title_img/2501.03059.jpg', 'data': {'categories': ['#video', '#multimodal', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'Генерация реалистичных видео из статичных изображений с помощью масок траекторий движения', 'desc': 'Статья представляет новый подход к генерации видео из изображений (I2V) на основе текстового описания. Авторы предлагают двухэтапную композиционную модель, которая сначала генерирует промежуточное представление в виде маски траектории движения объектов. Затем это представление используется для генерации видео с применением объектно-ориентированных целевых функций внимания. 
Эксперименты показывают, что предложенный метод достигает лучших результатов по временной согласованности, реалистичности движения и соответствию текстовому описанию.'}, 'en': {'title': 'Transforming Images into Realistic Videos with Motion Precision', 'desc': 'This paper addresses the challenge of generating videos from static images using textual descriptions, known as Image-to-Video (I2V) generation. The authors propose a two-stage framework that first creates an intermediate representation to capture object semantics and motion, followed by a video generation stage that utilizes this representation. A key innovation is the use of a mask-based motion trajectory, which helps maintain accurate object motion and consistency across frames. The method is evaluated against challenging benchmarks and shows superior performance in terms of motion realism and coherence, while also introducing a new benchmark for I2V generation.'}, 'zh': {'title': '图像到视频生成的新突破', 'desc': '本文探讨了图像到视频(I2V)生成的任务,即根据文本描述将静态图像转换为逼真的视频序列。尽管近期的进展能够生成照片级真实感的输出,但在多物体场景中,视频的物体运动准确性和一致性仍然存在挑战。为了解决这些问题,我们提出了一种两阶段的组合框架,首先生成明确的中间表示,然后基于该表示生成视频。我们的创新在于引入了一种基于掩码的运动轨迹作为中间表示,能够捕捉语义物体信息和运动,从而实现运动和语义的紧凑而富有表现力的表示。'}}}, {'id': 'https://huggingface.co/papers/2501.03006', 'title': 'TransPixar: Advancing Text-to-Video Generation with Transparency', 'url': 'https://huggingface.co/papers/2501.03006', 'abstract': 'Text-to-video generative models have made significant strides, enabling diverse applications in entertainment, advertising, and education. However, generating RGBA video, which includes alpha channels for transparency, remains a challenge due to limited datasets and the difficulty of adapting existing models. Alpha channels are crucial for visual effects (VFX), allowing transparent elements like smoke and reflections to blend seamlessly into scenes. We introduce TransPixar, a method to extend pretrained video models for RGBA generation while retaining the original RGB capabilities. TransPixar leverages a diffusion transformer (DiT) architecture, incorporating alpha-specific tokens and using LoRA-based fine-tuning to jointly generate RGB and alpha channels with high consistency. By optimizing attention mechanisms, TransPixar preserves the strengths of the original RGB model and achieves strong alignment between RGB and alpha channels despite limited training data. Our approach effectively generates diverse and consistent RGBA videos, advancing the possibilities for VFX and interactive content creation.', 'score': 8, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'e85e5fa9a03d5d04', 'authors': ['Luozhou Wang', 'Yijun Li', 'Zhifei Chen', 'Jui-Hsien Wang', 'Zhifei Zhang', 'He Zhang', 'Zhe Lin', 'Yingcong Chen'], 'affiliations': ['Adobe Research', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.03006.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training', '#diffusion', '#video'], 'emoji': '🎬', 'ru': {'title': 'TransPixar: Прорыв в генерации RGBA-видео для визуальных эффектов', 'desc': 'TransPixar - это новый метод генерации RGBA-видео, расширяющий возможности предобученных видеомоделей. Он использует архитектуру диффузионного трансформера (DiT) и токены, специфичные для альфа-канала, для совместной генерации RGB и альфа-каналов с высокой согласованностью. Метод применяет тонкую настройку на основе LoRA и оптимизирует механизмы внимания для сохранения сильных сторон исходной RGB-модели. 
TransPixar эффективно генерирует разнообразные и согласованные RGBA-видео, открывая новые возможности для создания визуальных эффектов и интерактивного контента.'}, 'en': {'title': 'TransPixar: Bridging RGB and Alpha for Enhanced Video Generation', 'desc': 'This paper presents TransPixar, a novel method for generating RGBA videos, which include transparency information crucial for visual effects. The challenge lies in the limited datasets and the need to adapt existing models to handle alpha channels effectively. TransPixar utilizes a diffusion transformer architecture and incorporates alpha-specific tokens, allowing it to generate both RGB and alpha channels simultaneously. By optimizing attention mechanisms and employing LoRA-based fine-tuning, TransPixar achieves high consistency between RGB and alpha outputs, enhancing the quality of video generation for applications in VFX and interactive media.'}, 'zh': {'title': 'TransPixar:生成高质量RGBA视频的新方法', 'desc': '本文介绍了一种名为TransPixar的方法,旨在生成包含透明通道的RGBA视频。传统的视频生成模型在处理透明效果时面临挑战,TransPixar通过扩展预训练模型来解决这一问题。该方法利用扩散变换器架构,结合特定的透明通道标记,并通过LoRA微调实现RGB和透明通道的高一致性生成。最终,TransPixar在有限的数据集上优化了注意力机制,成功生成多样且一致的RGBA视频,推动了视觉特效和互动内容创作的可能性。'}}}, {'id': 'https://huggingface.co/papers/2501.01790', 'title': 'Ingredients: Blending Custom Photos with Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.01790', 'abstract': 'This paper presents a powerful framework to customize video creations by incorporating multiple specific identity (ID) photos, with video diffusion Transformers, referred to as Ingredients. Generally, our method consists of three primary modules: (i) a facial extractor that captures versatile and precise facial features for each human ID from both global and local perspectives; (ii) a multi-scale projector that maps face embeddings into the contextual space of image query in video diffusion transformers; (iii) an ID router that dynamically combines and allocates multiple ID embedding to the corresponding space-time regions. Leveraging a meticulously curated text-video dataset and a multi-stage training protocol, Ingredients demonstrates superior performance in turning custom photos into dynamic and personalized video content. Qualitative evaluations highlight the advantages of proposed method, positioning it as a significant advancement toward more effective generative video control tools in Transformer-based architecture, compared to existing methods. The data, code, and model weights are publicly available at: https://github.com/feizc/Ingredients.', 'score': 6, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'dd1ccebdd2fcf276', 'authors': ['Zhengcong Fei', 'Debang Li', 'Di Qiu', 'Changqian Yu', 'Mingyuan Fan'], 'affiliations': ['Kunlun Inc. Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01790.jpg', 'data': {'categories': ['#open_source', '#training', '#architecture', '#video', '#dataset', '#diffusion', '#multimodal'], 'emoji': '🎬', 'ru': {'title': 'Персонализированное видео из фотографий: новый уровень контроля в генеративных моделях', 'desc': 'Статья представляет новый метод под названием Ingredients для создания персонализированных видео с использованием нескольких фотографий конкретных людей. Метод состоит из трех основных модулей: экстрактора лицевых признаков, многомасштабного проектора и маршрутизатора идентификаторов. 
Ingredients использует тщательно подобранный набор данных текст-видео и многоэтапный протокол обучения для достижения превосходных результатов. Качественная оценка показывает преимущества предложенного метода по сравнению с существующими подходами в области генеративного контроля видео на основе архитектуры Transformer.'}, 'en': {'title': 'Transforming Photos into Personalized Videos with Ingredients', 'desc': 'This paper introduces a novel framework called Ingredients for creating personalized videos using multiple identity photos. It employs a facial extractor to accurately capture facial features, a multi-scale projector to integrate these features into video diffusion transformers, and an ID router to manage the allocation of identity embeddings across different time and space regions in the video. The framework is trained on a carefully selected text-video dataset, enhancing its ability to generate dynamic video content from custom images. The results show that Ingredients outperforms existing methods, marking a significant step forward in generative video control using Transformer architectures.'}, 'zh': {'title': '个性化视频创作的新突破', 'desc': '本文提出了一种强大的框架,通过结合多个特定身份照片,定制视频创作,称为Ingredients。该方法主要由三个模块组成:面部提取器、多个尺度投影器和身份路由器,分别用于提取面部特征、映射面部嵌入和动态分配身份嵌入。通过精心策划的文本-视频数据集和多阶段训练协议,Ingredients在将自定义照片转化为动态个性化视频内容方面表现出色。定性评估显示,该方法在基于Transformer的架构中,相较于现有方法,显著提升了生成视频控制工具的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.02576', 'title': 'DepthMaster: Taming Diffusion Models for Monocular Depth Estimation', 'url': 'https://huggingface.co/papers/2501.02576', 'abstract': "Monocular depth estimation within the diffusion-denoising paradigm demonstrates impressive generalization ability but suffers from low inference speed. Recent methods adopt a single-step deterministic paradigm to improve inference efficiency while maintaining comparable performance. However, they overlook the gap between generative and discriminative features, leading to suboptimal results. In this work, we propose DepthMaster, a single-step diffusion model designed to adapt generative features for the discriminative depth estimation task. First, to mitigate overfitting to texture details introduced by generative features, we propose a Feature Alignment module, which incorporates high-quality semantic features to enhance the denoising network's representation capability. Second, to address the lack of fine-grained details in the single-step deterministic framework, we propose a Fourier Enhancement module to adaptively balance low-frequency structure and high-frequency details. We adopt a two-stage training strategy to fully leverage the potential of the two modules. In the first stage, we focus on learning the global scene structure with the Feature Alignment module, while in the second stage, we exploit the Fourier Enhancement module to improve the visual quality. Through these efforts, our model achieves state-of-the-art performance in terms of generalization and detail preservation, outperforming other diffusion-based methods across various datasets. 
Our project page can be found at https://indu1ge.github.io/DepthMaster_page.", 'score': 5, 'issue_id': 1536, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'a8429b95ef4eb7b7', 'authors': ['Ziyang Song', 'Zerong Wang', 'Bo Li', 'Hao Zhang', 'Ruijie Zhu', 'Li Liu', 'Peng-Tao Jiang', 'Tianzhu Zhang'], 'affiliations': ['School of Information Science and Technology, University of Science and Technology of China (USTC), Hefei 230026, P.R.China', 'vivo Mobile Communication Co., Ltd., Hangzhou 310030, P.R.China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02576.jpg', 'data': {'categories': ['#optimization', '#training', '#diffusion', '#cv', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'DepthMaster: Однопроходная диффузионная модель для точной оценки глубины с улучшенной генерализацией', 'desc': 'DepthMaster - это однопроходная диффузионная модель для монокулярной оценки глубины. Она использует модуль выравнивания признаков для улучшения представления семантических особенностей и модуль улучшения Фурье для балансировки низкочастотной структуры и высокочастотных деталей. Модель обучается в два этапа: сначала фокусируется на глобальной структуре сцены, затем улучшает визуальное качество. DepthMaster превосходит другие диффузионные методы по обобщающей способности и сохранению деталей на различных наборах данных.'}, 'en': {'title': 'DepthMaster: Bridging Generative and Discriminative Depth Estimation', 'desc': 'This paper introduces DepthMaster, a single-step diffusion model aimed at improving monocular depth estimation. It addresses the inefficiencies of previous methods by integrating a Feature Alignment module to enhance the representation of semantic features and reduce overfitting to textures. Additionally, a Fourier Enhancement module is proposed to balance low-frequency structures with high-frequency details, ensuring finer depth estimation. The two-stage training strategy allows the model to first learn global scene structures and then refine visual quality, resulting in state-of-the-art performance across various datasets.'}, 'zh': {'title': 'DepthMaster:提升深度估计的单步扩散模型', 'desc': '本文提出了一种名为DepthMaster的单步扩散模型,用于单目深度估计。该模型通过特征对齐模块和傅里叶增强模块,优化生成特征以适应判别性深度估计任务。特征对齐模块增强了去噪网络的表示能力,而傅里叶增强模块则平衡了低频结构和高频细节。通过两阶段训练策略,DepthMaster在泛化能力和细节保留方面达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.01830', 'title': 'Auto-RT: Automatic Jailbreak Strategy Exploration for Red-Teaming Large Language Models', 'url': 'https://huggingface.co/papers/2501.01830', 'abstract': 'Automated red-teaming has become a crucial approach for uncovering vulnerabilities in large language models (LLMs). However, most existing methods focus on isolated safety flaws, limiting their ability to adapt to dynamic defenses and uncover complex vulnerabilities efficiently. To address this challenge, we propose Auto-RT, a reinforcement learning framework that automatically explores and optimizes complex attack strategies to effectively uncover security vulnerabilities through malicious queries. Specifically, we introduce two key mechanisms to reduce exploration complexity and improve strategy optimization: 1) Early-terminated Exploration, which accelerate exploration by focusing on high-potential attack strategies; and 2) Progressive Reward Tracking algorithm with intermediate downgrade models, which dynamically refine the search trajectory toward successful vulnerability exploitation. 
Extensive experiments across diverse LLMs demonstrate that, by significantly improving exploration efficiency and automatically optimizing attack strategies, Auto-RT detects a broader range of vulnerabilities, achieving a faster detection speed and 16.63% higher success rates compared to existing methods.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '5b08b81c52ec8da8', 'authors': ['Yanjiang Liu', 'Shuhen Zhou', 'Yaojie Lu', 'Huijia Zhu', 'Weiqiang Wang', 'Hongyu Lin', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Ant Group', 'Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences, Beijing, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01830.jpg', 'data': {'categories': ['#security', '#rl', '#rlhf'], 'emoji': '🛡️', 'ru': {'title': 'Auto-RT: Умная защита больших языковых моделей', 'desc': 'Авторы представляют Auto-RT - фреймворк на основе обучения с подкреплением для автоматизированного поиска уязвимостей в больших языковых моделях (LLM). Система использует механизмы раннего прекращения исследования и прогрессивного отслеживания наград для оптимизации стратегий атак. Auto-RT превосходит существующие методы, обнаруживая более широкий спектр уязвимостей с большей скоростью и на 16.63% более высоким уровнем успеха. Этот подход позволяет эффективно выявлять сложные уязвимости в LLM через вредоносные запросы.'}, 'en': {'title': 'Auto-RT: Revolutionizing Vulnerability Detection in LLMs', 'desc': 'This paper presents Auto-RT, a reinforcement learning framework designed to enhance automated red-teaming for large language models (LLMs). Unlike traditional methods that target isolated safety flaws, Auto-RT efficiently uncovers complex vulnerabilities by optimizing attack strategies through malicious queries. It introduces two innovative mechanisms: Early-terminated Exploration to prioritize promising attack strategies, and Progressive Reward Tracking to refine the search process dynamically. Experimental results show that Auto-RT significantly improves exploration efficiency and detection success rates, outperforming existing approaches.'}, 'zh': {'title': '自动化红队:高效发现语言模型漏洞的利器', 'desc': '自动化红队技术在发现大型语言模型(LLMs)中的漏洞方面变得至关重要。现有方法大多集中于孤立的安全缺陷,限制了其适应动态防御和高效发现复杂漏洞的能力。为了解决这个问题,我们提出了Auto-RT,一个强化学习框架,能够自动探索和优化复杂的攻击策略,通过恶意查询有效发现安全漏洞。我们的实验表明,Auto-RT显著提高了探索效率和攻击策略的自动优化,检测到更广泛的漏洞,检测速度更快,成功率提高了16.63%。'}}}, {'id': 'https://huggingface.co/papers/2501.02506', 'title': 'ToolHop: A Query-Driven Benchmark for Evaluating Large Language Models in Multi-Hop Tool Use', 'url': 'https://huggingface.co/papers/2501.02506', 'abstract': 'Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of multi-hop tool use. ToolHop ensures diverse queries, meaningful interdependencies, locally executable tools, detailed feedback, and verifiable answers through a novel query-driven data construction approach that includes tool creation, document refinement, and code generation. 
We evaluate 14 LLMs across five model families (i.e., LLaMA3.1, Qwen2.5, Gemini1.5, Claude3.5, and GPT), uncovering significant challenges in handling multi-hop tool-use scenarios. The leading model, GPT-4o, achieves an accuracy of 49.04%, underscoring substantial room for improvement. Further analysis reveals variations in tool-use strategies for various families, offering actionable insights to guide the development of more effective approaches. Code and data can be found in https://huggingface.co/bytedance-research/ToolHop.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'f785173226e5f9fc', 'authors': ['Junjie Ye', 'Zhengyin Du', 'Xuesong Yao', 'Weijian Lin', 'Yufei Xu', 'Zehui Chen', 'Zaiyuan Wang', 'Sining Zhu', 'Zhiheng Xi', 'Siyu Yuan', 'Tao Gui', 'Qi Zhang', 'Xuanjing Huang', 'Jiechao Chen'], 'affiliations': ['ByteDance', 'Institute of Modern Languages and Linguistics, Fudan University', 'School of Computer Science, Fudan University', 'School of Data Science, Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02506.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#dataset', '#optimization'], 'emoji': '🛠️', 'ru': {'title': 'ToolHop: новый стандарт для оценки многоэтапного использования инструментов в LLM', 'desc': 'Статья представляет новый набор данных ToolHop для оценки многоэтапного использования инструментов большими языковыми моделями (LLM). ToolHop содержит 995 пользовательских запросов и 3912 связанных инструментов, обеспечивая разнообразие запросов, взаимозависимости и возможность локального выполнения. Авторы оценили 14 LLM из пяти семейств моделей, выявив значительные трудности в обработке сценариев многоэтапного использования инструментов. Лучшая модель, GPT-4o, достигла точности 49.04%, что указывает на большой потенциал для улучшения.'}, 'en': {'title': 'ToolHop: Advancing Multi-Hop Tool Use Evaluation for LLMs', 'desc': 'This paper introduces ToolHop, a new dataset designed to evaluate how well large language models (LLMs) can use multiple tools in a single task. It includes 995 user queries and 3,912 tools, focusing on diverse and interdependent queries that can be executed locally. The authors tested 14 different LLMs, revealing that even the best-performing model, GPT-4o, only achieved 49.04% accuracy, indicating significant challenges in multi-hop tool use. The findings highlight different strategies employed by various model families, providing insights for future improvements in LLM capabilities.'}, 'zh': {'title': 'ToolHop:多跳工具使用的有效评估数据集', 'desc': '本文介绍了ToolHop数据集,该数据集包含995个用户查询和3912个相关工具,旨在有效评估大型语言模型(LLMs)在多跳工具使用中的理解、推理和功能调用能力。通过新颖的查询驱动数据构建方法,ToolHop确保了查询的多样性、工具的局部可执行性和可验证的答案。我们对14个不同模型(如LLaMA3.1、Qwen2.5等)进行了评估,发现它们在处理多跳工具使用场景时面临显著挑战。尽管GPT-4o模型的准确率为49.04%,但仍有很大的改进空间,分析还揭示了不同模型家族在工具使用策略上的差异,为未来的研究提供了有价值的见解。'}}}, {'id': 'https://huggingface.co/papers/2501.02423', 'title': 'Scaling Laws for Floating Point Quantization Training', 'url': 'https://huggingface.co/papers/2501.02423', 'abstract': 'Low-precision training is considered an effective strategy for reducing both training and downstream inference costs. Previous scaling laws for precision mainly focus on integer quantization, which pay less attention to the constituents in floating-point quantization and thus cannot well fit the LLM losses in this scenario. 
In contrast, while floating-point quantization training is more commonly implemented in production, the research on it has been relatively superficial. In this paper, we thoroughly explore the effects of floating-point quantization targets, exponent bits, mantissa bits, and the calculation granularity of the scaling factor in floating-point quantization training performance of LLM models. While presenting an accurate floating-point quantization unified scaling law, we also provide valuable suggestions for the community: (1) Exponent bits contribute slightly more to the model performance than mantissa bits. We provide the optimal exponent-mantissa bit ratio for different bit numbers, which is available for future reference by hardware manufacturers; (2) We discover the formation of the critical data size in low-precision LLM training. Too much training data exceeding the critical data size will inversely bring in degradation of LLM performance; (3) The optimal floating-point quantization precision is directly proportional to the computational power, but within a wide computational power range, we estimate that the best cost-performance precision lies between 4-8 bits.', 'score': 4, 'issue_id': 1537, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'be6872257cb9a129', 'authors': ['Xingwu Sun', 'Shuaipeng Li', 'Ruobing Xie', 'Weidong Han', 'Kan Wu', 'Zhen Yang', 'Yixing Li', 'An Wang', 'Shuai Li', 'Jinbao Xue', 'Yu Cheng', 'Yangyu Tao', 'Zhanhui Kang', 'Chengzhong Xu', 'Di Wang', 'Jie Jiang'], 'affiliations': ['Tencent Hunyuan', 'The Chinese University of Hong Kong', 'Tokyo Institute of Technology', 'University of Macau'], 'pdf_title_img': 'assets/pdf/title_img/2501.02423.jpg', 'data': {'categories': ['#training', '#optimization', '#inference'], 'emoji': '🧮', 'ru': {'title': 'Оптимизация точности вычислений в обучении языковых моделей', 'desc': 'Статья исследует влияние квантования с плавающей запятой на обучение больших языковых моделей (LLM). Авторы анализируют роль экспоненциальных и мантиссных битов, а также размера обучающих данных в производительности моделей. Они представляют унифицированный закон масштабирования для квантования с плавающей запятой и дают рекомендации по оптимальному соотношению битов и размеру данных. Исследование показывает, что оптимальная точность квантования находится в диапазоне 4-8 бит для широкого спектра вычислительных мощностей.'}, 'en': {'title': 'Optimizing Floating-Point Quantization for Better LLM Performance', 'desc': 'This paper investigates the impact of floating-point quantization on the training performance of large language models (LLMs). It highlights that previous research primarily focused on integer quantization, neglecting the nuances of floating-point quantization. The authors establish a unified scaling law for floating-point quantization and provide insights on the optimal ratio of exponent to mantissa bits, emphasizing that exponent bits have a greater influence on model performance. 
Additionally, they identify a critical data size threshold, beyond which performance may degrade, and suggest that the best precision for cost-performance lies between 4-8 bits, depending on computational power.'}, 'zh': {'title': '低精度训练:优化浮点量化的关键', 'desc': '低精度训练被认为是降低训练和推理成本的有效策略。以往的研究主要集中在整数量化上,而对浮点量化的研究相对较少,导致无法很好地适应大语言模型的损失情况。本文深入探讨了浮点量化训练中目标、指数位、尾数位和缩放因子的计算粒度对大语言模型性能的影响,并提出了统一的浮点量化缩放法则。研究结果表明,指数位对模型性能的贡献略高于尾数位,并发现了低精度训练中的关键数据大小。'}}}, {'id': 'https://huggingface.co/papers/2501.02832', 'title': 'Samba-asr state-of-the-art speech recognition leveraging structured state-space models', 'url': 'https://huggingface.co/papers/2501.02832', 'abstract': 'We propose Samba ASR, the first state-of-the-art Automatic Speech Recognition (ASR) model leveraging the novel Mamba architecture as both encoder and decoder, built on the foundation of state-space models (SSMs). Unlike transformer-based ASR models, which rely on self-attention mechanisms to capture dependencies, Samba ASR effectively models both local and global temporal dependencies using efficient state-space dynamics, achieving remarkable performance gains. By addressing the limitations of transformers, such as quadratic scaling with input length and difficulty in handling long-range dependencies, Samba ASR achieves superior accuracy and efficiency. Experimental results demonstrate that Samba ASR surpasses existing open-source transformer-based ASR models across various standard benchmarks, establishing it as the new state of the art in ASR. Extensive evaluations on benchmark datasets show significant improvements in Word Error Rate (WER), with competitive performance even in low-resource scenarios. Furthermore, the computational efficiency and parameter optimization of the Mamba architecture make Samba ASR a scalable and robust solution for diverse ASR tasks. Our contributions include: A new Samba ASR architecture demonstrating the superiority of SSMs over transformer-based models for speech sequence processing. A comprehensive evaluation on public benchmarks showcasing state-of-the-art performance. An analysis of computational efficiency, robustness to noise, and sequence generalization. This work highlights the viability of Mamba SSMs as a transformer-free alternative for efficient and accurate ASR. By leveraging state-space modeling advancements, Samba ASR sets a new benchmark for ASR performance and future research.', 'score': 4, 'issue_id': 1530, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'ed3c4a6192d0c5f9', 'authors': ['Syed Abdul Gaffar Shakhadri', 'Kruthika KR', 'Kartik Basavaraj Angadi'], 'affiliations': ['SandLogic Technologies Pvt Ltd'], 'pdf_title_img': 'assets/pdf/title_img/2501.02832.jpg', 'data': {'categories': ['#audio', '#architecture', '#benchmark', '#low_resource', '#open_source'], 'emoji': '🎙️', 'ru': {'title': 'Samba ASR: революция в распознавании речи с помощью моделей пространства состояний', 'desc': 'Представлена модель Samba ASR - первая современная система автоматического распознавания речи, использующая архитектуру Mamba в качестве энкодера и декодера на основе моделей пространства состояний (SSM). В отличие от трансформерных моделей, Samba ASR эффективно моделирует локальные и глобальные временные зависимости, достигая значительных улучшений производительности. Экспериментальные результаты показывают, что Samba ASR превосходит существующие модели с открытым исходным кодом на основе трансформеров по различным стандартным показателям. 
Модель демонстрирует значительное снижение показателя Word Error Rate (WER) и высокую эффективность даже при ограниченных ресурсах.'}, 'en': {'title': 'Samba ASR: Redefining Speech Recognition with State-Space Models', 'desc': 'Samba ASR is a groundbreaking Automatic Speech Recognition model that utilizes the innovative Mamba architecture, which functions as both the encoder and decoder. This model departs from traditional transformer-based approaches by employing state-space models (SSMs) to effectively capture both local and global temporal dependencies, leading to enhanced performance. By overcoming the challenges associated with transformers, such as their inefficiency with long input sequences, Samba ASR achieves superior accuracy and efficiency in recognizing speech. Extensive testing shows that Samba ASR not only outperforms existing transformer-based models but also excels in low-resource environments, making it a robust solution for various ASR applications.'}, 'zh': {'title': 'Samba ASR:超越变换器的语音识别新标杆', 'desc': '我们提出了Samba ASR,这是第一个利用新型Mamba架构作为编码器和解码器的最先进自动语音识别(ASR)模型。与基于变换器的ASR模型不同,Samba ASR通过高效的状态空间动态建模局部和全局时间依赖关系,从而实现显著的性能提升。该模型克服了变换器在处理长距离依赖和输入长度的平方扩展等方面的局限性,展现出更高的准确性和效率。实验结果表明,Samba ASR在多个标准基准测试中超越了现有的开源变换器ASR模型,确立了其在ASR领域的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2501.00912', 'title': 'AutoPresent: Designing Structured Visuals from Scratch', 'url': 'https://huggingface.co/papers/2501.00912', 'abstract': "Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation with 7k training and 585 testing examples derived from 310 slide decks across 10 domains. SlidesBench supports evaluations that are (i)reference-based to measure similarity to a target slide, and (ii)reference-free to measure the design quality of generated slides alone. We benchmark end-to-end image generation and program generation methods with a variety of models, and find that programmatic methods produce higher-quality slides in user-interactable formats. Built on the success of program generation, we create AutoPresent, an 8B Llama-based model trained on 7k pairs of instructions paired with code for slide generation, and achieve results comparable to the closed-source model GPT-4o. We further explore iterative design refinement where the model is tasked to self-refine its own output, and we found that this process improves the slide's quality. 
We hope that our work will provide a basis for future work on generating structured visuals.", 'score': 3, 'issue_id': 1539, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'ea7b88fcc0a2025b', 'authors': ['Jiaxin Ge', 'Zora Zhiruo Wang', 'Xuhui Zhou', 'Yi-Hao Peng', 'Sanjay Subramanian', 'Qinyue Tan', 'Maarten Sap', 'Alane Suhr', 'Daniel Fried', 'Graham Neubig', 'Trevor Darrell'], 'affiliations': ['Carnegie Mellon University', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.00912.jpg', 'data': {'categories': ['#dataset', '#story_generation', '#training', '#benchmark', '#multimodal'], 'emoji': '🎯', 'ru': {'title': 'Автоматизация создания презентаций: от текста к структурированным визуальным материалам', 'desc': 'Эта статья представляет новый бенчмарк SlidesBench для автоматической генерации слайдов презентаций на основе текстовых инструкций. Авторы сравнивают методы генерации изображений и программного кода, обнаружив преимущество последнего подхода. Они создают модель AutoPresent на базе Llama для генерации кода слайдов, достигающую результатов, сопоставимых с GPT-4. Исследователи также изучают итеративное улучшение дизайна слайдов с помощью самооптимизации модели.'}, 'en': {'title': 'Automating Slide Generation with Advanced Models', 'desc': 'This paper addresses the challenge of creating automated slide presentations from natural language instructions. It introduces the SlidesBench benchmark, which includes a large dataset for training and testing slide generation models. The authors evaluate various methods, finding that programmatic approaches yield higher-quality slides. They also present AutoPresent, a model that competes with advanced models like GPT-4o, and demonstrate that iterative design refinement enhances the quality of generated slides.'}, 'zh': {'title': '自动生成高质量演示幻灯片的未来', 'desc': '本研究旨在自动生成演示幻灯片,解决内容创作和视觉规划的挑战。我们首次引入SlidesBench基准,包含7000个训练样本和585个测试样本,涵盖10个领域的310个幻灯片集。通过对比不同模型的图像生成和程序生成方法,我们发现程序生成方法在用户交互格式中生成的幻灯片质量更高。基于程序生成的成功,我们开发了AutoPresent模型,并通过自我优化过程进一步提升幻灯片的质量。'}}}, {'id': 'https://huggingface.co/papers/2501.03225', 'title': 'Automated Generation of Challenging Multiple-Choice Questions for Vision Language Model Evaluation', 'url': 'https://huggingface.co/papers/2501.03225', 'abstract': 'The rapid development of vision language models (VLMs) demands rigorous and reliable evaluation. However, current visual question answering (VQA) benchmarks often depend on open-ended questions, making accurate evaluation difficult due to the variability in natural language responses. To address this, we introduce AutoConverter, an agentic framework that automatically converts these open-ended questions into multiple-choice format, enabling objective evaluation while reducing the costly question creation process. Our experiments demonstrate that AutoConverter can generate correct and challenging multiple-choice questions, with VLMs demonstrating consistently similar or lower accuracy on these questions compared to human-created ones. Using AutoConverter, we construct VMCBench, a benchmark created by transforming 20 existing VQA datasets into a unified multiple-choice format, totaling 9,018 questions. 
We comprehensively evaluate 33 state-of-the-art VLMs on VMCBench, setting a new standard for scalable, consistent, and reproducible VLM evaluation.', 'score': 1, 'issue_id': 1542, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'aa212f5e596ed0e6', 'authors': ['Yuhui Zhang', 'Yuchang Su', 'Yiming Liu', 'Xiaohan Wang', 'James Burgess', 'Elaine Sui', 'Chenyu Wang', 'Josiah Aklilu', 'Alejandro Lozano', 'Anjiang Wei', 'Ludwig Schmidt', 'Serena Yeung-Levy'], 'affiliations': ['MIT', 'Stanford University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03225.jpg', 'data': {'categories': ['#interpretability', '#agents', '#benchmark', '#cv', '#survey', '#games', '#optimization'], 'emoji': '🔄', 'ru': {'title': 'Автоматизация оценки моделей машинного зрения и языка', 'desc': 'Исследователи представили AutoConverter - агентную систему для автоматического преобразования открытых вопросов в вопросы с множественным выбором для оценки моделей машинного зрения и языка (VLM). Эта система позволяет объективно оценивать VLM, избегая сложностей, связанных с вариативностью естественно-языковых ответов. На основе AutoConverter был создан бенчмарк VMCBench, включающий 9018 вопросов из 20 существующих наборов данных для визуальных вопросов и ответов (VQA). VMCBench был использован для всесторонней оценки 33 современных VLM, устанавливая новый стандарт масштабируемой и воспроизводимой оценки таких моделей.'}, 'en': {'title': 'Transforming VQA for Objective Evaluation with AutoConverter', 'desc': 'This paper presents AutoConverter, a framework designed to improve the evaluation of vision language models (VLMs) by converting open-ended visual question answering (VQA) questions into a multiple-choice format. This transformation allows for more objective assessments of VLM performance, addressing the challenges posed by the variability of natural language responses. The authors demonstrate that VLMs perform similarly or worse on these newly generated questions compared to those created by humans, indicating the rigor of the new benchmark. Additionally, they introduce VMCBench, a comprehensive dataset that standardizes 20 existing VQA datasets into a unified multiple-choice format, facilitating scalable and reproducible evaluations of VLMs.'}, 'zh': {'title': '自动化评估视觉语言模型的新标准', 'desc': '随着视觉语言模型(VLMs)的快速发展,评估这些模型的准确性变得尤为重要。现有的视觉问答(VQA)基准往往依赖开放式问题,这使得评估变得困难,因为自然语言回答的多样性很大。为了解决这个问题,我们提出了AutoConverter,这是一种自动将开放式问题转换为多项选择格式的框架,从而实现客观评估并减少问题创建的成本。通过使用AutoConverter,我们构建了VMCBench,这是一个将20个现有VQA数据集转化为统一多项选择格式的基准,包含9,018个问题,全面评估了33个最先进的VLMs,设定了可扩展、一致和可重复的VLM评估新标准。'}}}, {'id': 'https://huggingface.co/papers/2412.18525', 'title': 'Explanatory Instructions: Towards Unified Vision Tasks Understanding and Zero-shot Generalization', 'url': 'https://huggingface.co/papers/2412.18525', 'abstract': "Computer Vision (CV) has yet to fully achieve the zero-shot task generalization observed in Natural Language Processing (NLP), despite following many of the milestones established in NLP, such as large transformer models, extensive pre-training, and the auto-regression paradigm, among others. In this paper, we explore the idea that CV adopts discrete and terminological task definitions (\\eg, ``image segmentation''), which may be a key barrier to zero-shot task generalization. Our hypothesis is that without truly understanding previously-seen tasks--due to these terminological definitions--deep models struggle to generalize to novel tasks. 
To verify this, we introduce Explanatory Instructions, which provide an intuitive way to define CV task objectives through detailed linguistic transformations from input images to outputs. We create a large-scale dataset comprising 12 million ``image input to explanatory instruction to output'' triplets, and train an auto-regressive-based vision-language model (AR-based VLM) that takes both images and explanatory instructions as input. By learning to follow these instructions, the AR-based VLM achieves instruction-level zero-shot capabilities for previously-seen tasks and demonstrates strong zero-shot generalization for unseen CV tasks. Code and dataset will be openly available on our GitHub repository.", 'score': 48, 'issue_id': 1406, 'pub_date': '2024-12-24', 'pub_date_card': {'ru': '24 декабря', 'en': 'December 24', 'zh': '12月24日'}, 'hash': '23f11aceae00534d', 'authors': ['Yang Shen', 'Xiu-Shen Wei', 'Yifan Sun', 'Yuxin Song', 'Tao Yuan', 'Jian Jin', 'Heyang Xu', 'Yazhou Yao', 'Errui Ding'], 'affiliations': ['Baidu', 'Nanjing University of Science and Technology', 'Southeast University'], 'pdf_title_img': 'assets/pdf/title_img/2412.18525.jpg', 'data': {'categories': ['#dataset', '#open_source', '#cv', '#multimodal', '#transfer_learning'], 'emoji': '🔬', 'ru': {'title': 'Лингвистические инструкции - ключ к обобщению в компьютерном зрении', 'desc': "В статье исследуется проблема недостаточной способности моделей компьютерного зрения к обобщению на новые задачи без предварительного обучения. Авторы предлагают использовать подробные лингвистические инструкции для определения задач вместо дискретных терминологических определений. Они создали большой датасет из 12 миллионов примеров 'изображение-инструкция-результат' и обучили авторегрессионную мультимодальную модель следовать этим инструкциям. Эксперименты показали, что такой подход позволяет модели лучше обобщаться на новые задачи компьютерного зрения без дополнительного обучения."}, 'en': {'title': 'Unlocking Zero-Shot Generalization in Computer Vision with Explanatory Instructions', 'desc': "This paper addresses the challenge of zero-shot task generalization in Computer Vision (CV), which has not reached the levels seen in Natural Language Processing (NLP). The authors argue that the use of specific terminological definitions for tasks in CV, like 'image segmentation', limits the models' ability to generalize to new tasks. To overcome this, they propose 'Explanatory Instructions' that transform image inputs into detailed linguistic outputs, helping models understand tasks better. They introduce a large dataset of 12 million triplets and train an auto-regressive vision-language model that successfully demonstrates zero-shot capabilities for both seen and unseen tasks."}, 'zh': {'title': '突破计算机视觉的零样本任务泛化', 'desc': '本文探讨了计算机视觉(CV)在零样本任务泛化方面的挑战,尤其是与自然语言处理(NLP)的对比。我们认为,CV使用的术语性任务定义(如“图像分割”)可能是阻碍零样本任务泛化的关键因素。为了解决这个问题,我们引入了“解释性指令”,通过详细的语言转换来直观地定义CV任务目标。我们创建了一个包含1200万对“图像输入、解释性指令和输出”的大规模数据集,并训练了一个基于自回归的视觉语言模型,实现了对已见任务的指令级零样本能力,并在未见的CV任务上展示了强大的零样本泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2412.20070', 'title': 'On the Compositional Generalization of Multimodal LLMs for Medical Imaging', 'url': 'https://huggingface.co/papers/2412.20070', 'abstract': 'Multimodal large language models (MLLMs) hold significant potential in the medical field, but their capabilities are often limited by insufficient data in certain medical domains, highlighting the need for understanding what kinds of images can be used by MLLMs for generalization. 
Current research suggests that multi-task training outperforms single-task as different tasks can benefit each other, but they often overlook the internal relationships within these tasks, providing limited guidance on selecting datasets to enhance specific tasks. To analyze this phenomenon, we attempted to employ compositional generalization (CG)-the ability of models to understand novel combinations by recombining learned elements-as a guiding framework. Since medical images can be precisely defined by Modality, Anatomical area, and Task, naturally providing an environment for exploring CG. Therefore, we assembled 106 medical datasets to create Med-MAT for comprehensive experiments. The experiments confirmed that MLLMs can use CG to understand unseen medical images and identified CG as one of the main drivers of the generalization observed in multi-task training. Additionally, further studies demonstrated that CG effectively supports datasets with limited data and delivers consistent performance across different backbones, highlighting its versatility and broad applicability. Med-MAT is publicly available at https://github.com/FreedomIntelligence/Med-MAT.', 'score': 36, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': '34f9c6ec4611d6ec', 'authors': ['Zhenyang Cai', 'Junying Chen', 'Rongsheng Wang', 'Weihong Wang', 'Yonglin Deng', 'Dingjie Song', 'Yize Chen', 'Zixu Zhang', 'Benyou Wang'], 'affiliations': ['The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2412.20070.jpg', 'data': {'categories': ['#dataset', '#healthcare', '#open_source', '#multimodal', '#transfer_learning'], 'emoji': '🩺', 'ru': {'title': 'Композиционная генерализация - ключ к пониманию медицинских изображений для MLLM', 'desc': 'Статья исследует возможности мультимодальных больших языковых моделей (MLLM) в медицинской сфере, фокусируясь на композиционной генерализации (CG). Авторы создали набор данных Med-MAT из 106 медицинских датасетов для изучения способности моделей понимать новые комбинации изображений. Эксперименты показали, что MLLM могут использовать CG для интерпретации ранее невиданных медицинских изображений. Исследование также выявило эффективность CG для датасетов с ограниченными данными и стабильность результатов на разных архитектурах моделей.'}, 'en': {'title': 'Unlocking Medical Insights with Compositional Generalization', 'desc': "This paper explores the use of multimodal large language models (MLLMs) in the medical field, focusing on how they can generalize from limited data. It highlights the advantages of multi-task training over single-task training, emphasizing the importance of understanding the relationships between different tasks. The authors introduce compositional generalization (CG) as a framework to enhance the model's ability to interpret new combinations of medical images. 
They created a dataset called Med-MAT, which consists of 106 medical datasets, and found that CG significantly improves the performance of MLLMs, especially in scenarios with scarce data."}, 'zh': {'title': '组合泛化助力医学图像理解', 'desc': '多模态大型语言模型(MLLMs)在医学领域具有重要潜力,但在某些医学领域的数据不足限制了其能力。当前研究表明,多任务训练优于单任务训练,因为不同任务可以相互促进,但往往忽视了这些任务之间的内部关系。我们采用组合泛化(CG)作为指导框架,分析模型如何理解新组合的能力,并组建了106个医学数据集以创建Med-MAT进行全面实验。实验结果确认,MLLMs能够利用CG理解未见过的医学图像,并且CG是多任务训练中观察到的泛化的主要驱动因素之一。'}}}, {'id': 'https://huggingface.co/papers/2412.20422', 'title': 'Bringing Objects to Life: 4D generation from 3D objects', 'url': 'https://huggingface.co/papers/2412.20422', 'abstract': 'Recent advancements in generative modeling now enable the creation of 4D content (moving 3D objects) controlled with text prompts. 4D generation has large potential in applications like virtual worlds, media, and gaming, but existing methods provide limited control over the appearance and geometry of generated content. In this work, we introduce a method for animating user-provided 3D objects by conditioning on textual prompts to guide 4D generation, enabling custom animations while maintaining the identity of the original object. We first convert a 3D mesh into a ``static" 4D Neural Radiance Field (NeRF) that preserves the visual attributes of the input object. Then, we animate the object using an Image-to-Video diffusion model driven by text. To improve motion realism, we introduce an incremental viewpoint selection protocol for sampling perspectives to promote lifelike movement and a masked Score Distillation Sampling (SDS) loss, which leverages attention maps to focus optimization on relevant regions. We evaluate our model in terms of temporal coherence, prompt adherence, and visual fidelity and find that our method outperforms baselines that are based on other approaches, achieving up to threefold improvements in identity preservation measured using LPIPS scores, and effectively balancing visual quality with dynamic content.', 'score': 29, 'issue_id': 1408, 'pub_date': '2024-12-29', 'pub_date_card': {'ru': '29 декабря', 'en': 'December 29', 'zh': '12月29日'}, 'hash': 'de742e56a5ec379f', 'authors': ['Ohad Rahamim', 'Ori Malca', 'Dvir Samuel', 'Gal Chechik'], 'affiliations': ['Bar-Ilan University', 'NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2412.20422.jpg', 'data': {'categories': ['#optimization', '#multimodal', '#games', '#diffusion', '#video', '#3d'], 'emoji': '🎭', 'ru': {'title': 'Оживление 3D-объектов с помощью текста: новый рубеж в генеративном моделировании', 'desc': 'Статья представляет новый метод анимации 3D-объектов с помощью текстовых подсказок. Авторы используют генеративную модель для создания 4D-контента (движущихся 3D-объектов), сохраняя при этом исходный вид объекта. Метод включает преобразование 3D-меша в статическое 4D нейронное радиальное поле (NeRF) и последующую анимацию с помощью диффузионной модели Image-to-Video. Для улучшения реалистичности движения введены протокол выбора ракурсов и маскированная функция потерь Score Distillation Sampling.'}, 'en': {'title': 'Animating 3D Objects with Text Prompts for Realistic 4D Generation', 'desc': "This paper presents a novel approach to generating 4D content by animating 3D objects based on text prompts. The method involves converting a 3D mesh into a static 4D Neural Radiance Field (NeRF) to retain the object's visual characteristics. It then utilizes an Image-to-Video diffusion model to create animations while ensuring the original object's identity is preserved. 
The authors enhance motion realism through a viewpoint selection protocol and a masked Score Distillation Sampling loss, leading to significant improvements in visual quality and dynamic content generation."}, 'zh': {'title': '文本驱动的4D动画生成新方法', 'desc': '本研究提出了一种新方法,可以通过文本提示来控制4D内容的生成,特别是动画用户提供的3D对象。我们首先将3D网格转换为静态的4D神经辐射场(NeRF),以保留输入对象的视觉特征。然后,利用图像到视频的扩散模型进行动画制作,确保生成的动画与文本提示相符。通过引入增量视角选择协议和掩码评分蒸馏损失,我们提高了运动的真实感,并在多个评估指标上超越了现有方法。'}}}, {'id': 'https://huggingface.co/papers/2412.20993', 'title': 'Efficiently Serving LLM Reasoning Programs with Certaindex', 'url': 'https://huggingface.co/papers/2412.20993', 'abstract': 'The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems fail to adapt to the scaling behaviors of these algorithms or the varying difficulty of queries, leading to inefficient resource use and unmet latency targets. We present Dynasor, a system that optimizes inference-time compute for LLM reasoning queries. Unlike traditional engines, Dynasor tracks and schedules requests within reasoning queries and uses Certaindex, a proxy that measures statistical reasoning progress based on model certainty, to guide compute allocation dynamically. Dynasor co-adapts scheduling with reasoning progress: it allocates more compute to hard queries, reduces compute for simpler ones, and terminates unpromising queries early, balancing accuracy, latency, and cost. On diverse datasets and algorithms, Dynasor reduces compute by up to 50% in batch processing and sustaining 3.3x higher query rates or 4.7x tighter latency SLOs in online serving.', 'score': 24, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '7fe76ed90463d977', 'authors': ['Yichao Fu', 'Junda Chen', 'Siqi Zhu', 'Zheyu Fu', 'Zhongdongming Dai', 'Aurick Qiao', 'Hao Zhang'], 'affiliations': ['Snowflake', 'Tsinghua University', 'UC San Diego'], 'pdf_title_img': 'assets/pdf/title_img/2412.20993.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Dynasor: умное распределение ресурсов для эффективных LLM-рассуждений', 'desc': 'Статья представляет систему Dynasor, оптимизирующую вычисления для задач рассуждения с использованием больших языковых моделей (LLM). Dynasor отслеживает и планирует запросы, используя прокси Certaindex для измерения прогресса рассуждений на основе уверенности модели. Система динамически распределяет вычислительные ресурсы, уделяя больше внимания сложным запросам и меньше простым, а также прекращая бесперспективные запросы. Dynasor показывает значительное снижение вычислительных затрат и улучшение производительности на различных наборах данных и алгоритмах.'}, 'en': {'title': 'Dynasor: Smart Compute Allocation for Efficient LLM Reasoning', 'desc': "This paper introduces Dynasor, a system designed to optimize the compute resources used during inference for large language models (LLMs) when handling reasoning queries. It addresses the inefficiencies of existing serving systems that do not adapt to the complexity of different queries or the scaling needs of inference-time reasoning algorithms. 
Dynasor employs a dynamic scheduling approach that allocates compute resources based on the difficulty of the query, using a proxy called Certaindex to measure the model's certainty in its reasoning. As a result, Dynasor can significantly reduce compute usage while improving query processing rates and meeting latency targets more effectively."}, 'zh': {'title': 'Dynasor:优化推理查询的计算效率', 'desc': '这篇论文介绍了Dynasor系统,它优化了大型语言模型(LLM)在推理查询时的计算效率。Dynasor通过跟踪和调度推理查询中的请求,动态分配计算资源,以应对不同难度的查询。该系统使用Certaindex代理,根据模型的确定性来衡量推理进展,从而指导计算分配。通过在多种数据集和算法上测试,Dynasor在批处理时减少了多达50%的计算需求,同时在在线服务中实现了3.3倍更高的查询速率或4.7倍更严格的延迟服务水平目标。'}}}, {'id': 'https://huggingface.co/papers/2412.21037', 'title': 'TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization', 'url': 'https://huggingface.co/papers/2412.21037', 'abstract': 'We introduce TangoFlux, an efficient Text-to-Audio (TTA) generative model with 515M parameters, capable of generating up to 30 seconds of 44.1kHz audio in just 3.7 seconds on a single A40 GPU. A key challenge in aligning TTA models lies in the difficulty of creating preference pairs, as TTA lacks structured mechanisms like verifiable rewards or gold-standard answers available for Large Language Models (LLMs). To address this, we propose CLAP-Ranked Preference Optimization (CRPO), a novel framework that iteratively generates and optimizes preference data to enhance TTA alignment. We demonstrate that the audio preference dataset generated using CRPO outperforms existing alternatives. With this framework, TangoFlux achieves state-of-the-art performance across both objective and subjective benchmarks. We open source all code and models to support further research in TTA generation.', 'score': 19, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'bb669623871df661', 'authors': ['Chia-Yu Hung', 'Navonil Majumder', 'Zhifeng Kong', 'Ambuj Mehrish', 'Rafael Valle', 'Bryan Catanzaro', 'Soujanya Poria'], 'affiliations': ['NVIDIA', 'Singapore University of Technology and Design (SUTD)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21037.jpg', 'data': {'categories': ['#dataset', '#audio', '#open_source', '#benchmark', '#alignment', '#rlhf', '#small_models'], 'emoji': '🎵', 'ru': {'title': 'TangoFlux: Революция в генерации аудио из текста', 'desc': 'TangoFlux - это эффективная генеративная модель для преобразования текста в аудио (Text-to-Audio, TTA) с 515 миллионами параметров. Модель способна генерировать до 30 секунд аудио с частотой 44,1 кГц всего за 3,7 секунды на одном GPU A40. Авторы представляют новую методику CLAP-Ranked Preference Optimization (CRPO) для улучшения согласованности TTA моделей путем итеративной генерации и оптимизации данных о предпочтениях. TangoFlux достигает передовых результатов в объективных и субъективных тестах, а код и модели открыты для дальнейших исследований.'}, 'en': {'title': 'TangoFlux: Revolutionizing Text-to-Audio Generation with CRPO', 'desc': "TangoFlux is a powerful Text-to-Audio generative model that can create high-quality audio quickly and efficiently. It addresses the challenge of aligning TTA models by introducing a new method called CLAP-Ranked Preference Optimization (CRPO), which helps generate and optimize preference data. This approach improves the model's ability to understand and produce audio that aligns with user preferences. 
The results show that TangoFlux not only meets but exceeds current standards in both objective and subjective evaluations, and the team has made their code and models available for further research."}, 'zh': {'title': 'TangoFlux:高效的文本到音频生成模型', 'desc': '我们介绍了TangoFlux,这是一种高效的文本到音频生成模型,拥有5.15亿个参数,能够在单个A40 GPU上以3.7秒的速度生成最长30秒的44.1kHz音频。TTA模型对齐的一个主要挑战是创建偏好对的困难,因为TTA缺乏像大型语言模型(LLMs)那样的可验证奖励或标准答案的结构化机制。为了解决这个问题,我们提出了CLAP-Ranked Preference Optimization(CRPO),这是一个新颖的框架,通过迭代生成和优化偏好数据来增强TTA的对齐。我们证明了使用CRPO生成的音频偏好数据集在现有替代方案中表现更优,TangoFlux在客观和主观基准测试中都达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.21079', 'title': 'Edicho: Consistent Image Editing in the Wild', 'url': 'https://huggingface.co/papers/2412.21079', 'abstract': 'As a verified need, consistent editing across in-the-wild images remains a technical challenge arising from various unmanageable factors, like object poses, lighting conditions, and photography environments. Edicho steps in with a training-free solution based on diffusion models, featuring a fundamental design principle of using explicit image correspondence to direct editing. Specifically, the key components include an attention manipulation module and a carefully refined classifier-free guidance (CFG) denoising strategy, both of which take into account the pre-estimated correspondence. Such an inference-time algorithm enjoys a plug-and-play nature and is compatible to most diffusion-based editing methods, such as ControlNet and BrushNet. Extensive results demonstrate the efficacy of Edicho in consistent cross-image editing under diverse settings. We will release the code to facilitate future studies.', 'score': 17, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '8068418a331b2086', 'authors': ['Qingyan Bai', 'Hao Ouyang', 'Yinghao Xu', 'Qiuyu Wang', 'Ceyuan Yang', 'Ka Leong Cheng', 'Yujun Shen', 'Qifeng Chen'], 'affiliations': ['Ant Group', 'CUHK', 'HKUST', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21079.jpg', 'data': {'categories': ['#cv', '#diffusion', '#open_source', '#inference'], 'emoji': '🖼️', 'ru': {'title': 'Edicho: согласованное редактирование изображений без обучения', 'desc': 'Статья представляет Edicho - решение для согласованного редактирования изображений без обучения, основанное на диффузионных моделях. Ключевые компоненты включают модуль манипуляции вниманием и стратегию шумоподавления без классификатора, использующие предварительно оцененное соответствие между изображениями. Этот алгоритм совместим с большинством методов редактирования на основе диффузии, таких как ControlNet и BrushNet. Результаты демонстрируют эффективность Edicho в согласованном редактировании изображений в различных условиях.'}, 'en': {'title': 'Edicho: Consistent Image Editing Made Easy with Diffusion Models', 'desc': 'This paper introduces Edicho, a novel approach for consistent editing of images that addresses challenges like varying object poses and lighting. It utilizes diffusion models without the need for prior training, focusing on explicit image correspondence to guide the editing process. Key innovations include an attention manipulation module and a refined classifier-free guidance denoising strategy, which enhance the editing quality by considering pre-estimated correspondences. 
The method is designed to be easily integrated with existing diffusion-based editing techniques, showing strong performance across different scenarios.'}, 'zh': {'title': 'Edicho:无训练一致性图像编辑的新方法', 'desc': 'Edicho 是一种基于扩散模型的无训练解决方案,旨在解决在不同环境下进行一致性图像编辑的挑战。它的设计原则是利用显式图像对应关系来指导编辑,确保在不同的拍摄条件下保持一致性。该方法包括一个注意力操作模块和经过精细调整的无分类器引导去噪策略,能够有效处理预估的对应关系。Edicho 具有即插即用的特性,兼容大多数基于扩散的编辑方法,实验结果显示其在多种设置下的有效性。'}}}, {'id': 'https://huggingface.co/papers/2412.21187', 'title': 'Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs', 'url': 'https://huggingface.co/papers/2412.21187', 'abstract': 'The remarkable performance of models like the OpenAI o1 can be attributed to their ability to emulate human-like long-time thinking during inference. These models employ extended chain-of-thought (CoT) processes, exploring multiple strategies to enhance problem-solving capabilities. However, a critical question remains: How to intelligently and efficiently scale computational resources during testing. This paper presents the first comprehensive study on the prevalent issue of overthinking in these models, where excessive computational resources are allocated for simple problems with minimal benefit. We introduce novel efficiency metrics from both outcome and process perspectives to evaluate the rational use of computational resources by o1-like models. Using a self-training paradigm, we propose strategies to mitigate overthinking, streamlining reasoning processes without compromising accuracy. Experimental results show that our approach successfully reduces computational overhead while preserving model performance across a range of testsets with varying difficulty levels, such as GSM8K, MATH500, GPQA, and AIME.', 'score': 11, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '78da22eae14fe26c', 'authors': ['Xingyu Chen', 'Jiahao Xu', 'Tian Liang', 'Zhiwei He', 'Jianhui Pang', 'Dian Yu', 'Linfeng Song', 'Qiuzhi Liu', 'Mengfei Zhou', 'Zhuosheng Zhang', 'Rui Wang', 'Zhaopeng Tu', 'Haitao Mi', 'Dong Yu'], 'affiliations': ['Shanghai Jiao Tong University', 'Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2412.21187.jpg', 'data': {'categories': ['#optimization', '#reasoning', '#training', '#math', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Эффективное мышление ИИ: борьба с избыточными вычислениями', 'desc': 'Статья исследует проблему избыточных вычислений (overthinking) в крупных языковых моделях типа OpenAI o1 при решении задач. Авторы вводят новые метрики эффективности для оценки рационального использования вычислительных ресурсов такими моделями. Предлагается стратегия на основе самообучения для оптимизации рассуждений модели без потери точности. Экспериментальные результаты показывают успешное снижение вычислительных затрат при сохранении производительности на различных наборах тестов.'}, 'en': {'title': 'Streamlining Reasoning: Tackling Overthinking in AI Models', 'desc': "This paper investigates the phenomenon of overthinking in advanced machine learning models, particularly those like OpenAI's o1, which excel at long-term reasoning. It highlights the inefficiencies that arise when these models allocate excessive computational resources to solve simple problems, leading to minimal gains in performance. The authors propose new efficiency metrics to assess how well these models utilize their computational power during inference. 
By implementing a self-training approach, they present strategies to reduce overthinking, achieving a balance between computational efficiency and model accuracy across various challenging test sets."}, 'zh': {'title': '优化计算资源,提升模型效率', 'desc': '本文探讨了像OpenAI o1这样的模型在推理过程中模拟人类长期思考的能力。研究指出,这些模型在解决问题时常常会过度思考,导致在简单问题上分配过多的计算资源。我们提出了新的效率指标,从结果和过程两个角度评估计算资源的合理使用,并提出了自我训练的策略来减少过度思考。实验结果表明,我们的方法在不同难度的测试集上成功降低了计算开销,同时保持了模型的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.20005', 'title': 'OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System', 'url': 'https://huggingface.co/papers/2412.20005', 'abstract': "We introduce OneKE, a dockerized schema-guided knowledge extraction system, which can extract knowledge from the Web and raw PDF Books, and support various domains (science, news, etc.). Specifically, we design OneKE with multiple agents and a configurable knowledge base. Different agents perform their respective roles, enabling support for various extraction scenarios. The configurable knowledge base facilitates schema configuration, error case debugging and correction, further improving the performance. Empirical evaluations on benchmark datasets demonstrate OneKE's efficacy, while case studies further elucidate its adaptability to diverse tasks across multiple domains, highlighting its potential for broad applications. We have open-sourced the Code at https://github.com/zjunlp/OneKE and released a Video at http://oneke.openkg.cn/demo.mp4.", 'score': 10, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': 'da8469c61421cefb', 'authors': ['Yujie Luo', 'Xiangyuan Ru', 'Kangwei Liu', 'Lin Yuan', 'Mengshu Sun', 'Ningyu Zhang', 'Lei Liang', 'Zhiqiang Zhang', 'Jun Zhou', 'Lanning Wei', 'Da Zheng', 'Haofen Wang', 'Huajun Chen'], 'affiliations': ['Ant Group', 'Tongji University', 'ZJU-Ant Group Joint Research Center for Knowledge Graphs', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2412.20005.jpg', 'data': {'categories': ['#dataset', '#agents', '#open_source', '#benchmark', '#multimodal', '#science'], 'emoji': '🧠', 'ru': {'title': 'OneKE: Универсальный инструмент для извлечения знаний из разнородных источников', 'desc': 'OneKE - это докеризованная система извлечения знаний, управляемая схемой. Она способна извлекать информацию из веб-ресурсов и PDF-книг, поддерживая различные домены, такие как наука и новости. Система использует множество агентов и настраиваемую базу знаний для выполнения различных сценариев извлечения. OneKE демонстрирует высокую эффективность на эталонных наборах данных и адаптируемость к разнообразным задачам в различных областях.'}, 'en': {'title': 'OneKE: Versatile Knowledge Extraction for Diverse Domains', 'desc': "OneKE is a knowledge extraction system designed to gather information from the Web and raw PDF books across various domains like science and news. It utilizes multiple agents, each responsible for specific tasks, which enhances its ability to handle different extraction scenarios effectively. The system includes a configurable knowledge base that aids in schema setup, debugging, and error correction, leading to improved performance. 
Empirical tests on benchmark datasets confirm OneKE's effectiveness, and case studies showcase its versatility in tackling diverse tasks."}, 'zh': {'title': 'OneKE:多领域知识提取的智能系统', 'desc': 'OneKE是一个基于Docker的知识提取系统,能够从网络和原始PDF书籍中提取知识,支持多个领域(如科学、新闻等)。该系统设计了多个智能代理,各自承担不同的角色,以适应各种提取场景。配置知识库的设计使得模式配置、错误调试和修正变得更加高效,从而提升了系统的性能。通过在基准数据集上的实证评估,OneKE展示了其有效性,并通过案例研究进一步说明了其在多个领域的适应性和广泛应用潜力。'}}}, {'id': 'https://huggingface.co/papers/2412.20631', 'title': "Slow Perception: Let's Perceive Geometric Figures Step-by-step", 'url': 'https://huggingface.co/papers/2412.20631', 'abstract': 'Recently, "visual o1" began to enter people\'s vision, with expectations that this slow-thinking design can solve visual reasoning tasks, especially geometric math problems. However, the reality is that current LVLMs (Large Vision Language Models) can hardly even accurately copy a geometric figure, let alone truly understand the complex inherent logic and spatial relationships within geometric shapes. We believe accurate copying (strong perception) is the first step to visual o1. Accordingly, we introduce the concept of "slow perception" (SP), which guides the model to gradually perceive basic point-line combinations, as our humans, reconstruct complex geometric structures progressively. There are two-fold stages in SP: a) perception decomposition. Perception is not instantaneous. In this stage, complex geometric figures are broken down into basic simple units to unify geometry representation. b) perception flow, which acknowledges that accurately tracing a line is not an easy task. This stage aims to avoid "long visual jumps" in regressing line segments by using a proposed "perceptual ruler" to trace each line stroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an inference time scaling law -- the slower, the better. Researchers strive to speed up the model\'s perception in the past, but we slow it down again, allowing the model to read the image step-by-step and carefully.', 'score': 9, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'f99c59b7ef92c667', 'authors': ['Haoran Wei', 'Youyang Yin', 'Yumeng Li', 'Jia Wang', 'Liang Zhao', 'Jianjian Sun', 'Zheng Ge', 'Xiangyu Zhang'], 'affiliations': ['Beihang University', 'Stepfun'], 'pdf_title_img': 'assets/pdf/title_img/2412.20631.jpg', 'data': {'categories': ['#cv', '#math', '#reasoning'], 'emoji': '🔍', 'ru': {'title': 'Медленнее значит лучше: новый подход к компьютерному зрению', 'desc': "Статья представляет концепцию 'медленного восприятия' (slow perception) для улучшения способности моделей компьютерного зрения копировать геометрические фигуры. Авторы предлагают двухэтапный подход: декомпозиция восприятия, разбивающая сложные фигуры на простые элементы, и поток восприятия, использующий 'перцептивную линейку' для точного отслеживания линий. Исследователи обнаружили, что более медленное восприятие приводит к лучшим результатам, что противоречит традиционному стремлению ускорить обработку изображений. Эта методика может стать первым шагом к решению задач визуального рассуждения и геометрических задач большими визуально-языковыми моделями."}, 'en': {'title': 'Slow Down to See Better: Enhancing Visual Reasoning with Slow Perception', 'desc': "This paper introduces the concept of 'slow perception' (SP) to enhance the capabilities of Large Vision Language Models (LVLMs) in visual reasoning tasks, particularly in understanding geometric shapes. 
SP consists of two stages: perception decomposition, where complex figures are simplified into basic components, and perception flow, which emphasizes careful tracing of lines to avoid errors. The authors argue that this method mimics human cognitive processes, allowing for a more accurate understanding of spatial relationships. Interestingly, they find that a slower, more deliberate approach to perception improves the model's performance, challenging the traditional focus on speed in machine learning."}, 'zh': {'title': '慢感知:逐步理解几何结构的关键', 'desc': '最近,"视觉o1"开始引起人们的关注,期望这种慢思维设计能够解决视觉推理任务,尤其是几何数学问题。然而,当前的大型视觉语言模型(LVLMs)在准确复制几何图形方面几乎无能为力,更不用说真正理解几何形状内在的复杂逻辑和空间关系。我们提出了"慢感知"(SP)的概念,指导模型逐步感知基本的点线组合,像人类一样逐步重建复杂的几何结构。SP包括两个阶段:感知分解和感知流,前者将复杂的几何图形分解为基本单元,后者通过使用"感知尺"逐步追踪每条线段,避免"长视觉跳跃"。'}}}, {'id': 'https://huggingface.co/papers/2412.21140', 'title': 'Facilitating large language model Russian adaptation with Learned Embedding Propagation', 'url': 'https://huggingface.co/papers/2412.21140', 'abstract': 'Rapid advancements of large language model (LLM) technologies led to the introduction of powerful open-source instruction-tuned LLMs that have the same text generation quality as the state-of-the-art counterparts such as GPT-4. While the emergence of such models accelerates the adoption of LLM technologies in sensitive-information environments, the authors of such models do not disclose the training data necessary for replication of the results, thus making the achievements model-exclusive. Since those open-source models are also multilingual, this in turn reduces the benefits of training language-specific LLMs, as improved inference computation efficiency becomes the only guaranteed advantage of such a costly procedure. More cost-efficient options such as vocabulary extension and subsequent continued pre-training are also inhibited by the lack of access to high-quality instruction-tuning data since it is the major factor behind the resulting LLM task-solving capabilities. To address the limitations and cut the costs of the language adaptation pipeline, we propose Learned Embedding Propagation (LEP). Unlike existing approaches, our method has lower training data size requirements due to minimal impact on existing LLM knowledge, which we reinforce using a novel ad-hoc embedding propagation procedure that allows skipping the instruction-tuning step and instead implants the new language knowledge directly into any existing instruct-tuned variant. We evaluated four Russian vocabulary adaptations for LLaMa-3-8B and Mistral-7B, showing that LEP is competitive with traditional instruction-tuning methods, achieving performance comparable to OpenChat 3.5 and LLaMa-3-8B-Instruct, with further improvements via self-calibration and continued tuning enhancing task-solving capabilities.', 'score': 9, 'issue_id': 1412, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '093f3929e323d180', 'authors': ['Mikhail Tikhomirov', 'Daniil Chernyshev'], 'affiliations': ['Lomonosov Moscow State University, Moscow, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2412.21140.jpg', 'data': {'categories': ['#data', '#training', '#low_resource', '#transfer_learning', '#dataset', '#open_source', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Эффективная адаптация языковых моделей без масштабного переобучения', 'desc': 'Статья представляет новый метод адаптации больших языковых моделей (LLM) к другим языкам, называемый Learned Embedding Propagation (LEP). 
Этот подход позволяет эффективно внедрять знания нового языка в существующие инструктированные LLM без необходимости повторного обучения на больших объемах данных. Авторы провели эксперименты с адаптацией моделей LLaMa-3-8B и Mistral-7B к русскому языку, показав, что LEP конкурентоспособен с традиционными методами инструктирования. Результаты демонстрируют, что LEP достигает производительности, сравнимой с OpenChat 3.5 и LLaMa-3-8B-Instruct, с возможностью дальнейшего улучшения через самокалибровку и дополнительную настройку.'}, 'en': {'title': 'Efficient Language Adaptation with Learned Embedding Propagation', 'desc': 'This paper introduces Learned Embedding Propagation (LEP), a novel method for adapting large language models (LLMs) to new languages without the need for extensive instruction-tuning data. LEP minimizes the training data requirements by directly embedding new language knowledge into existing instruct-tuned models, thus bypassing traditional instruction-tuning steps. The authors demonstrate that LEP can effectively adapt LLaMa-3-8B and Mistral-7B for Russian vocabulary, achieving performance on par with state-of-the-art models like OpenChat 3.5. This approach not only reduces costs but also enhances the efficiency of language adaptation in multilingual contexts.'}, 'zh': {'title': '学习嵌入传播:降低语言适应成本的新方法', 'desc': '这篇论文介绍了一种名为学习嵌入传播(LEP)的方法,旨在降低语言适应过程的成本。LEP方法通过最小化对现有大语言模型(LLM)知识的影响,减少了对训练数据的需求。与传统的指令调优方法相比,LEP能够直接将新的语言知识植入到现有的指令调优模型中,从而跳过指令调优步骤。实验结果表明,LEP在俄语词汇适应方面的表现与传统方法相当,且通过自我校准和持续调优进一步提升了任务解决能力。'}}}, {'id': 'https://huggingface.co/papers/2412.21139', 'title': 'Training Software Engineering Agents and Verifiers with SWE-Gym', 'url': 'https://huggingface.co/papers/2412.21139', 'abstract': 'We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents, achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories.', 'score': 9, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '800bb3f4c48e2cf9', 'authors': ['Jiayi Pan', 'Xingyao Wang', 'Graham Neubig', 'Navdeep Jaitly', 'Heng Ji', 'Alane Suhr', 'Yizhe Zhang'], 'affiliations': ['Apple', 'CMU', 'UC Berkeley', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2412.21139.jpg', 'data': {'categories': ['#dataset', '#open_source', '#agents', '#training'], 'emoji': '🤖', 'ru': {'title': 'SWE-Gym: революция в обучении ИИ-агентов для разработки ПО', 'desc': 'SWE-Gym - это новая среда для обучения агентов программной инженерии на реальных задачах. Она содержит 2438 экземпляров задач на Python с исполняемой средой, юнит-тестами и описанием на естественном языке. Авторы использовали SWE-Gym для обучения агентов на основе языковых моделей, достигнув улучшения на 19% в решении задач из наборов SWE-Bench. 
Комбинация обученных агентов и верификаторов позволила достичь нового рекорда производительности для открытых моделей в программной инженерии.'}, 'en': {'title': 'Revolutionizing Software Engineering with SWE-Gym', 'desc': 'SWE-Gym is a novel environment designed for training software engineering agents using real-world Python tasks. It includes 2,438 task instances, each with a codebase, executable environment, unit tests, and natural language task descriptions. The paper demonstrates that language model-based agents trained in SWE-Gym can significantly improve their performance, achieving up to 19% higher resolve rates on benchmark tests. Additionally, the authors explore scaling inference through verifiers, leading to state-of-the-art results for open-weight software engineering agents, and they provide resources for further research.'}, 'zh': {'title': 'SWE-Gym:软件工程代理的新起点', 'desc': '我们提出了SWE-Gym,这是第一个用于训练真实世界软件工程(SWE)代理的环境。SWE-Gym包含2438个真实的Python任务实例,每个实例都有可执行的运行环境、单元测试和用自然语言指定的任务。通过使用SWE-Gym,我们训练的基于语言模型的SWE代理在流行的SWE-Bench验证和Lite测试集上实现了高达19%的绝对解决率提升。我们还通过在SWE-Gym中采样的代理轨迹训练验证器,进行推理时的扩展,结合我们微调的SWE代理,在SWE-Bench验证和Lite上分别达到了32.0%和26.0%的新状态,成为开放权重SWE代理的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2412.21206', 'title': 'PERSE: Personalized 3D Generative Avatars from A Single Portrait', 'url': 'https://huggingface.co/papers/2412.21206', 'abstract': "We present PERSE, a method for building an animatable personalized generative avatar from a reference portrait. Our avatar model enables facial attribute editing in a continuous and disentangled latent space to control each facial attribute, while preserving the individual's identity. To achieve this, our method begins by synthesizing large-scale synthetic 2D video datasets, where each video contains consistent changes in the facial expression and viewpoint, combined with a variation in a specific facial attribute from the original input. We propose a novel pipeline to produce high-quality, photorealistic 2D videos with facial attribute editing. Leveraging this synthetic attribute dataset, we present a personalized avatar creation method based on the 3D Gaussian Splatting, learning a continuous and disentangled latent space for intuitive facial attribute manipulation. To enforce smooth transitions in this latent space, we introduce a latent space regularization technique by using interpolated 2D faces as supervision. Compared to previous approaches, we demonstrate that PERSE generates high-quality avatars with interpolated attributes while preserving identity of reference person.", 'score': 8, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '334a60a17f9a9477', 'authors': ['Hyunsoo Cha', 'Inhee Lee', 'Hanbyul Joo'], 'affiliations': ['Seoul National University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21206.jpg', 'data': {'categories': ['#3d', '#cv', '#dataset', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Персонализированные аватары с гибким редактированием черт лица', 'desc': 'PERSE - это метод создания анимируемого персонализированного генеративного аватара на основе портрета. Он позволяет редактировать лицевые атрибуты в непрерывном и разделенном латентном пространстве, сохраняя при этом индивидуальность человека. Метод использует синтетические наборы 2D-видео для обучения модели на основе 3D Gaussian Splatting. 
PERSE демонстрирует высокое качество генерации аватаров с интерполированными атрибутами, сохраняя идентичность исходного человека.'}, 'en': {'title': 'Create Your Unique Avatar with PERSE!', 'desc': "PERSE is a novel method for creating personalized generative avatars from a single reference portrait. It allows users to edit facial attributes in a smooth and controlled manner within a continuous latent space, ensuring that the individual's identity remains intact. The approach involves generating large-scale synthetic 2D video datasets that showcase variations in facial expressions and attributes, which are then used to train the avatar model. By employing 3D Gaussian Splatting and a latent space regularization technique, PERSE achieves high-quality, photorealistic avatars with seamless attribute transitions."}, 'zh': {'title': '个性化生成头像的新方法', 'desc': '本文介绍了一种名为PERSE的方法,用于从参考肖像构建可动画的个性化生成头像。该头像模型能够在连续且解耦的潜在空间中编辑面部属性,同时保持个体的身份。我们的方法首先合成大规模的合成2D视频数据集,每个视频包含面部表情和视角的一致变化,并结合原始输入中特定面部属性的变化。通过引入潜在空间正则化技术,我们实现了高质量、逼真的2D视频生成,并在此基础上提出了一种个性化头像创建方法。'}}}, {'id': 'https://huggingface.co/papers/2412.21199', 'title': 'HumanEval Pro and MBPP Pro: Evaluating Large Language Models on Self-invoking Code Generation', 'url': 'https://huggingface.co/papers/2412.21199', 'abstract': "We introduce self-invoking code generation, a new task designed to evaluate the progressive reasoning and problem-solving capabilities of LLMs. In this task, models are presented with a base problem and a related, more complex problem. They must solve the base problem and then utilize its solution to address the more complex one. This work features three key contributions. First, we propose a general recipe for generating more challenging versions of existing benchmarks, resulting in three new benchmarks: HumanEval Pro, MBPP Pro, and BigCodeBench-Lite Pro, specifically designed to assess LLMs on self-invoking code generation. Second, from the analysis of experimental results over twenty LLMs on our benchmarks, we have two important observations: (i) Most LLMs excel in traditional code generation benchmarks like HumanEval and MBPP, but their performance declines on self-invoking tasks. For example, o1-mini achieves 96.2% pass@1 on HumanEval but only 76.2% on HumanEval Pro. (ii) On self-invoking code generation task, the instruction-tuned models demonstrate only marginal improvements compared to the base models. Third, we disclose the types of failure modes that exist in our evaluation results. All these results underscore the need for further advancements in self-invoking code generation tasks and provide a new direction for future research on enhancing LLMs' code reasoning capabilities.", 'score': 6, 'issue_id': 1408, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '9d2cebc8f30f722c', 'authors': ['Zhaojian Yu', 'Yilun Zhao', 'Arman Cohan', 'Xiao-Ping Zhang'], 'affiliations': ['Tsinghua University', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21199.jpg', 'data': {'categories': ['#dataset', '#reasoning', '#training', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Самовызывающийся код: новый рубеж для языковых моделей', 'desc': 'Статья представляет новую задачу для оценки возможностей языковых моделей (LLM) - генерацию самовызывающегося кода. В рамках этой задачи модели должны решить базовую проблему, а затем использовать ее решение для более сложной задачи. Авторы создали три новых бенчмарка: HumanEval Pro, MBPP Pro и BigCodeBench-Lite Pro. 
Эксперименты показали, что большинство LLM хорошо справляются с традиционными задачами генерации кода, но их производительность снижается на самовызывающихся задачах. Результаты подчеркивают необходимость дальнейших исследований в области улучшения способностей LLM к рассуждению при работе с кодом.'}, 'en': {'title': 'Enhancing LLMs: The Challenge of Self-Invoking Code Generation', 'desc': 'This paper introduces a new task called self-invoking code generation, which tests the reasoning and problem-solving skills of large language models (LLMs). In this task, models first solve a simple problem and then use that solution to tackle a more complex one. The authors create three new benchmarks to evaluate LLMs on this task, revealing that while many models perform well on standard code generation tasks, their performance drops significantly on self-invoking tasks. The findings highlight the limitations of current models and suggest that more research is needed to improve their code reasoning abilities.'}, 'zh': {'title': '自调用代码生成:提升LLMs推理能力的新方向', 'desc': '本文介绍了一种新的任务——自调用代码生成,旨在评估大型语言模型(LLMs)的推理和问题解决能力。在这个任务中,模型需要先解决一个基础问题,然后利用其解决方案来处理一个更复杂的问题。研究提出了三项重要贡献,包括生成更具挑战性的基准测试的通用方法,并创建了三个新基准:HumanEval Pro、MBPP Pro和BigCodeBench-Lite Pro。实验结果显示,大多数LLMs在传统代码生成基准上表现良好,但在自调用任务上的表现却有所下降,表明在自调用代码生成任务上仍需进一步的研究和改进。'}}}, {'id': 'https://huggingface.co/papers/2501.07301', 'title': 'The Lessons of Developing Process Reward Models in Mathematical Reasoning', 'url': 'https://huggingface.co/papers/2501.07301', 'abstract': 'Process Reward Models (PRMs) emerge as a promising approach for process supervision in mathematical reasoning of Large Language Models (LLMs), which aim to identify and mitigate intermediate errors in the reasoning processes. However, the development of effective PRMs faces significant challenges, particularly in data annotation and evaluation methodologies. In this paper, through extensive experiments, we demonstrate that commonly used Monte Carlo (MC) estimation-based data synthesis for PRMs typically yields inferior performance and generalization compared to LLM-as-a-judge and human annotation methods. MC estimation relies on completion models to evaluate current-step correctness, leading to inaccurate step verification. Furthermore, we identify potential biases in conventional Best-of-N (BoN) evaluation strategies for PRMs: (1) The unreliable policy models generate responses with correct answers but flawed processes, leading to a misalignment between the evaluation criteria of BoN and the PRM objectives of process verification. (2) The tolerance of PRMs of such responses leads to inflated BoN scores. (3) Existing PRMs have a significant proportion of minimum scores concentrated on the final answer steps, revealing the shift from process to outcome-based assessment in BoN Optimized PRMs. To address these challenges, we develop a consensus filtering mechanism that effectively integrates MC estimation with LLM-as-a-judge and advocates a more comprehensive evaluation framework that combines response-level and step-level metrics. Based on the mechanisms, we significantly improve both model performance and data efficiency in the BoN evaluation and the step-wise error identification task. 
Finally, we release a new state-of-the-art PRM that outperforms existing open-source alternatives and provides practical guidelines for future research in building process supervision models.', 'score': 46, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '98f46bb1e2772efc', 'authors': ['Zhenru Zhang', 'Chujie Zheng', 'Yangzhen Wu', 'Beichen Zhang', 'Runji Lin', 'Bowen Yu', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07301.jpg', 'data': {'categories': ['#math', '#data', '#reasoning', '#benchmark', '#optimization', '#open_source', '#training'], 'emoji': '🧮', 'ru': {'title': 'Усовершенствование Process Reward Models для более точного контроля математических рассуждений', 'desc': 'Статья посвящена Process Reward Models (PRM) для контроля процесса математических рассуждений в больших языковых моделях. Авторы выявили проблемы в существующих методах синтеза данных и оценки PRMs, таких как Monte Carlo и Best-of-N. Они предложили новый механизм фильтрации на основе консенсуса, объединяющий MC-оценку с подходом LLM-as-a-judge. В результате исследователи создали улучшенную PRM, превосходящую существующие open-source альтернативы.'}, 'en': {'title': 'Enhancing Reasoning in LLMs with Process Reward Models', 'desc': 'This paper introduces Process Reward Models (PRMs) as a method to enhance the reasoning capabilities of Large Language Models (LLMs) by identifying and correcting errors in their reasoning processes. The authors highlight the limitations of traditional Monte Carlo estimation methods for data synthesis, which often lead to poor performance in evaluating reasoning steps. They also point out biases in the Best-of-N evaluation strategies that can misalign with the goals of PRMs, particularly in how they assess the correctness of reasoning processes versus final answers. To overcome these issues, the paper proposes a new consensus filtering mechanism that combines different evaluation methods, resulting in improved model performance and more accurate error identification.'}, 'zh': {'title': '提升过程监督模型的有效性', 'desc': '本文探讨了过程奖励模型(PRMs)在大型语言模型(LLMs)数学推理中的应用,旨在识别和减少推理过程中的中间错误。研究表明,传统的基于蒙特卡洛估计的数据合成方法在性能和泛化能力上不如使用LLM作为评判者和人工标注的方法。我们还发现,现有的最佳选择(BoN)评估策略存在偏差,导致评估标准与PRM的过程验证目标不一致。为了解决这些问题,本文提出了一种共识过滤机制,结合了蒙特卡洛估计和LLM评判者,显著提高了模型性能和数据效率。'}}}, {'id': 'https://huggingface.co/papers/2501.06425', 'title': 'Tensor Product Attention Is All You Need', 'url': 'https://huggingface.co/papers/2501.06425', 'abstract': 'Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. 
Through extensive empirical evaluation of language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPA\'s memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6.', 'score': 35, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'f723487eccf1ccfe', 'authors': ['Yifan Zhang', 'Yifeng Liu', 'Huizhuo Yuan', 'Zhen Qin', 'Yang Yuan', 'Quanquan Gu', 'Andrew Chi-Chih Yao'], 'affiliations': ['IIIS, Tsinghua University', 'Shanghai Qi Zhi Institute', 'TapTap', 'University of California, Los Angeles'], 'pdf_title_img': 'assets/pdf/title_img/2501.06425.jpg', 'data': {'categories': ['#benchmark', '#long_context', '#optimization', '#inference', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное внимание: компактные трансформеры для длинных последовательностей', 'desc': 'В статье представлен новый механизм внимания - Tensor Product Attention (TPA), использующий тензорные разложения для компактного представления запросов, ключей и значений. TPA значительно уменьшает размер кэша ключ-значение при выводе, что повышает эффективность использования памяти. На основе TPA авторы разработали новую архитектуру модели - Tensor ProducT ATTenTion Transformer (T6). Эмпирические исследования показали, что T6 превосходит стандартные базовые модели Transformer по различным метрикам. TPA позволяет обрабатывать значительно более длинные последовательности при фиксированных ресурсах, решая важную проблему масштабируемости современных языковых моделей.'}, 'en': {'title': 'Efficient Attention for Longer Sequences with TPA', 'desc': 'This paper introduces Tensor Product Attention (TPA), a new attention mechanism designed to reduce memory usage during inference in language models. TPA achieves this by using tensor decompositions to compactly represent queries, keys, and values, which allows for smaller key-value caches. The authors present the Tensor ProducT ATTenTion Transformer (T6), a model that integrates TPA and shows improved performance on language modeling tasks compared to traditional Transformer architectures. T6 not only enhances model quality but also enables the processing of longer input sequences efficiently, addressing a key limitation in current language models.'}, 'zh': {'title': '张量乘积注意力:高效处理长序列的创新方案', 'desc': '本文提出了一种新的注意力机制,称为张量乘积注意力(TPA),旨在解决长输入序列处理中的内存开销问题。TPA通过张量分解技术,紧凑地表示查询、键和值,从而显著减少推理时的KV缓存大小。该机制结合了上下文低秩分解和RoPE,提升了模型质量和内存效率。基于TPA,我们还引入了一种新的模型架构——张量乘积注意力变换器(T6),在语言建模任务中表现优于传统的Transformer基线。'}}}, {'id': 'https://huggingface.co/papers/2501.06252', 'title': '$\\text{Transformer}^2$: Self-adaptive LLMs', 'url': 'https://huggingface.co/papers/2501.06252', 'abstract': 'Self-adaptive large language models (LLMs) aim to solve the challenges posed by traditional fine-tuning methods, which are often computationally intensive and static in their ability to handle diverse tasks. We introduce \\implname, a novel self-adaptation framework that adapts LLMs for unseen tasks in real-time by selectively adjusting only the singular components of their weight matrices. 
During inference, \\implname employs a two-pass mechanism: first, a dispatch system identifies the task properties, and then task-specific "expert" vectors, trained using reinforcement learning, are dynamically mixed to obtain targeted behavior for the incoming prompt. Our method outperforms ubiquitous approaches such as LoRA, with fewer parameters and greater efficiency. \\implname demonstrates versatility across different LLM architectures and modalities, including vision-language tasks. \\implname represents a significant leap forward, offering a scalable, efficient solution for enhancing the adaptability and task-specific performance of LLMs, paving the way for truly dynamic, self-organizing AI systems.', 'score': 19, 'issue_id': 1651, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '935c31e095aeeec8', 'authors': ['Qi Sun', 'Edoardo Cetin', 'Yujin Tang'], 'affiliations': ['Institute of Science Tokyo, Japan', 'Sakana AI, Japan'], 'pdf_title_img': 'assets/pdf/title_img/2501.06252.jpg', 'data': {'categories': ['#multimodal', '#agi', '#rl', '#optimization', '#training', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Самоадаптация языковых моделей в реальном времени', 'desc': 'Статья представляет новый фреймворк самоадаптации для больших языковых моделей (LLM), который позволяет адаптироваться к новым задачам в реальном времени. Метод использует двухэтапный механизм: сначала определяются свойства задачи, затем применяются специальные векторы экспертов для настройки поведения модели. Подход превосходит традиционные методы вроде LoRA, используя меньше параметров и работая эффективнее. Фреймворк демонстрирует универсальность для разных архитектур LLM и модальностей, включая задачи компьютерного зрения.'}, 'en': {'title': 'Dynamic Adaptation for Language Models', 'desc': "This paper presents a new framework called \textit{implname} that enhances large language models (LLMs) by allowing them to adapt to new tasks in real-time without the heavy computational costs of traditional fine-tuning. Instead of adjusting the entire model, \textit{implname} selectively modifies specific components of the model's weight matrices, making it more efficient. The framework uses a two-step process during inference: first, it identifies the task requirements, and then it combines specialized 'expert' vectors, which are optimized through reinforcement learning, to tailor the model's response. This approach not only improves performance compared to existing methods like LoRA but also works across various LLM architectures and tasks, including those involving both text and images."}, 'zh': {'title': '自适应LLMs:高效应对多样化任务的未来', 'desc': '自适应大型语言模型(LLMs)旨在解决传统微调方法的挑战,这些方法通常计算密集且在处理多样化任务时能力有限。我们介绍了一种新颖的自适应框架\textit{implname},它通过选择性调整权重矩阵的单个组件,实时适应LLMs以应对未见过的任务。在推理过程中,\textit{implname}采用双重机制:首先,调度系统识别任务属性,然后动态混合经过强化学习训练的任务特定“专家”向量,以获得针对输入提示的目标行为。我们的研究方法在参数更少且效率更高的情况下,超越了广泛使用的方法,如LoRA,展示了在不同LLM架构和模态(包括视觉-语言任务)中的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.06173', 'title': 'VideoAuteur: Towards Long Narrative Video Generation', 'url': 'https://huggingface.co/papers/2501.06173', 'abstract': 'Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. 
In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation in the cooking domain. We validate the quality of our proposed dataset in terms of visual fidelity and textual caption accuracy using state-of-the-art Vision-Language Models (VLMs) and video generation models, respectively. We further introduce a Long Narrative Video Director to enhance both visual and semantic coherence in generated videos and emphasize the role of aligning visual embeddings to achieve improved overall video quality. Our method demonstrates substantial improvements in generating visually detailed and semantically aligned keyframes, supported by finetuning techniques that integrate text and image embeddings within the video generation process. Project page: https://videoauteur.github.io/', 'score': 18, 'issue_id': 1653, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'e110fbe840c50afa', 'authors': ['Junfei Xiao', 'Feng Cheng', 'Lu Qi', 'Liangke Gui', 'Jiepeng Cen', 'Zhibei Ma', 'Alan Yuille', 'Lu Jiang'], 'affiliations': ['ByteDance', 'ByteDance Seed', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06173.jpg', 'data': {'categories': ['#video', '#story_generation', '#dataset', '#long_context', '#training', '#multimodal', '#alignment'], 'emoji': '🍳', 'ru': {'title': 'Готовим длинные видео: новый подход к генерации нарративного контента', 'desc': 'Статья представляет новый датасет видеороликов о приготовлении пищи для улучшения генерации длинных нарративных видео. Авторы проверяют качество датасета с помощью современных моделей компьютерного зрения и генерации видео. Они также предлагают метод Long Narrative Video Director для повышения визуальной и семантической согласованности генерируемых видео. Результаты показывают значительное улучшение в генерации детализированных и семантически согласованных ключевых кадров.'}, 'en': {'title': 'Enhancing Long-Form Video Generation with Coherent Narratives', 'desc': 'This paper addresses the limitations of current video generation models in creating long, coherent videos, particularly in the cooking domain. It introduces a large-scale dataset specifically designed for generating long-form cooking videos, ensuring high visual quality and accurate textual descriptions. The authors propose a Long Narrative Video Director that improves both the visual and semantic coherence of the generated content by aligning visual embeddings. Their approach shows significant advancements in producing detailed keyframes and enhancing overall video quality through the integration of text and image embeddings.'}, 'zh': {'title': '推动烹饪视频的长篇叙事生成', 'desc': '最近的视频生成模型在生成持续几秒的高质量视频片段方面取得了良好效果。然而,这些模型在生成长序列时面临挑战,难以传达清晰且信息丰富的事件,限制了它们支持连贯叙述的能力。本文提出了一个大规模的烹饪视频数据集,旨在推动烹饪领域的长篇叙事生成。我们引入了一种长叙事视频导演,增强生成视频的视觉和语义一致性,并强调对齐视觉嵌入在提高整体视频质量中的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.07572', 'title': 'WebWalker: Benchmarking LLMs in Web Traversal', 'url': 'https://huggingface.co/papers/2501.07572', 'abstract': "Retrieval-augmented generation (RAG) demonstrates remarkable performance across tasks in open-domain question-answering. However, traditional search engines may retrieve shallow content, limiting the ability of LLMs to handle complex, multi-layered information. To address it, we introduce WebWalkerQA, a benchmark designed to assess the ability of LLMs to perform web traversal. 
It evaluates the capacity of LLMs to traverse a website's subpages to extract high-quality data systematically. We propose WebWalker, which is a multi-agent framework that mimics human-like web navigation through an explore-critic paradigm. Extensive experimental results show that WebWalkerQA is challenging and demonstrates the effectiveness of RAG combined with WebWalker, through the horizontal and vertical integration in real-world scenarios.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '1dd4e60432c1ca54', 'authors': ['Jialong Wu', 'Wenbiao Yin', 'Yong Jiang', 'Zhenglin Wang', 'Zekun Xi', 'Runnan Fang', 'Deyu Zhou', 'Pengjun Xie', 'Fei Huang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07572.jpg', 'data': {'categories': ['#rag', '#reasoning', '#benchmark', '#agi', '#optimization', '#games', '#interpretability', '#agents', '#survey'], 'emoji': '🕸️', 'ru': {'title': 'WebWalker: умная навигация по веб-страницам для улучшения вопросно-ответных систем', 'desc': 'В статье представлен новый подход к решению задач открытого вопросно-ответного поиска - WebWalkerQA. Эта система оценивает способность языковых моделей систематически исследовать подстраницы веб-сайтов для извлечения качественной информации. Авторы предлагают фреймворк WebWalker, использующий мультиагентный подход для имитации человеческой навигации по веб-страницам. Экспериментальные результаты демонстрируют эффективность комбинации RAG и WebWalker в реальных сценариях.'}, 'en': {'title': 'Enhancing LLMs with Human-like Web Navigation for Better Information Retrieval', 'desc': "This paper introduces WebWalkerQA, a benchmark for evaluating large language models (LLMs) in open-domain question-answering tasks. It addresses the limitations of traditional search engines that often retrieve superficial content, which hinders LLMs from accessing complex information. The proposed WebWalker framework uses a multi-agent system that simulates human-like web navigation, allowing LLMs to systematically traverse subpages of a website to gather high-quality data. Experimental results indicate that combining retrieval-augmented generation (RAG) with WebWalker enhances the models' performance in real-world scenarios by enabling deeper information extraction."}, 'zh': {'title': 'WebWalkerQA:提升问答系统的网页导航能力', 'desc': '检索增强生成(RAG)在开放领域问答任务中表现出色,但传统搜索引擎可能只检索到表面内容,限制了大型语言模型(LLMs)处理复杂信息的能力。为了解决这个问题,我们引入了WebWalkerQA,这是一个评估LLMs进行网页遍历能力的基准。它评估LLMs系统性地遍历网站子页面以提取高质量数据的能力。我们提出了WebWalker,这是一个多代理框架,通过探索-评估范式模拟人类的网页导航。'}}}, {'id': 'https://huggingface.co/papers/2501.06458', 'title': 'O1 Replication Journey -- Part 3: Inference-time Scaling for Medical Reasoning', 'url': 'https://huggingface.co/papers/2501.06458', 'abstract': "Building upon our previous investigations of O1 replication (Part 1: Journey Learning [Qin et al., 2024] and Part 2: Distillation [Huang et al., 2024]), this work explores the potential of inference-time scaling in large language models (LLMs) for medical reasoning tasks, ranging from diagnostic decision-making to treatment planning. Through extensive experiments on medical benchmarks of varying complexity (MedQA, Medbullets, and JAMA Clinical Challenges), our investigation reveals several key insights: (1) Increasing inference time does lead to improved performance. With a modest training set of 500 samples, our model yields substantial performance improvements of 6%-11%. 
(2) Task complexity directly correlates with the required length of reasoning chains, confirming the necessity of extended thought processes for challenging problems. (3) The differential diagnoses generated by our model adhere to the principles of the hypothetico-deductive method, producing a list of potential conditions that may explain a patient's symptoms and systematically narrowing these possibilities by evaluating the evidence. These findings demonstrate the promising synergy between inference-time scaling and journey learning in advancing LLMs' real-world clinical reasoning capabilities.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c95817afd181bd85', 'authors': ['Zhongzhen Huang', 'Gui Geng', 'Shengyi Hua', 'Zhen Huang', 'Haoyang Zou', 'Shaoting Zhang', 'Pengfei Liu', 'Xiaofan Zhang'], 'affiliations': ['Generative AI Research Lab (GAIR)', 'SII', 'SPIRAL Lab', 'Shanghai Jiao Tong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06458.jpg', 'data': {'categories': ['#science', '#inference', '#healthcare', '#reasoning'], 'emoji': '🩺', 'ru': {'title': 'Масштабирование времени вывода LLM улучшает медицинские рассуждения', 'desc': 'Данная работа исследует потенциал масштабирования времени вывода в больших языковых моделях (LLM) для задач медицинского рассуждения. Эксперименты на медицинских бенчмарках показали, что увеличение времени вывода приводит к улучшению производительности модели. Сложность задачи напрямую коррелирует с необходимой длиной цепочек рассуждений. Дифференциальные диагнозы, генерируемые моделью, соответствуют принципам гипотетико-дедуктивного метода.'}, 'en': {'title': 'Enhancing Medical Reasoning in LLMs through Inference-Time Scaling', 'desc': "This paper investigates how increasing inference time can enhance the performance of large language models (LLMs) in medical reasoning tasks. The authors conducted experiments on various medical benchmarks and found that longer inference times lead to significant performance improvements, even with a small training dataset. They also discovered that more complex tasks require longer reasoning chains, highlighting the importance of extended thought processes. Additionally, the model's differential diagnoses align with the hypothetico-deductive method, showcasing its ability to systematically evaluate potential conditions based on patient symptoms."}, 'zh': {'title': '推理时间扩展助力医学推理能力提升', 'desc': '本研究基于我们之前对O1复制的研究,探讨了在大型语言模型(LLMs)中推理时间扩展对医学推理任务的潜力。通过在不同复杂度的医学基准(如MedQA、Medbullets和JAMA临床挑战)上进行广泛实验,我们发现增加推理时间确实能提高模型性能,尤其是在仅有500个样本的训练集上,性能提升可达6%-11%。此外,任务的复杂性与所需推理链的长度直接相关,表明对于复杂问题需要更长的思考过程。最后,我们的模型生成的差异性诊断遵循假设演绎法的原则,系统地评估证据以缩小可能的病症范围。'}}}, {'id': 'https://huggingface.co/papers/2501.06282', 'title': 'MinMo: A Multimodal Large Language Model for Seamless Voice Interaction', 'url': 'https://huggingface.co/papers/2501.06282', 'abstract': 'Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. 
In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo supports controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, full-duplex latency is approximately 600ms in theory and 800ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.', 'score': 13, 'issue_id': 1651, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '2bd352453760208e', 'authors': ['Qian Chen', 'Yafeng Chen', 'Yanni Chen', 'Mengzhe Chen', 'Yingda Chen', 'Chong Deng', 'Zhihao Du', 'Ruize Gao', 'Changfeng Gao', 'Zhifu Gao', 'Yabin Li', 'Xiang Lv', 'Jiaqing Liu', 'Haoneng Luo', 'Bin Ma', 'Chongjia Ni', 'Xian Shi', 'Jialong Tang', 'Hui Wang', 'Hao Wang', 'Wen Wang', 'Yuxuan Wang', 'Yunlan Xu', 'Fan Yu', 'Zhijie Yan', 'Yexin Yang', 'Baosong Yang', 'Xian Yang', 'Guanrou Yang', 'Tianyu Zhao', 'Qinglin Zhang', 'Shiliang Zhang', 'Nan Zhao', 'Pei Zhang', 'Chong Zhang', 'Jinren Zhou'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.06282.jpg', 'data': {'categories': ['#audio', '#multimodal', '#training'], 'emoji': '🗣️', 'ru': {'title': 'MinMo: революция в голосовом ИИ-взаимодействии', 'desc': 'Статья представляет MinMo - мультимодальную большую языковую модель для беспрепятственного голосового взаимодействия. Модель обучена на 1,4 миллионах часов разнообразных речевых данных и широком спектре речевых задач через несколько этапов выравнивания речи и текста. MinMo достигает передовых результатов в понимании и генерации речи, сохраняя при этом возможности текстовых ЯБМ. Модель также поддерживает полнодуплексное общение и управляемую генерацию речи с различными нюансами, включая эмоции, диалекты и темп речи.'}, 'en': {'title': 'MinMo: Revolutionizing Voice Interactions with Multimodal Learning', 'desc': 'This paper presents MinMo, a Multimodal Large Language Model designed for seamless voice interactions, featuring around 8 billion parameters. It overcomes limitations of previous aligned models by employing a multi-stage training approach that includes speech-to-text, text-to-speech, and duplex interaction alignments, utilizing a vast dataset of 1.4 million hours of diverse speech. MinMo achieves state-of-the-art performance in voice comprehension and generation, enabling full-duplex conversations and enhanced instruction-following capabilities for nuanced speech generation. 
Additionally, it introduces a novel voice decoder that significantly improves voice generation quality compared to earlier models.'}, 'zh': {'title': 'MinMo:无缝语音交互的新突破', 'desc': '本文介绍了一种名为MinMo的多模态大型语言模型,旨在实现无缝的语音交互。MinMo具有约80亿个参数,通过多阶段的对齐训练,克服了以往模型在语音理解和生成方面的局限性。该模型能够支持全双工对话,允许用户与系统进行实时的双向交流。MinMo还具备根据用户指令生成语音的能力,能够调整情感、方言和语速等细节。'}}}, {'id': 'https://huggingface.co/papers/2501.06842', 'title': 'SPAM: Spike-Aware Adam with Momentum Reset for Stable LLM Training', 'url': 'https://huggingface.co/papers/2501.06842', 'abstract': 'Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and experiment restarts, further amplifying inefficiencies. This paper presents a comprehensive investigation into gradient spikes observed during LLM training, revealing their prevalence across multiple architectures and datasets. Our analysis shows that these spikes can be up to 1000 times larger than typical gradients, substantially deteriorating model performance. To address this issue, we propose Spike-Aware Adam with Momentum Reset (SPAM), a novel optimizer designed to counteract gradient spikes through momentum reset and spike-aware gradient clipping. Extensive experiments, including both pre-training and fine-tuning, demonstrate that SPAM consistently surpasses Adam and its variants across various tasks, including (1) LLM pre-training from 60M to 1B, (2) 4-bit LLM pre-training, (3) reinforcement learning, and (4) Time Series Forecasting. Additionally, SPAM facilitates memory-efficient training by enabling sparse momentum, where only a subset of momentum terms are maintained and updated. When operating under memory constraints, SPAM outperforms state-of-the-art memory-efficient optimizers such as GaLore and Adam-Mini. Our work underscores the importance of mitigating gradient spikes in LLM training and introduces an effective optimization strategy that enhances both training stability and resource efficiency at scale. Code is available at https://github.com/TianjinYellow/SPAM-Optimizer.git', 'score': 10, 'issue_id': 1658, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': 'd5fec659e34cf867', 'authors': ['Tianjin Huang', 'Ziquan Zhu', 'Gaojie Jin', 'Lu Liu', 'Zhangyang Wang', 'Shiwei Liu'], 'affiliations': ['Eindhoven University of Technology', 'University of Exeter', 'University of Leicester', 'University of Oxford', 'University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.06842.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '📈', 'ru': {'title': 'SPAM: Стабильное и эффективное обучение языковых моделей', 'desc': 'Исследователи представили новый оптимизатор SPAM (Spike-Aware Adam with Momentum Reset) для обучения больших языковых моделей (LLM). SPAM предназначен для решения проблемы резких скачков градиентов, которые могут быть в 1000 раз больше обычных и нарушают процесс обучения. Оптимизатор использует сброс импульса и адаптивное ограничение градиента для противодействия этим скачкам. 
Эксперименты показали, что SPAM превосходит Adam и его варианты в различных задачах, включая предобучение LLM, обучение с подкреплением и прогнозирование временных рядов.'}, 'en': {'title': 'Taming Gradient Spikes for Stable LLM Training with SPAM', 'desc': 'This paper investigates the issue of gradient spikes during the training of Large Language Models (LLMs), which can lead to instability and inefficiencies. These spikes can be significantly larger than normal gradients, negatively impacting model performance and requiring costly interventions. To combat this problem, the authors propose a new optimizer called Spike-Aware Adam with Momentum Reset (SPAM), which incorporates momentum reset and spike-aware gradient clipping. Experimental results show that SPAM outperforms traditional optimizers like Adam in various tasks while also being more memory-efficient.'}, 'zh': {'title': '应对梯度波动,提升训练稳定性!', 'desc': '大型语言模型(LLMs)在多种任务中表现出色,但其训练过程资源消耗大且容易出现不稳定性。研究发现,梯度和损失的剧烈波动是导致训练不稳定的主要原因,这会影响学习过程并增加干预成本。本文提出了一种新型优化器——Spike-Aware Adam with Momentum Reset(SPAM),旨在通过动量重置和梯度剪切来应对梯度波动。实验结果表明,SPAM在多种任务中均优于传统的Adam优化器,显著提高了训练的稳定性和资源效率。'}}}, {'id': 'https://huggingface.co/papers/2501.07574', 'title': 'UnCommon Objects in 3D', 'url': 'https://huggingface.co/papers/2501.07574', 'abstract': 'We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for 3D deep learning and 3D generative AI. uCO3D is the largest publicly-available collection of high-resolution videos of objects with 3D annotations that ensures full-360° coverage. uCO3D is significantly more diverse than MVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of higher quality, due to extensive quality checks of both the collected videos and the 3D annotations. Similar to analogous datasets, uCO3D contains annotations for 3D camera poses, depth maps and sparse point clouds. In addition, each object is equipped with a caption and a 3D Gaussian Splat reconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D and obtain superior results using the latter, showing that uCO3D is better for learning applications.', 'score': 7, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '79c40f6997052ddd', 'authors': ['Xingchen Liu', 'Piyush Tayal', 'Jianyuan Wang', 'Jesus Zarzar', 'Tom Monnier', 'Konstantinos Tertikas', 'Jiali Duan', 'Antoine Toisoul', 'Jason Y. Zhang', 'Natalia Neverova', 'Andrea Vedaldi', 'Roman Shapovalov', 'David Novotny'], 'affiliations': ['Carnegie Mellon University', 'KAUST', 'Meta AI', 'NKUA, Greece'], 'pdf_title_img': 'assets/pdf/title_img/2501.07574.jpg', 'data': {'categories': ['#dataset', '#open_source', '#synthetic', '#3d'], 'emoji': '🔍', 'ru': {'title': 'uCO3D: Новый стандарт для 3D-данных в машинном обучении', 'desc': 'Авторы представляют новый набор данных uCO3D для глубокого обучения и генеративного ИИ в 3D. Этот датасет содержит высококачественные видео объектов с полным 360-градусным охватом и 3D-аннотациями. uCO3D превосходит аналоги по разнообразию, охватывая более 1000 категорий объектов, и качеству благодаря тщательным проверкам. Помимо стандартных аннотаций, датасет включает подписи к объектам и 3D-реконструкции на основе гауссовых сплатов.'}, 'en': {'title': 'Unlocking 3D Learning with uCO3D: A New Era of Object-Centric Datasets', 'desc': 'The paper presents Uncommon Objects in 3D (uCO3D), a comprehensive dataset designed for advancing 3D deep learning and generative AI. 
This dataset features high-resolution videos with full 360-degree coverage and includes over 1,000 diverse object categories, making it larger and more varied than existing datasets like MVImgNet and CO3Dv2. uCO3D provides detailed annotations such as 3D camera poses, depth maps, and sparse point clouds, along with captions and 3D Gaussian Splat reconstructions for each object. Experiments demonstrate that training large 3D models on uCO3D yields superior performance compared to other datasets, highlighting its effectiveness for learning applications.'}, 'zh': {'title': 'uCO3D:提升3D学习的全新数据集', 'desc': '我们介绍了一个新的3D深度学习和生成AI数据集,名为Uncommon Objects in 3D(uCO3D)。uCO3D是一个公开可用的高分辨率视频集合,包含360度的3D注释,涵盖超过1000个物体类别,具有更高的多样性和质量。该数据集提供了3D相机姿态、深度图和稀疏点云的注释,并为每个物体配备了描述和3D高斯点云重建。通过在多个数据集上训练大型3D模型,我们发现uCO3D在学习应用中表现更优。'}}}, {'id': 'https://huggingface.co/papers/2501.07171', 'title': 'BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and Vision-Language Models Derived from Scientific Literature', 'url': 'https://huggingface.co/papers/2501.07171', 'abstract': 'The development of vision-language models (VLMs) is driven by large-scale and diverse multimodal datasets. However, progress toward generalist biomedical VLMs is limited by the lack of annotated, publicly accessible datasets across biology and medicine. Existing efforts are restricted to narrow domains, missing the full diversity of biomedical knowledge encoded in scientific literature. To address this gap, we introduce BIOMEDICA, a scalable, open-source framework to extract, annotate, and serialize the entirety of the PubMed Central Open Access subset into an easy-to-use, publicly accessible dataset. Our framework produces a comprehensive archive with over 24 million unique image-text pairs from over 6 million articles. Metadata and expert-guided annotations are also provided. We demonstrate the utility and accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style models continuously pre-trained on the BIOMEDICA dataset via streaming, eliminating the need to download 27 TB of data locally. On average, our models achieve state-of-the-art performance across 40 tasks - spanning pathology, radiology, ophthalmology, dermatology, surgery, molecular biology, parasitology, and cell biology - excelling in zero-shot classification with a 6.56% average improvement (as high as 29.8% and 17.5% in dermatology and ophthalmology, respectively), and stronger image-text retrieval, all while using 10x less compute. 
To foster reproducibility and collaboration, we release our codebase and dataset for the broader research community.', 'score': 3, 'issue_id': 1656, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '07db2230e08b0fde', 'authors': ['Alejandro Lozano', 'Min Woo Sun', 'James Burgess', 'Liangyu Chen', 'Jeffrey J Nirschl', 'Jeffrey Gu', 'Ivan Lopez', 'Josiah Aklilu', 'Austin Wolfgang Katzer', 'Collin Chiu', 'Anita Rau', 'Xiaohan Wang', 'Yuhui Zhang', 'Alfred Seunghoon Song', 'Robert Tibshirani', 'Serena Yeung-Levy'], 'affiliations': ['Department of Biomedical Data Science, Stanford University', 'Department of Computer Science, Stanford University', 'Department of Electrical Engineering, Stanford University', 'Department of Pathology, Stanford University', 'Department of Statistics, Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07171.jpg', 'data': {'categories': ['#healthcare', '#cv', '#dataset', '#science', '#multimodal', '#open_source'], 'emoji': '🧬', 'ru': {'title': 'BIOMEDICA: Прорыв в обработке биомедицинских данных с помощью ИИ', 'desc': 'Статья представляет BIOMEDICA - масштабируемый фреймворк с открытым исходным кодом для извлечения и аннотирования биомедицинских данных из научной литературы. Фреймворк создал обширный архив из более чем 24 миллионов уникальных пар изображение-текст из более 6 миллионов статей. На основе этого датасета были обучены модели BMCA-CLIP, достигшие state-of-the-art результатов в 40 биомедицинских задачах. Модели показали значительное улучшение в zero-shot классификации и поиске изображений по тексту при использовании в 10 раз меньших вычислительных ресурсов.'}, 'en': {'title': 'Unlocking Biomedical Knowledge with BIOMEDICA', 'desc': 'This paper presents BIOMEDICA, a new framework designed to create a large, open-source dataset from the PubMed Central Open Access subset, which includes over 24 million image-text pairs from scientific articles. The framework addresses the challenge of limited annotated datasets in the biomedical field, enabling the development of generalist vision-language models (VLMs) that can understand diverse biomedical knowledge. The authors also introduce BMCA-CLIP, a set of models that are continuously pre-trained on this dataset, achieving state-of-the-art performance across various medical tasks with significant improvements in zero-shot classification and image-text retrieval. By making their codebase and dataset publicly available, they aim to enhance reproducibility and collaboration in biomedical research.'}, 'zh': {'title': '推动生物医学领域的视觉语言模型发展', 'desc': '本文介绍了BIOMEDICA,一个可扩展的开源框架,用于提取、注释和序列化PubMed Central开放获取子集的全部内容。该框架生成了一个包含超过2400万个独特图像-文本对的综合档案,来自超过600万篇文章。我们还提供了元数据和专家指导的注释,并展示了BMCA-CLIP模型在40个医学任务中的优越性能,尤其在零样本分类和图像-文本检索方面表现突出。通过发布代码库和数据集,我们促进了研究的可重复性和合作。'}}}, {'id': 'https://huggingface.co/papers/2501.06590', 'title': 'ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning', 'url': 'https://huggingface.co/papers/2501.06590', 'abstract': 'Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. 
To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent', 'score': 3, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c217e826245ef357', 'authors': ['Xiangru Tang', 'Tianyu Hu', 'Muyang Ye', 'Yanjun Shao', 'Xunjian Yin', 'Siru Ouyang', 'Wangchunshu Zhou', 'Pan Lu', 'Zhuosheng Zhang', 'Yilun Zhao', 'Arman Cohan', 'Mark Gerstein'], 'affiliations': ['Shanghai Jiao Tong University', 'Stanford University', 'UIUC', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06590.jpg', 'data': {'categories': ['#science', '#reasoning', '#multimodal', '#agents', '#dataset'], 'emoji': '🧪', 'ru': {'title': 'ChemAgent: Умный помощник для LLM в химических задачах', 'desc': 'ChemAgent - это новая система, улучшающая работу больших языковых моделей (LLM) в задачах химического рассуждения. Она использует динамически обновляемую библиотеку, созданную путем декомпозиции химических задач на подзадачи. При решении новых проблем ChemAgent извлекает и уточняет релевантную информацию из библиотеки, что позволяет эффективно декомпозировать задачи и генерировать решения. Система показала значительное превосходство над существующими методами, улучшив производительность LLM до 46% на четырех наборах данных по химическому рассуждению.'}, 'en': {'title': 'Empowering LLMs for Chemical Reasoning with ChemAgent', 'desc': 'This paper introduces ChemAgent, a new framework that enhances large language models (LLMs) for chemical reasoning tasks. It addresses the challenges LLMs face with complex chemical calculations and domain-specific formulas by creating a dynamic library of decomposed sub-tasks. ChemAgent retrieves and refines relevant information from this library, allowing for better task decomposition and solution generation. 
Experimental results show that ChemAgent significantly improves performance on chemical reasoning datasets, indicating its potential for applications in drug discovery and materials science.'}, 'zh': {'title': 'ChemAgent:提升化学推理的智能助手', 'desc': '化学推理通常涉及复杂的多步骤过程,需要精确的计算,哪怕是微小的错误也可能导致严重的后果。大型语言模型(LLMs)在处理特定领域的公式、准确执行推理步骤和有效整合代码时面临困难。为了解决这些问题,我们提出了ChemAgent,一个通过动态自更新库来提升LLMs性能的新框架。该框架通过将化学任务分解为子任务,并将这些子任务编译成结构化的集合,以便在未来查询时参考,从而实现有效的任务分解和解决方案生成。'}}}, {'id': 'https://huggingface.co/papers/2501.06708', 'title': 'Evaluating Sample Utility for Data Selection by Mimicking Model Weights', 'url': 'https://huggingface.co/papers/2501.06708', 'abstract': "Foundation models rely on large-scale web-crawled datasets, which frequently contain noisy data, biases, and irrelevant content. Existing data selection techniques typically use human heuristics, downstream evaluation datasets, or specialized scoring models, and can overlook samples' utility in the training process. Instead, we propose a new approach, Mimic Score, a data quality metric that uses a pretrained reference model as a guide to assess the usefulness of data samples for training a new model. It relies on the alignment between the gradient of the new model parameters and the vector pointing toward the reference model in weight space. Samples that misalign with this direction are considered low-value and can be filtered out. Motivated by the Mimic score, we develop Grad-Mimic, a data selection framework that identifies and prioritizes useful samples, automating the selection process to create effective filters. Empirically, using Mimic scores to guide model training results in consistent performance gains across six image datasets and enhances the performance of CLIP models. Moreover, Mimic scores and their associated filters improve upon existing filtering methods and offer accurate estimation of dataset quality.", 'score': 2, 'issue_id': 1661, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '7560c17a0e1b7234', 'authors': ['Tzu-Heng Huang', 'Manjot Bilkhu', 'Frederic Sala', 'Javier Movellan'], 'affiliations': ['Apple Inc.', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.06708.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#ethics', '#training'], 'emoji': '🧠', 'ru': {'title': 'Умный отбор данных для эффективного обучения моделей', 'desc': 'Предложен новый подход к оценке качества данных для обучения моделей машинного обучения - Mimic Score. Этот метод использует предобученную эталонную модель для оценки полезности образцов данных, анализируя выравнивание градиента параметров новой модели с вектором, указывающим на эталонную модель в пространстве весов. На основе Mimic Score разработан фреймворк Grad-Mimic для автоматизированного отбора полезных образцов данных. Эксперименты показали, что использование Mimic Score приводит к улучшению производительности моделей на нескольких наборах данных изображений и моделей CLIP.'}, 'en': {'title': 'Enhancing Data Selection with Mimic Score for Better Model Training', 'desc': 'This paper introduces a new method called Mimic Score to improve data selection for training foundation models. It uses a pretrained reference model to evaluate the usefulness of data samples by analyzing the alignment of gradients in weight space. Samples that do not align well with the reference model are deemed low-value and can be removed from the training dataset. 
The proposed Grad-Mimic framework automates this selection process, leading to better model performance across various image datasets and outperforming existing data filtering techniques.'}, 'zh': {'title': 'Mimic Score:提升数据选择的新方法', 'desc': '基础模型依赖于大规模的网络爬取数据集,这些数据集常常包含噪声数据、偏见和无关内容。现有的数据选择技术通常使用人工启发式方法、下游评估数据集或专门的评分模型,可能会忽视样本在训练过程中的实用性。我们提出了一种新的方法,称为Mimic Score,这是一种数据质量指标,利用预训练的参考模型来评估数据样本对新模型训练的有用性。基于Mimic Score,我们开发了Grad-Mimic数据选择框架,自动识别和优先选择有用样本,从而提高模型训练的效果。'}}}, {'id': 'https://huggingface.co/papers/2501.03262', 'title': 'REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models', 'url': 'https://huggingface.co/papers/2501.03262', 'abstract': 'Reinforcement Learning from Human Feedback (RLHF) has emerged as a critical approach for aligning large language models with human preferences, witnessing rapid algorithmic evolution through methods such as Proximal Policy Optimization (PPO), Direct Preference Optimization (DPO), REINFORCE Leave One-Out (RLOO), ReMax, and Group Relative Policy Optimization (GRPO). We present REINFORCE++, an enhanced variant of the classical REINFORCE algorithm that incorporates key optimization techniques from PPO while eliminating the need for a critic network. REINFORCE++ achieves three primary objectives: (1) simplicity (2) enhanced training stability, and (3) reduced computational overhead. Through extensive empirical evaluation, we demonstrate that REINFORCE++ exhibits superior stability compared to GRPO and achieves greater computational efficiency than PPO while maintaining comparable performance. The implementation is available at https://github.com/OpenRLHF/OpenRLHF.', 'score': 42, 'issue_id': 1553, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a05acf5aab0c07dd', 'authors': ['Jian Hu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.03262.jpg', 'data': {'categories': ['#training', '#rlhf', '#optimization', '#alignment'], 'emoji': '🤖', 'ru': {'title': 'REINFORCE++: Простой и эффективный алгоритм для RLHF', 'desc': 'В статье представлен REINFORCE++, улучшенная версия алгоритма REINFORCE для обучения с подкреплением на основе обратной связи от человека (RLHF). REINFORCE++ сочетает ключевые техники оптимизации из PPO, но не требует использования критической нейронной сети. Алгоритм отличается простотой, повышенной стабильностью обучения и сниженными вычислительными затратами. Эмпирические исследования показывают, что REINFORCE++ демонстрирует лучшую стабильность по сравнению с GRPO и большую вычислительную эффективность, чем PPO, при сохранении сопоставимой производительности.'}, 'en': {'title': 'REINFORCE++: Simplifying Reinforcement Learning with Human Feedback', 'desc': 'This paper introduces REINFORCE++, a new version of the REINFORCE algorithm designed to improve the training of reinforcement learning models using human feedback. It combines the strengths of Proximal Policy Optimization (PPO) while removing the need for a critic network, making it simpler and more efficient. The authors highlight that REINFORCE++ offers better training stability and lower computational costs compared to existing methods like GRPO and PPO. 
Their experiments show that REINFORCE++ performs well while being easier to use and faster to train.'}, 'zh': {'title': 'REINFORCE++:简化与高效的强化学习新选择', 'desc': '强化学习中的人类反馈(RLHF)是一种重要的方法,用于使大型语言模型更符合人类的偏好。本文提出了REINFORCE++,这是经典REINFORCE算法的增强版本,结合了PPO的优化技术,并且不再需要评论网络。REINFORCE++的主要目标是实现简单性、提高训练稳定性和减少计算开销。通过大量实证评估,我们证明了REINFORCE++在稳定性上优于GRPO,并且在计算效率上超过PPO,同时保持了相似的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02955', 'title': 'MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models', 'url': 'https://huggingface.co/papers/2501.02955', 'abstract': "In recent years, vision language models (VLMs) have made significant advancements in video understanding. However, a crucial capability - fine-grained motion comprehension - remains under-explored in current benchmarks. To address this gap, we propose MotionBench, a comprehensive evaluation benchmark designed to assess the fine-grained motion comprehension of video understanding models. MotionBench evaluates models' motion-level perception through six primary categories of motion-oriented question types and includes data collected from diverse sources, ensuring a broad representation of real-world video content. Experimental results reveal that existing VLMs perform poorly in understanding fine-grained motions. To enhance VLM's ability to perceive fine-grained motion within a limited sequence length of LLM, we conduct extensive experiments reviewing VLM architectures optimized for video feature compression and propose a novel and efficient Through-Encoder (TE) Fusion method. Experiments show that higher frame rate inputs and TE Fusion yield improvements in motion understanding, yet there is still substantial room for enhancement. Our benchmark aims to guide and motivate the development of more capable video understanding models, emphasizing the importance of fine-grained motion comprehension. Project page: https://motion-bench.github.io .", 'score': 30, 'issue_id': 1551, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'a7051c2d239484b4', 'authors': ['Wenyi Hong', 'Yean Cheng', 'Zhuoyi Yang', 'Weihan Wang', 'Lefan Wang', 'Xiaotao Gu', 'Shiyu Huang', 'Yuxiao Dong', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.02955.jpg', 'data': {'categories': ['#architecture', '#optimization', '#benchmark', '#video'], 'emoji': '🎥', 'ru': {'title': 'MotionBench: новый рубеж в понимании движения для моделей компьютерного зрения', 'desc': 'Статья представляет новый бенчмарк MotionBench для оценки способности моделей компьютерного зрения понимать детальные движения в видео. Авторы обнаружили, что существующие модели плохо справляются с этой задачей. Для улучшения результатов предложен новый метод Through-Encoder Fusion, а также использование видео с более высокой частотой кадров. Бенчмарк призван стимулировать развитие более совершенных моделей понимания видео.'}, 'en': {'title': 'Enhancing Video Understanding with Fine-Grained Motion Comprehension', 'desc': "This paper introduces MotionBench, a new benchmark for evaluating how well vision language models (VLMs) understand fine-grained motion in videos. It identifies a gap in current models' abilities to comprehend detailed motion, which is crucial for accurate video analysis. The benchmark includes various motion-oriented question types and diverse video data to ensure comprehensive testing. 
The authors also propose a Through-Encoder Fusion method to improve VLM performance, highlighting the need for further advancements in fine-grained motion comprehension."}, 'zh': {'title': '提升视频理解的细粒度运动能力', 'desc': '近年来,视觉语言模型(VLMs)在视频理解方面取得了显著进展。然而,细粒度运动理解这一关键能力在当前基准测试中仍未得到充分探索。为了解决这一问题,我们提出了MotionBench,这是一个全面的评估基准,旨在评估视频理解模型的细粒度运动理解能力。实验结果表明,现有的VLM在理解细粒度运动方面表现不佳,因此我们提出了一种新颖的Through-Encoder(TE)融合方法,以提高模型的运动理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.03575', 'title': 'Cosmos World Foundation Model Platform for Physical AI', 'url': 'https://huggingface.co/papers/2501.03575', 'abstract': 'Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.', 'score': 25, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'f4b2044cbc1076a8', 'authors': ['NVIDIA', ':', 'Niket Agarwal', 'Arslan Ali', 'Maciej Bala', 'Yogesh Balaji', 'Erik Barker', 'Tiffany Cai', 'Prithvijit Chattopadhyay', 'Yongxin Chen', 'Yin Cui', 'Yifan Ding', 'Daniel Dworakowski', 'Jiaojiao Fan', 'Michele Fenzi', 'Francesco Ferroni', 'Sanja Fidler', 'Dieter Fox', 'Songwei Ge', 'Yunhao Ge', 'Jinwei Gu', 'Siddharth Gururani', 'Ethan He', 'Jiahui Huang', 'Jacob Huffman', 'Pooya Jannaty', 'Jingyi Jin', 'Seung Wook Kim', 'Gergely Klár', 'Grace Lam', 'Shiyi Lan', 'Laura Leal-Taixe', 'Anqi Li', 'Zhaoshuo Li', 'Chen-Hsuan Lin', 'Tsung-Yi Lin', 'Huan Ling', 'Ming-Yu Liu', 'Xian Liu', 'Alice Luo', 'Qianli Ma', 'Hanzi Mao', 'Kaichun Mo', 'Arsalan Mousavian', 'Seungjun Nah', 'Sriharsha Niverty', 'David Page', 'Despoina Paschalidou', 'Zeeshan Patel', 'Lindsey Pavao', 'Morteza Ramezanali', 'Fitsum Reda', 'Xiaowei Ren', 'Vasanth Rao Naik Sabavat', 'Ed Schmerling', 'Stella Shi', 'Bartosz Stefaniak', 'Shitao Tang', 'Lyne Tchapmi', 'Przemek Tredak', 'Wei-Cheng Tseng', 'Jibin Varghese', 'Hao Wang', 'Haoxiang Wang', 'Heng Wang', 'Ting-Chun Wang', 'Fangyin Wei', 'Xinyue Wei', 'Jay Zhangjie Wu', 'Jiashu Xu', 'Wei Yang', 'Lin Yen-Chen', 'Xiaohui Zeng', 'Yu Zeng', 'Jing Zhang', 'Qinsheng Zhang', 'Yuxuan Zhang', 'Qingqing Zhao', 'Artur Zolkowski'], 'affiliations': ['NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03575.jpg', 'data': {'categories': ['#open_source', '#data', '#benchmark', '#architecture', '#video', '#multimodal', '#dataset', '#training'], 'emoji': '🌍', 'ru': {'title': 'Цифровой двойник мира для обучения физического ИИ', 'desc': 'Статья представляет платформу Cosmos World Foundation Model для разработки моделей мира в физическом ИИ. Авторы предлагают концепцию базовой модели мира, которую можно дообучать для конкретных приложений. Платформа включает конвейер курации видео, предобученные базовые модели мира, примеры дообучения и токенизаторы видео. 
Проект открытый и доступен на GitHub для помощи разработчикам физического ИИ в решении важных проблем общества.'}, 'en': {'title': 'Empowering Physical AI with Customizable World Models', 'desc': 'This paper introduces the Cosmos World Foundation Model Platform, designed to assist developers in creating tailored world models for Physical AI systems. It emphasizes the necessity of having a digital twin of both the AI and its environment to enable effective training. The platform includes a comprehensive video curation pipeline, pre-trained models, and tools for fine-tuning these models for specific applications. By making the platform and models open-source, the authors aim to empower developers to address significant societal challenges using Physical AI.'}, 'zh': {'title': '构建物理AI的数字双胞胎与世界模型', 'desc': '这篇论文介绍了物理人工智能(Physical AI)在数字训练中的重要性。为了实现这一目标,需要构建一个数字双胞胎(digital twin)和一个世界模型(world model)。我们提出了Cosmos世界基础模型平台,帮助开发者为物理人工智能定制世界模型。该平台提供了视频策划管道、预训练的世界基础模型以及后训练示例,旨在解决社会中的关键问题,并且是开源的。'}}}, {'id': 'https://huggingface.co/papers/2501.03895', 'title': 'LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One Vision Token', 'url': 'https://huggingface.co/papers/2501.03895', 'abstract': 'The advent of real-time large multimodal models (LMMs) like GPT-4o has sparked considerable interest in efficient LMMs. LMM frameworks typically encode visual inputs into vision tokens (continuous representations) and integrate them and textual instructions into the context of large language models (LLMs), where large-scale parameters and numerous context tokens (predominantly vision tokens) result in substantial computational overhead. Previous efforts towards efficient LMMs always focus on replacing the LLM backbone with smaller models, while neglecting the crucial issue of token quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal vision tokens. To achieve a high compression ratio of vision tokens while preserving visual information, we first analyze how LMMs understand vision tokens and find that most vision tokens only play a crucial role in the early layers of LLM backbone, where they mainly fuse visual information into text tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to fuse visual information into text tokens in advance, thereby facilitating the extreme compression of vision tokens fed to LLM backbone into one token. LLaVA-Mini is a unified large multimodal model that can support the understanding of images, high-resolution images, and videos in an efficient manner. Experiments across 11 image-based and 7 video-based benchmarks demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token instead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by 77%, deliver low-latency responses within 40 milliseconds, and process over 10,000 frames of video on the GPU hardware with 24GB of memory.', 'score': 19, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '925d2f81d6fcbb0b', 'authors': ['Shaolei Zhang', 'Qingkai Fang', 'Zhe Yang', 'Yang Feng'], 'affiliations': ['Key Laboratory of AI Safety, Chinese Academy of Sciences', 'Key Laboratory of Intelligent Information Processing, Institute of Computing Technology, Chinese Academy of Sciences (ICT/CAS)', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03895.jpg', 'data': {'categories': ['#agi', '#video', '#multimodal', '#architecture', '#optimization', '#cv', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Эффективность через минимизацию: революция в мультимодальных моделях', 'desc': 'Статья представляет LLaVA-Mini - эффективную мультимодальную модель с минимальным количеством визуальных токенов. Авторы обнаружили, что большинство визуальных токенов играют ключевую роль только в ранних слоях языковой модели. LLaVA-Mini вводит предварительное слияние модальностей, чтобы объединить визуальную информацию с текстовыми токенами заранее. Эксперименты показывают, что LLaVA-Mini превосходит LLaVA-v1.5, используя всего 1 визуальный токен вместо 576, что значительно повышает эффективность обработки.'}, 'en': {'title': 'Maximizing Efficiency with Minimal Vision Tokens in LMMs', 'desc': 'This paper presents LLaVA-Mini, an efficient large multimodal model (LMM) designed to reduce the number of vision tokens while maintaining visual information integrity. The authors identify that most vision tokens are primarily important in the early layers of the language model, where they integrate visual data with text. By implementing a technique called modality pre-fusion, LLaVA-Mini compresses the input from 576 vision tokens to just one, significantly enhancing efficiency. Experimental results show that LLaVA-Mini not only outperforms its predecessor but also achieves a 77% reduction in computational load and rapid processing times for high-resolution images and videos.'}, 'zh': {'title': '高效多模态模型LLaVA-Mini的创新之路', 'desc': '本文介绍了一种高效的多模态模型LLaVA-Mini,该模型通过减少视觉标记的数量来提高效率。研究发现,大多数视觉标记在大型语言模型的早期层中起着关键作用,因此可以在此之前将视觉信息与文本标记融合。LLaVA-Mini采用了模态预融合的方法,将视觉信息提前融合,从而将输入到语言模型的视觉标记压缩为一个标记。实验结果表明,LLaVA-Mini在多个基准测试中表现优于之前的模型,且显著降低了计算复杂度和延迟。'}}}, {'id': 'https://huggingface.co/papers/2501.04001', 'title': 'Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos', 'url': 'https://huggingface.co/papers/2501.04001', 'abstract': 'This work presents Sa2VA, the first unified model for dense grounded understanding of both images and videos. Unlike existing multi-modal large language models, which are often limited to specific modalities and tasks, Sa2VA supports a wide range of image and video tasks, including referring segmentation and conversation, with minimal one-shot instruction tuning. Sa2VA combines SAM-2, a foundation video segmentation model, with LLaVA, an advanced vision-language model, and unifies text, image, and video into a shared LLM token space. Using the LLM, Sa2VA generates instruction tokens that guide SAM-2 in producing precise masks, enabling a grounded, multi-modal understanding of both static and dynamic visual content. 
Additionally, we introduce Ref-SAV, an auto-labeled dataset containing over 72k object expressions in complex video scenes, designed to boost model performance. We also manually validate 2k video objects in the Ref-SAV datasets to benchmark referring video object segmentation in complex environments. Experiments show that Sa2VA achieves state-of-the-art across multiple tasks, particularly in referring video object segmentation, highlighting its potential for complex real-world applications.', 'score': 16, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'd079946bf74858cd', 'authors': ['Haobo Yuan', 'Xiangtai Li', 'Tao Zhang', 'Zilong Huang', 'Shilin Xu', 'Shunping Ji', 'Yunhai Tong', 'Lu Qi', 'Jiashi Feng', 'Ming-Hsuan Yang'], 'affiliations': ['Bytedance Seed', 'Peking University', 'UC Merced', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04001.jpg', 'data': {'categories': ['#dataset', '#multimodal', '#benchmark', '#cv'], 'emoji': '🎥', 'ru': {'title': 'Sa2VA: Единая модель для понимания изображений и видео', 'desc': 'Sa2VA - это первая унифицированная модель для плотного заземленного понимания изображений и видео. Она объединяет SAM-2 (модель сегментации видео) с LLaVA (продвинутой моделью компьютерного зрения и языка) в едином пространстве токенов большой языковой модели. Sa2VA генерирует токены инструкций, направляющие SAM-2 в создании точных масок, что позволяет осуществлять заземленное мультимодальное понимание как статического, так и динамического визуального контента. Модель достигает передовых результатов в различных задачах, особенно в сегментации объектов по ссылкам в видео.'}, 'en': {'title': 'Sa2VA: Unifying Image and Video Understanding for Enhanced Multi-Modal Tasks', 'desc': 'Sa2VA is a groundbreaking model that integrates image and video understanding into a single framework. It combines the strengths of SAM-2 for video segmentation and LLaVA for vision-language tasks, allowing it to handle various multi-modal tasks with minimal tuning. By creating a shared token space for text, images, and videos, Sa2VA can generate specific instruction tokens that help in accurately segmenting objects in both images and videos. The introduction of the Ref-SAV dataset further enhances its capabilities, enabling it to achieve top performance in complex visual environments.'}, 'zh': {'title': 'Sa2VA:图像与视频的统一理解模型', 'desc': '本研究提出了Sa2VA,这是第一个统一的模型,能够对图像和视频进行密集的基础理解。与现有的多模态大型语言模型不同,Sa2VA支持多种图像和视频任务,包括引用分割和对话,且只需最少的一次性指令调优。Sa2VA结合了基础视频分割模型SAM-2和先进的视觉语言模型LLaVA,将文本、图像和视频统一到共享的LLM令牌空间中。实验表明,Sa2VA在多个任务上达到了最先进的水平,特别是在引用视频对象分割方面,展示了其在复杂现实应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.03847', 'title': 'Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control', 'url': 'https://huggingface.co/papers/2501.03847', 'abstract': 'Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. 
Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation.', 'score': 11, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '975d5fa9d59bde28', 'authors': ['Zekai Gu', 'Rui Yan', 'Jiahao Lu', 'Peng Li', 'Zhiyang Dou', 'Chenyang Si', 'Zhen Dong', 'Qifeng Liu', 'Cheng Lin', 'Ziwei Liu', 'Wenping Wang', 'Yuan Liu'], 'affiliations': ['Hong Kong University of Science and Technology, China', 'Nanyang Technological University, Singapore', 'Texas A&M University, U.S.A', 'The University of Hong Kong, China', 'Wuhan University, China', 'Zhejiang University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03847.jpg', 'data': {'categories': ['#video', '#diffusion', '#3d'], 'emoji': '🎬', 'ru': {'title': 'DaS: Универсальный контроль над генерацией видео через 3D-сигналы', 'desc': 'Авторы представляют новый подход под названием Diffusion as Shader (DaS) для контролируемой генерации видео с помощью диффузионных моделей. В отличие от существующих методов, ограниченных одним типом контроля, DaS поддерживает множество задач управления видео в единой архитектуре. Ключевая идея заключается в использовании 3D-сигналов управления, что делает процесс диффузии видео изначально 3D-ориентированным. DaS демонстрирует сильные возможности управления в различных задачах, включая генерацию видео из 3D-моделей, контроль камеры, перенос движения и манипуляции с объектами.'}, 'en': {'title': 'Empowering Video Generation with 3D Control Signals', 'desc': 'This paper presents Diffusion as Shader (DaS), a new method for generating videos that allows for precise control over various aspects of video creation. Unlike previous models that only used 2D control signals, DaS utilizes 3D tracking videos, which helps in managing the dynamic nature of video content. This approach enables users to manipulate video elements like camera angles and object movements more effectively. The results show that DaS can maintain high-quality video generation while ensuring temporal consistency across frames, even with limited training data.'}, 'zh': {'title': '多样化视频控制的新方法:扩散作为着色器', 'desc': '扩散模型在从文本提示或图像生成高质量视频方面表现出色。然而,精确控制视频生成过程,如相机操作或内容编辑,仍然是一个重大挑战。现有的受控视频生成方法通常仅限于单一控制类型,缺乏处理多样化控制需求的灵活性。本文提出了一种新方法——扩散作为着色器(DaS),它在统一架构中支持多种视频控制任务,利用3D控制信号来实现更灵活的视频控制。'}}}, {'id': 'https://huggingface.co/papers/2501.03936', 'title': 'PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides', 'url': 'https://huggingface.co/papers/2501.03936', 'abstract': 'Automatically generating presentations from documents is a challenging task that requires balancing content quality, visual design, and structural coherence. 
Existing methods primarily focus on improving and evaluating the content quality in isolation, often overlooking visual design and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to understand their structural patterns and content schemas, then drafts outlines and generates slides through code actions to ensure consistency and alignment. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Experiments show that PPTAgent significantly outperforms traditional automatic presentation generation methods across all three dimensions. The code and data are available at https://github.com/icip-cas/PPTAgent.', 'score': 7, 'issue_id': 1557, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '57bb4703056c9e20', 'authors': ['Hao Zheng', 'Xinyan Guan', 'Hao Kong', 'Jia Zheng', 'Hongyu Lin', 'Yaojie Lu', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences', 'Shanghai Jiexin Technology', 'University of Chinese Academy of Sciences'], 'pdf_title_img': 'assets/pdf/title_img/2501.03936.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'PPTAgent: ИИ-помощник для создания презентаций нового уровня', 'desc': 'Исследователи представили PPTAgent - систему для автоматического создания презентаций из документов. В отличие от существующих методов, PPTAgent улучшает не только качество контента, но и визуальный дизайн и структурную согласованность. Система использует двухэтапный подход, вдохновленный рабочим процессом человека: сначала анализирует образцы презентаций, затем создает слайды с помощью программных действий. Авторы также разработали фреймворк PPTEval для комплексной оценки генерируемых презентаций.'}, 'en': {'title': 'PPTAgent: Elevating Presentation Generation with Content, Design, and Coherence', 'desc': 'This paper presents PPTAgent, a novel approach for automatically generating presentations from documents. Unlike existing methods that focus solely on content quality, PPTAgent enhances the overall presentation by considering visual design and structural coherence as well. It employs a two-stage, edit-based process that first analyzes reference presentations to extract patterns and then generates slides through code actions. 
Additionally, the authors introduce PPTEval, a framework for evaluating presentations based on content, design, and coherence, demonstrating that PPTAgent outperforms traditional methods in all areas.'}, 'zh': {'title': '智能生成高质量演示文稿的解决方案', 'desc': '本文提出了一种名为PPTAgent的自动生成演示文稿的方法。该方法通过两阶段的编辑式流程,综合考虑内容质量、视觉设计和结构一致性。PPTAgent首先分析参考演示文稿,以理解其结构模式和内容框架,然后通过代码操作草拟大纲并生成幻灯片。为了全面评估生成演示文稿的质量,本文还引入了PPTEval评估框架,从内容、设计和一致性三个维度进行评估。'}}}, {'id': 'https://huggingface.co/papers/2501.03714', 'title': 'MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval Adjustment for Compact Dynamic 3D Gaussian Splatting', 'url': 'https://huggingface.co/papers/2501.03714', 'abstract': '3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting framework designed for reconstructing novel views in challenging scenarios with complex motions. We introduce Global-to-Local Motion Decomposition (GLMD) to effectively capture dynamic motions in a coarse-to-fine manner. This approach leverages Global Canonical Scaffolds (Global CS) and Local Canonical Scaffolds (Local CS), extending static Scaffold representation to dynamic video reconstruction. For Global CS, we propose Global Anchor Deformation (GAD) to efficiently represent global dynamics along complex motions, by directly deforming the implicit Scaffold attributes which are anchor position, offset, and local context features. Next, we finely adjust local motions via the Local Gaussian Deformation (LGD) of Local CS explicitly. Additionally, we introduce Temporal Interval Adjustment (TIA) to automatically control the temporal coverage of each Local CS during training, allowing MoDecGS to find optimal interval assignments based on the specified number of temporal segments. Extensive evaluations demonstrate that MoDecGS achieves an average 70% reduction in model size over state-of-the-art methods for dynamic 3D Gaussians from real-world dynamic videos while maintaining or even improving rendering quality.', 'score': 5, 'issue_id': 1556, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c6cfa761edc047da', 'authors': ['Sangwoon Kwak', 'Joonsoo Kim', 'Jun Young Jeong', 'Won-Sik Cheong', 'Jihyong Oh', 'Munchurl Kim'], 'affiliations': ['Chung-Ang University', 'Electronics and Telecommunications Research Institute', 'Korea Advanced Institute of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.03714.jpg', 'data': {'categories': ['#3d'], 'emoji': '🎭', 'ru': {'title': 'Эффективное представление сложных движений в динамических сценах', 'desc': 'MoDecGS - это новый фреймворк для эффективной реконструкции динамических сцен с использованием 3D Gaussian Splatting. Он вводит метод Global-to-Local Motion Decomposition (GLMD) для захвата сложных движений, используя Global Canonical Scaffolds и Local Canonical Scaffolds. Фреймворк также включает Global Anchor Deformation (GAD) для представления глобальной динамики и Local Gaussian Deformation (LGD) для точной настройки локальных движений. 
MoDecGS демонстрирует значительное сокращение размера модели при сохранении или улучшении качества рендеринга по сравнению с существующими методами.'}, 'en': {'title': 'Efficient Dynamic Scene Rendering with MoDecGS', 'desc': 'The paper presents MoDecGS, a new framework for 3D Gaussian Splatting that efficiently handles dynamic scenes in neural rendering. It introduces Global-to-Local Motion Decomposition (GLMD) to capture complex motions using both Global and Local Canonical Scaffolds. The method employs Global Anchor Deformation (GAD) for global dynamics and Local Gaussian Deformation (LGD) for fine-tuning local motions. MoDecGS significantly reduces model size by 70% compared to existing methods while enhancing rendering quality, making it suitable for real-world dynamic video reconstruction.'}, 'zh': {'title': '高效动态场景重建的新方法', 'desc': '3D高斯点云(3DGS)在场景表示和神经渲染方面取得了显著进展,但在处理动态场景时仍面临存储需求和复杂运动表示的挑战。为了解决这些问题,我们提出了MoDecGS,一个内存高效的高斯点云框架,旨在重建具有复杂运动的新视角。我们引入了全局到局部运动分解(GLMD),以粗到细的方式有效捕捉动态运动,并扩展了静态支架表示以适应动态视频重建。通过全局锚点变形(GAD)和局部高斯变形(LGD),MoDecGS在保持或提高渲染质量的同时,平均减少了70%的模型大小。'}}}, {'id': 'https://huggingface.co/papers/2501.03931', 'title': 'Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.03931', 'abstract': 'We present Magic Mirror, a framework for generating identity-preserved videos with cinematic-level quality and dynamic motion. While recent advances in video diffusion models have shown impressive capabilities in text-to-video generation, maintaining consistent identity while producing natural motion remains challenging. Previous methods either require person-specific fine-tuning or struggle to balance identity preservation with motion diversity. Built upon Video Diffusion Transformers, our method introduces three key components: (1) a dual-branch facial feature extractor that captures both identity and structural features, (2) a lightweight cross-modal adapter with Conditioned Adaptive Normalization for efficient identity integration, and (3) a two-stage training strategy combining synthetic identity pairs with video data. Extensive experiments demonstrate that Magic Mirror effectively balances identity consistency with natural motion, outperforming existing methods across multiple metrics while requiring minimal parameters added. The code and model will be made publicly available at: https://github.com/dvlab-research/MagicMirror/', 'score': 4, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '1c9696a99b57f781', 'authors': ['Yuechen Zhang', 'Yaoyang Liu', 'Bin Xia', 'Bohao Peng', 'Zexin Yan', 'Eric Lo', 'Jiaya Jia'], 'affiliations': ['CMU', 'CUHK', 'HKUST', 'SmartMore'], 'pdf_title_img': 'assets/pdf/title_img/2501.03931.jpg', 'data': {'categories': ['#training', '#video', '#multimodal', '#open_source', '#synthetic', '#architecture', '#diffusion'], 'emoji': '🪞', 'ru': {'title': 'Магическое зеркало: видео с сохранением личности и естественным движением', 'desc': 'Magic Mirror - это новая система для создания видео с сохранением идентичности и кинематографическим качеством. Она использует модель видеодиффузии и вводит три ключевых компонента: двойной экстрактор лицевых признаков, легкий кросс-модальный адаптер и двухэтапную стратегию обучения. Система эффективно сочетает сохранение идентичности с естественным движением, превосходя существующие методы по нескольким метрикам. 
Magic Mirror требует минимального добавления параметров и будет доступна в открытом доступе.'}, 'en': {'title': 'Magic Mirror: Identity-Preserved Video Generation with Cinematic Quality', 'desc': 'Magic Mirror is a new framework designed to create high-quality videos that maintain the identity of individuals while showcasing dynamic motion. It addresses the challenges faced by previous video generation methods, which often struggled to keep a consistent identity or required extensive fine-tuning for specific individuals. The framework utilizes Video Diffusion Transformers and introduces innovative components like a dual-branch facial feature extractor and a cross-modal adapter to enhance identity integration. Through a two-stage training approach, Magic Mirror achieves a remarkable balance between identity preservation and natural motion, outperforming existing techniques with fewer additional parameters.'}, 'zh': {'title': 'Magic Mirror:保持身份一致的动态视频生成', 'desc': '本文介绍了Magic Mirror,一个用于生成保持身份一致的视频框架,具有电影级质量和动态运动。尽管最近的视频扩散模型在文本到视频生成方面取得了显著进展,但在生成自然运动的同时保持一致的身份仍然具有挑战性。我们的方法基于视频扩散变换器,提出了三个关键组件,以有效整合身份信息并保持运动多样性。实验结果表明,Magic Mirror在多个指标上超越了现有方法,同时增加的参数极少。'}}}, {'id': 'https://huggingface.co/papers/2501.03916', 'title': 'Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback', 'url': 'https://huggingface.co/papers/2501.03916', 'abstract': 'The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers which are ranked by the topic and task attributes. Then, the codes are automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on the benchmark datasets of different topics and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. 
We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification.', 'score': 3, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '9a18a60e788b7840', 'authors': ['Jiakang Yuan', 'Xiangchao Yan', 'Botian Shi', 'Tao Chen', 'Wanli Ouyang', 'Bo Zhang', 'Lei Bai', 'Yu Qiao', 'Bowen Zhou'], 'affiliations': ['Fudan University', 'Shanghai Artificial Intelligence Laboratory'], 'pdf_title_img': 'assets/pdf/title_img/2501.03916.jpg', 'data': {'categories': ['#open_source', '#agents', '#science', '#3d', '#cv', '#benchmark', '#dataset'], 'emoji': '🐬', 'ru': {'title': 'Dolphin: ИИ-ассистент для полного цикла научных исследований', 'desc': 'Статья представляет Dolphin - первую замкнутую систему для автоматического проведения научных исследований. Dolphin генерирует идеи на основе релевантных статей, автоматически создает и отлаживает код для экспериментов, а затем анализирует результаты. Система способна непрерывно генерировать новые идеи и проводить эксперименты в цикле. Эксперименты показали, что Dolphin может предлагать методы, сопоставимые с современными подходами в некоторых задачах машинного обучения.'}, 'en': {'title': 'Dolphin: Automating Scientific Research with AI', 'desc': 'This paper introduces Dolphin, an innovative framework designed to automate the scientific research process. Dolphin operates in a closed-loop system, generating research ideas, conducting experiments, and analyzing results to refine future ideas. It utilizes AI to rank relevant literature and automatically generate and debug code, enhancing research efficiency. The framework has been tested on various benchmark datasets, demonstrating its ability to produce novel ideas and achieve results comparable to leading methods in tasks like image classification.'}, 'zh': {'title': 'Dolphin:自动化科学研究的新纪元', 'desc': '这篇论文介绍了一个名为Dolphin的闭环开放式自动研究框架,旨在提升科学研究的效率。Dolphin能够生成研究想法、进行实验,并根据实验结果反馈生成更高质量的想法。具体来说,Dolphin首先根据相关论文生成新想法,然后自动生成和调试代码,最后分析每个想法的结果并反馈到下一轮生成中。实验结果表明,Dolphin能够持续生成新想法,并在循环中完成实验,且在某些任务上与最先进的方法相当。'}}}, {'id': 'https://huggingface.co/papers/2501.02260', 'title': 'MagicFace: High-Fidelity Facial Expression Editing with Action-Unit Control', 'url': 'https://huggingface.co/papers/2501.02260', 'abstract': "We address the problem of facial expression editing by controlling the relative variation of facial action-unit (AU) from the same person. This enables us to edit this specific person's expression in a fine-grained, continuous and interpretable manner, while preserving their identity, pose, background and detailed facial attributes. Key to our model, which we dub MagicFace, is a diffusion model conditioned on AU variations and an ID encoder to preserve facial details of high consistency. Specifically, to preserve the facial details with the input identity, we leverage the power of pretrained Stable-Diffusion models and design an ID encoder to merge appearance features through self-attention. To keep background and pose consistency, we introduce an efficient Attribute Controller by explicitly informing the model of current background and pose of the target. By injecting AU variations into a denoising UNet, our model can animate arbitrary identities with various AU combinations, yielding superior results in high-fidelity expression editing compared to other facial expression editing works. 
Code is publicly available at https://github.com/weimengting/MagicFace.", 'score': 3, 'issue_id': 1550, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '9eeeb5b132839793', 'authors': ['Mengting Wei', 'Tuomas Varanka', 'Xingxun Jiang', 'Huai-Qian Khor', 'Guoying Zhao'], 'affiliations': ['Center for Machine Vision and Signal Analysis, Faculty of Information Technology and Electrical Engineering, University of Oulu, Oulu, FI-90014, Finland', 'Key Laboratory of Child Development and Learning Science of Ministry of Education, School of Biological Sciences and Medical Engineering, Southeast University, Nanjing 210096, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02260.jpg', 'data': {'categories': ['#multimodal', '#diffusion', '#open_source', '#cv'], 'emoji': '🎭', 'ru': {'title': 'Точное редактирование мимики с сохранением личности', 'desc': 'Статья представляет новый подход к редактированию мимики лица с использованием диффузионной модели, названной MagicFace. Модель позволяет точно и интерпретируемо изменять выражение лица конкретного человека, сохраняя его идентичность, позу и фоновые детали. Ключевым элементом является условная генерация на основе вариаций лицевых единиц действия (AU) и использование ID-энкодера для сохранения деталей лица. MagicFace демонстрирует превосходные результаты в высококачественном редактировании выражений лица по сравнению с другими методами.'}, 'en': {'title': 'MagicFace: Fine-Grained Facial Expression Editing with Consistent Identity', 'desc': 'This paper presents a method for editing facial expressions while maintaining the identity and other attributes of the person. The proposed model, named MagicFace, utilizes a diffusion model that is conditioned on facial action unit (AU) variations, allowing for fine-grained control over expressions. It incorporates a pretrained Stable-Diffusion model and an ID encoder to ensure high consistency in facial details. Additionally, an Attribute Controller is introduced to maintain background and pose consistency during the editing process, resulting in high-fidelity expression animations.'}, 'zh': {'title': '魔法面孔:高保真面部表情编辑的创新之路', 'desc': '我们提出了一种面部表情编辑的方法,通过控制同一人的面部动作单元(AU)的相对变化来实现。这种方法可以细致、连续且可解释地编辑特定人的表情,同时保持他们的身份、姿势、背景和面部细节。我们的模型称为MagicFace,核心是一个基于AU变化的扩散模型和一个ID编码器,以保持面部细节的一致性。通过将AU变化注入去噪UNet,我们的模型能够以高保真度编辑面部表情,效果优于其他相关工作。'}}}, {'id': 'https://huggingface.co/papers/2501.02790', 'title': 'Segmenting Text and Learning Their Rewards for Improved RLHF in Language Model', 'url': 'https://huggingface.co/papers/2501.02790', 'abstract': 'Reinforcement learning from human feedback (RLHF) has been widely adopted to align language models (LMs) with human preference. Prior RLHF works typically take a bandit formulation, which, though intuitive, ignores the sequential nature of LM generation and can suffer from the sparse reward issue. While recent works propose dense token-level RLHF, treating each token as an action may be oversubtle to proper reward assignment. In this paper, we seek to get the best of both by training and utilizing a segment-level reward model, which assigns a reward to each semantically complete text segment that spans over a short sequence of tokens. For reward learning, our method allows dynamic text segmentation and compatibility with standard sequence-preference datasets. 
For effective RL-based LM training against segment reward, we generalize the classical scalar bandit reward normalizers into location-aware normalizer functions and interpolate the segment reward for further densification. With these designs, our method performs competitively on three popular RLHF benchmarks for LM policy: AlpacaEval 2.0, Arena-Hard, and MT-Bench. Ablation studies are conducted to further demonstrate our method.', 'score': 2, 'issue_id': 1562, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'bd19e4a3e48539d4', 'authors': ['Yueqin Yin', 'Shentao Yang', 'Yujia Xie', 'Ziyi Yang', 'Yuting Sun', 'Hany Awadalla', 'Weizhu Chen', 'Mingyuan Zhou'], 'affiliations': ['Microsoft', 'The University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.02790.jpg', 'data': {'categories': ['#training', '#reasoning', '#alignment', '#rlhf', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Сегментарный RLHF: золотая середина между токенами и бандитами', 'desc': 'Данная статья представляет новый подход к обучению языковых моделей с подкреплением на основе обратной связи от человека (RLHF). Авторы предлагают использовать сегментарную модель вознаграждения, которая присваивает награду семантически завершенным текстовым сегментам. Метод позволяет динамическую сегментацию текста и совместим со стандартными наборами данных последовательных предпочтений. Для эффективного RL-обучения языковой модели авторы обобщают классические нормализаторы скалярного бандитного вознаграждения в локально-зависимые функции нормализации.'}, 'en': {'title': 'Enhancing Language Models with Segment-Level Rewards in RLHF', 'desc': 'This paper discusses a new approach to Reinforcement Learning from Human Feedback (RLHF) for language models (LMs). It critiques previous methods that treat the task as a bandit problem, which can overlook the sequential nature of text generation and lead to sparse rewards. The authors propose a segment-level reward model that assigns rewards to complete text segments, improving reward assignment. Their method incorporates dynamic text segmentation and enhances training efficiency by using location-aware normalizer functions, showing competitive results on established RLHF benchmarks.'}, 'zh': {'title': '段落级奖励模型:强化学习的新突破', 'desc': '本论文探讨了如何通过人类反馈进行强化学习(RLHF),以使语言模型(LM)更符合人类偏好。以往的RLHF研究通常采用赌博机模型,但这种方法忽视了语言模型生成的序列特性,并可能面临稀疏奖励的问题。我们提出了一种基于段落级奖励模型的方法,为每个语义完整的文本段落分配奖励,从而克服了以往方法的不足。通过动态文本分割和与标准序列偏好数据集的兼容性,我们的方法在多个RLHF基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.02393', 'title': 'Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers', 'url': 'https://huggingface.co/papers/2501.02393', 'abstract': "We present an approach to modifying Transformer architectures by integrating graph-aware relational reasoning into the attention mechanism, merging concepts from graph neural networks and language modeling. Building on the inherent connection between attention and graph theory, we reformulate the Transformer's attention mechanism as a graph operation and propose Graph-Aware Isomorphic Attention. This method leverages advanced graph modeling strategies, including Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), to enrich the representation of relational structures. Our approach captures complex dependencies and generalizes across tasks, as evidenced by a reduced generalization gap and improved learning performance. 
Additionally, we expand the concept of graph-aware attention to introduce Sparse GIN-Attention, a fine-tuning approach that employs sparse GINs. By interpreting attention matrices as sparse adjacency graphs, this technique enhances the adaptability of pre-trained foundational models with minimal computational overhead, endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning achieves improved training dynamics and better generalization compared to alternative methods like low-rank adaptation (LoRA). We discuss latent graph-like structures within traditional attention mechanisms, offering a new lens through which Transformers can be understood: by evolving Transformers into hierarchical GIN models for relational reasoning. This perspective suggests profound implications for foundational model development, enabling the design of architectures that dynamically adapt to both local and global dependencies. Applications in bioinformatics, materials science, language modeling, and beyond could benefit from this synthesis of relational and sequential data modeling, setting the stage for interpretable and generalizable modeling strategies.", 'score': 1, 'issue_id': 1563, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a200448c9795e159', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics (LAMM) MIT Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.02393.jpg', 'data': {'categories': ['#graphs', '#architecture', '#interpretability', '#training'], 'emoji': '🕸️', 'ru': {'title': 'Трансформеры эволюционируют в графовые модели для реляционного рассуждения', 'desc': 'Статья представляет новый подход к модификации архитектуры Трансформеров путем интеграции графового реляционного рассуждения в механизм внимания. Авторы переформулируют механизм внимания Трансформера как графовую операцию и предлагают Graph-Aware Isomorphic Attention, используя стратегии моделирования графов, такие как Graph Isomorphism Networks (GIN) и Principal Neighborhood Aggregation (PNA). Метод позволяет улучшить представление реляционных структур, уменьшить разрыв в обобщении и повысить производительность обучения. Также предложен метод тонкой настройки Sparse GIN-Attention, который интерпретирует матрицы внимания как разреженные графы смежности, улучшая адаптивность предобученных моделей.'}, 'en': {'title': 'Transforming Attention: Merging Graphs and Transformers for Enhanced Learning', 'desc': 'This paper introduces a new way to enhance Transformer models by incorporating graph-based reasoning into their attention mechanisms. By treating attention as a graph operation, the authors propose a method called Graph-Aware Isomorphic Attention, which utilizes advanced graph techniques to better capture relationships in data. They also present Sparse GIN-Attention, a fine-tuning method that interprets attention matrices as sparse graphs, improving the adaptability of pre-trained models with less computational cost. 
Overall, this approach not only improves learning performance but also opens up new possibilities for applying Transformers in various fields like bioinformatics and language modeling.'}, 'zh': {'title': '图感知注意力:Transformer的新视角', 'desc': '本文提出了一种通过将图感知关系推理整合到注意力机制中来修改Transformer架构的方法。这种方法将Transformer的注意力机制重新表述为图操作,并提出了图感知同构注意力(Graph-Aware Isomorphic Attention)。该方法利用图同构网络(GIN)和主邻域聚合(PNA)等先进的图建模策略,增强了关系结构的表示能力。通过引入稀疏GIN注意力(Sparse GIN-Attention),我们展示了如何在保持计算效率的同时,提升预训练模型的适应性和泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 40, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. 
Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 31, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. 
Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. 
When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 22, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. 
Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 20, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. 
Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. 
Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 14, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. 
Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. 
Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 13, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. 
We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 11, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. 
Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 10, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. 
To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 9, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. 
To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 7, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. 
In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 6, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}, {'id': 'https://huggingface.co/papers/2501.04519', 'title': 'rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking', 'url': 'https://huggingface.co/papers/2501.04519', 'abstract': 'We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. 
rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs\' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. Code and data will be available at https://github.com/microsoft/rStar.', 'score': 100, 'issue_id': 1572, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'b065003de5fa3bde', 'authors': ['Xinyu Guan', 'Li Lyna Zhang', 'Yifei Liu', 'Ning Shang', 'Youran Sun', 'Yi Zhu', 'Fan Yang', 'Mao Yang'], 'affiliations': ['Microsoft', 'Peking University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04519.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#benchmark', '#small_models', '#dataset'], 'emoji': '🧮', 'ru': {'title': 'Малые модели решают большие задачи: rStar-Math превосходит гигантов в математике', 'desc': 'Статья представляет rStar-Math - подход, позволяющий малым языковым моделям (SLM) достичь или превзойти способности крупных моделей в математических рассуждениях. Метод использует поиск по методу Монте-Карло (MCTS) с двумя специально обученными SLM: политикой и моделью вознаграждения. Авторы вводят новые методы синтеза обучающих данных, обучения модели вознаграждения и итеративного улучшения моделей. В результате rStar-Math значительно повышает эффективность SLM на математических тестах, превосходя более крупные модели.'}, 'en': {'title': 'Empowering Small Models to Excel in Math Reasoning', 'desc': 'The paper introduces rStar-Math, a framework that enhances the math reasoning abilities of small language models (SLMs) without relying on larger models. It employs Monte Carlo Tree Search (MCTS) to enable deep thinking, allowing the SLM to perform guided search during problem-solving. Key innovations include a code-augmented Chain of Thought (CoT) data synthesis method for generating verified reasoning paths, a refined process preference model (PPM) for better reward training, and a self-evolution strategy for iterative improvement. 
As a result, rStar-Math significantly boosts the performance of SLMs on math benchmarks, achieving state-of-the-art results in various assessments.'}, 'zh': {'title': '小型语言模型的数学推理新突破', 'desc': 'rStar-Math展示了小型语言模型(SLMs)在数学推理能力上可以与OpenAI的o1相媲美,甚至超越它,而无需从更强大的模型中蒸馏。该方法通过蒙特卡洛树搜索(MCTS)实现“深度思考”,在测试时由SLM驱动的过程奖励模型指导数学策略SLM进行搜索。rStar-Math引入了三项创新来解决训练两个SLM的挑战,包括新颖的代码增强的链式推理数据合成方法和更有效的过程偏好模型(PPM)训练方法。经过四轮自我进化,rStar-Math在747,000个数学问题上生成了数百万个合成解,使SLMs的数学推理能力达到了最先进的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.04682', 'title': 'Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Though', 'url': 'https://huggingface.co/papers/2501.04682', 'abstract': 'We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms. Finally, we outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training. Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms. This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence.', 'score': 42, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '3479f7793755e586', 'authors': ['Violet Xiang', 'Charlie Snell', 'Kanishk Gandhi', 'Alon Albalak', 'Anikait Singh', 'Chase Blagden', 'Duy Phung', 'Rafael Rafailov', 'Nathan Lile', 'Dakota Mahan', 'Louis Castricato', 'Jan-Philipp Franken', 'Nick Haber', 'Chelsea Finn'], 'affiliations': ['Stanford University', 'SynthLabs.ai', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.04682.jpg', 'data': {'categories': ['#synthetic', '#training', '#rlhf', '#rl', '#multimodal', '#optimization', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Meta-CoT: новый уровень рассуждений для ИИ', 'desc': 'Исследователи предлагают новую концепцию под названием Meta Chain-of-Thought (Meta-CoT), которая расширяет традиционный подход Chain-of-Thought. Meta-CoT моделирует базовые рассуждения, необходимые для построения цепочки мыслей. Авторы представляют эмпирические доказательства того, что современные языковые модели демонстрируют поведение, согласующееся с контекстным поиском. Они также описывают конкретный процесс обучения модели для генерации Meta-CoT, включающий инструктивную настройку и обучение с подкреплением.'}, 'en': {'title': 'Empowering AI with Enhanced Reasoning through Meta-CoT', 'desc': 'The paper introduces a new framework called Meta Chain-of-Thought (Meta-CoT), which enhances the traditional Chain-of-Thought (CoT) approach by focusing on the reasoning processes behind generating CoTs. It provides experimental results from advanced models that show behaviors similar to in-context search, and discusses techniques for creating Meta-CoT through process supervision, synthetic data, and search algorithms. The authors propose a detailed training pipeline that combines instruction tuning with search traces and reinforcement learning to improve the generation of Meta-CoTs. 
Additionally, the paper raises important questions about scaling, the role of verifiers, and the potential for discovering new reasoning methods, aiming to advance the reasoning capabilities of large language models (LLMs).'}, 'zh': {'title': '推动人工智能推理能力的元思维链', 'desc': '我们提出了一种新颖的框架,称为元思维链(Meta-CoT),它通过明确建模所需的推理过程来扩展传统的思维链(CoT)。我们展示了来自最先进模型的实证证据,这些模型表现出与上下文搜索一致的行为,并探索了通过过程监督、合成数据生成和搜索算法来生成元思维链的方法。最后,我们概述了一个具体的训练流程,结合了指令调优、线性化搜索轨迹和强化学习后训练,以生成元思维链。此项工作为在大型语言模型中实现元思维链提供了理论和实践的路线图,推动了人工智能更强大和更人性化的推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04686', 'title': 'URSA: Understanding and Verifying Chain-of-thought Reasoning in Multimodal Mathematics', 'url': 'https://huggingface.co/papers/2501.04686', 'abstract': 'Chain-of-thought (CoT) reasoning has been widely applied in the mathematical reasoning of Large Language Models (LLMs). Recently, the introduction of derivative process supervision on CoT trajectories has sparked discussions on enhancing scaling capabilities during test time, thereby boosting the potential of these models. However, in multimodal mathematical reasoning, the scarcity of high-quality CoT training data has hindered existing models from achieving high-precision CoT reasoning and has limited the realization of reasoning potential during test time. In this work, we propose a three-module synthesis strategy that integrates CoT distillation, trajectory-format rewriting, and format unification. It results in a high-quality CoT reasoning instruction fine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively validate the state-of-the-art (SOTA) performance of the trained URSA-7B model on multiple multimodal mathematical benchmarks. For test-time scaling, we introduce a data synthesis strategy that automatically generates process annotation datasets, known as DualMath-1.1M, focusing on both interpretation and logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT reasoning capabilities to robust supervision abilities. The trained URSA-RM-7B acts as a verifier, effectively enhancing the performance of URSA-7B at test time. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD) verifying capabilities, showcasing its generalization. Model weights, training data and code will be open-sourced.', 'score': 35, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '089df0fb9a548ce8', 'authors': ['Ruilin Luo', 'Zhuofan Zheng', 'Yifan Wang', 'Yiyao Yu', 'Xinzhe Ni', 'Zicheng Lin', 'Jin Zeng', 'Yujiu Yang'], 'affiliations': ['ByteDance', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04686.jpg', 'data': {'categories': ['#dataset', '#training', '#multimodal', '#data', '#open_source', '#reasoning', '#math', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Усиление мультимодальных математических рассуждений через синтез данных и верификацию', 'desc': 'Статья представляет новый подход к улучшению математических рассуждений в мультимодальных языковых моделях. Авторы предлагают стратегию синтеза высококачественного набора данных MMathCoT-1M для обучения цепочкам рассуждений. Они также вводят метод DualMath-1.1M для генерации аннотаций процесса рассуждений, что позволяет модели URSA-7B перейти от способности рассуждать к возможности проверять рассуждения. 
Результаты показывают улучшение производительности и обобщающей способности модели.'}, 'en': {'title': 'Enhancing Multimodal Mathematical Reasoning with CoT Synthesis', 'desc': "This paper discusses improving mathematical reasoning in Large Language Models (LLMs) using a method called Chain-of-Thought (CoT) reasoning. The authors introduce a new dataset, MMathCoT-1M, which is created through a three-module synthesis strategy to enhance the quality of CoT training data in multimodal mathematics. They also present a data synthesis strategy, DualMath-1.1M, that generates additional training data to improve the model's reasoning capabilities during testing. The results show that their model, URSA-RM-7B, significantly enhances performance and generalization in multimodal mathematical tasks."}, 'zh': {'title': '提升多模态数学推理的链式推理能力', 'desc': '本文探讨了链式推理(CoT)在大型语言模型(LLMs)中的应用,特别是在多模态数学推理中的挑战。由于高质量的CoT训练数据稀缺,现有模型在测试时的推理能力受到限制。为了解决这个问题,作者提出了一种三模块合成策略,生成了高质量的多模态数学推理指令微调数据集MMathCoT-1M。通过进一步训练URSA-7B模型,结合生成的数据集DualMath-1.1M,显著提升了模型在测试时的推理能力和验证能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04227', 'title': 'Agent Laboratory: Using LLM Agents as Research Assistants', 'url': 'https://huggingface.co/papers/2501.04227', 'abstract': 'Historically, scientific discovery has been a lengthy and costly process, demanding substantial time and resources from initial conception to final results. To accelerate scientific discovery, reduce research costs, and improve research quality, we introduce Agent Laboratory, an autonomous LLM-based framework capable of completing the entire research process. This framework accepts a human-provided research idea and progresses through three stages--literature review, experimentation, and report writing to produce comprehensive research outputs, including a code repository and a research report, while enabling users to provide feedback and guidance at each stage. We deploy Agent Laboratory with various state-of-the-art LLMs and invite multiple researchers to assess its quality by participating in a survey, providing human feedback to guide the research process, and then evaluate the final paper. We found that: (1) Agent Laboratory driven by o1-preview generates the best research outcomes; (2) The generated machine learning code is able to achieve state-of-the-art performance compared to existing methods; (3) Human involvement, providing feedback at each stage, significantly improves the overall quality of research; (4) Agent Laboratory significantly reduces research expenses, achieving an 84% decrease compared to previous autonomous research methods. We hope Agent Laboratory enables researchers to allocate more effort toward creative ideation rather than low-level coding and writing, ultimately accelerating scientific discovery.', 'score': 34, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'ff592ae1a5a88909', 'authors': ['Samuel Schmidgall', 'Yusheng Su', 'Ze Wang', 'Ximeng Sun', 'Jialian Wu', 'Xiaodong Yu', 'Jiang Liu', 'Zicheng Liu', 'Emad Barsoum'], 'affiliations': ['AMD', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04227.jpg', 'data': {'categories': ['#science', '#training', '#agents', '#rlhf', '#survey'], 'emoji': '🧪', 'ru': {'title': 'Автономная лаборатория ИИ: революция в научных исследованиях', 'desc': 'Статья представляет Agent Laboratory - автономную систему на основе моделей LLM, способную выполнять полный цикл научного исследования. 
Система проходит через этапы обзора литературы, экспериментов и написания отчета, позволяя пользователям давать обратную связь на каждом этапе. Эксперименты показали, что Agent Laboratory, работающая на модели o1-preview, генерирует лучшие результаты исследований и значительно снижает затраты на исследования. Авторы надеются, что эта система позволит исследователям сосредоточиться на творческом процессе, ускоряя научные открытия.'}, 'en': {'title': 'Accelerating Science with Autonomous Research Frameworks', 'desc': 'The paper presents Agent Laboratory, an autonomous framework that utilizes large language models (LLMs) to streamline the scientific research process. It operates in three stages: conducting a literature review, performing experiments, and writing reports, all while allowing human researchers to provide feedback. The study shows that Agent Laboratory can produce high-quality research outputs, including code that outperforms existing methods, and significantly reduces research costs by 84%. By automating routine tasks, the framework aims to free researchers to focus more on innovative ideas and less on tedious coding and documentation.'}, 'zh': {'title': 'Agent Laboratory:加速科学发现的智能助手', 'desc': '本文介绍了一种名为Agent Laboratory的自主框架,旨在加速科学发现并降低研究成本。该框架基于大型语言模型(LLM),能够完成文献综述、实验和报告撰写等整个研究过程。研究表明,Agent Laboratory在生成研究成果方面表现优异,尤其是在机器学习代码的性能上,达到了最先进的水平。通过人类反馈的参与,研究质量显著提高,同时研究费用减少了84%。'}}}, {'id': 'https://huggingface.co/papers/2501.04306', 'title': 'LLM4SR: A Survey on Large Language Models for Scientific Research', 'url': 'https://huggingface.co/papers/2501.04306', 'abstract': 'In recent years, the rapid advancement of Large Language Models (LLMs) has transformed the landscape of scientific research, offering unprecedented support across various stages of the research cycle. This paper presents the first systematic survey dedicated to exploring how LLMs are revolutionizing the scientific research process. We analyze the unique roles LLMs play across four critical stages of research: hypothesis discovery, experiment planning and implementation, scientific writing, and peer reviewing. Our review comprehensively showcases the task-specific methodologies and evaluation benchmarks. By identifying current challenges and proposing future research directions, this survey not only highlights the transformative potential of LLMs, but also aims to inspire and guide researchers and practitioners in leveraging LLMs to advance scientific inquiry. Resources are available at the following repository: https://github.com/du-nlp-lab/LLM4SR', 'score': 17, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'bfb9039780003b6d', 'authors': ['Ziming Luo', 'Zonglin Yang', 'Zexin Xu', 'Wei Yang', 'Xinya Du'], 'affiliations': ['Nanyang Technological University, Singapore', 'University of Texas at Dallas, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04306.jpg', 'data': {'categories': ['#science', '#survey', '#multimodal', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'LLM как революционный инструмент в научных исследованиях', 'desc': 'Эта статья представляет собой первый систематический обзор роли больших языковых моделей (LLM) в научных исследованиях. Авторы анализируют, как LLM используются на четырех ключевых этапах исследовательского процесса: формирование гипотез, планирование и проведение экспериментов, научное письмо и рецензирование. В работе рассматриваются специфические методологии и критерии оценки для каждой задачи. 
Статья также обсуждает текущие проблемы и предлагает направления для будущих исследований в этой области.'}, 'en': {'title': 'Revolutionizing Research: The Power of Large Language Models', 'desc': 'This paper systematically surveys the impact of Large Language Models (LLMs) on the scientific research process. It identifies how LLMs assist in four key stages: generating hypotheses, planning and conducting experiments, writing scientific papers, and facilitating peer reviews. The authors discuss specific methodologies and evaluation benchmarks for each task, highlighting the transformative potential of LLMs in enhancing research efficiency. Additionally, the paper addresses current challenges and suggests future research directions to further integrate LLMs into scientific inquiry.'}, 'zh': {'title': '大型语言模型:科学研究的变革者', 'desc': '近年来,大型语言模型(LLMs)的快速发展改变了科学研究的格局,为研究周期的各个阶段提供了前所未有的支持。本文首次系统性地调查了LLMs如何革新科学研究过程,分析了它们在假设发现、实验规划与实施、科学写作和同行评审等四个关键阶段的独特作用。我们的综述全面展示了任务特定的方法论和评估基准,并识别了当前面临的挑战,提出了未来的研究方向。通过强调LLMs的变革潜力,本文旨在激励和指导研究人员和从业者利用LLMs推动科学探索。'}}}, {'id': 'https://huggingface.co/papers/2501.04575', 'title': 'InfiGUIAgent: A Multimodal Generalist GUI Agent with Native Reasoning and Reflection', 'url': 'https://huggingface.co/papers/2501.04575', 'abstract': 'Graphical User Interface (GUI) Agents, powered by multimodal large language models (MLLMs), have shown great potential for task automation on computing devices such as computers and mobile phones. However, existing agents face challenges in multi-step reasoning and reliance on textual annotations, limiting their effectiveness. We introduce InfiGUIAgent, an MLLM-based GUI Agent trained with a two-stage supervised fine-tuning pipeline. Stage 1 enhances fundamental skills such as GUI understanding and grounding, while Stage 2 integrates hierarchical reasoning and expectation-reflection reasoning skills using synthesized data to enable native reasoning abilities of the agents. InfiGUIAgent achieves competitive performance on several GUI benchmarks, highlighting the impact of native reasoning skills in enhancing GUI interaction for automation tasks. Resources are available at https://github.com/Reallm-Labs/InfiGUIAgent.', 'score': 14, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '501c7ba58ede235b', 'authors': ['Yuhang Liu', 'Pengxiang Li', 'Zishu Wei', 'Congkai Xie', 'Xueyu Hu', 'Xinchen Xu', 'Shengyu Zhang', 'Xiaotian Han', 'Hongxia Yang', 'Fei Wu'], 'affiliations': ['ByteDance Inc', 'Dalian University of Technology', 'Reallm Labs', 'The Hong Kong Polytechnic University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04575.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#training', '#agents', '#multimodal', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Умный агент GUI: новый уровень автоматизации интерфейсов', 'desc': 'InfiGUIAgent - это агент графического пользовательского интерфейса, основанный на мультимодальных больших языковых моделях (MLLM). Он обучается с помощью двухэтапного процесса точной настройки, который улучшает базовые навыки понимания GUI и развивает способности к иерархическому рассуждению. InfiGUIAgent демонстрирует высокую эффективность в автоматизации задач взаимодействия с GUI, превосходя существующие подходы. 
Разработка направлена на преодоление ограничений, связанных с многошаговыми рассуждениями и зависимостью от текстовых аннотаций.'}, 'en': {'title': 'Empowering GUI Agents with Native Reasoning Skills', 'desc': "InfiGUIAgent is a new type of Graphical User Interface (GUI) agent that uses multimodal large language models (MLLMs) to improve task automation on devices like computers and smartphones. This agent addresses the limitations of existing systems by employing a two-stage supervised fine-tuning process. The first stage focuses on developing basic skills such as understanding and interacting with GUIs, while the second stage enhances the agent's ability to perform complex reasoning tasks. As a result, InfiGUIAgent demonstrates strong performance on various GUI benchmarks, showcasing the importance of advanced reasoning capabilities in automating GUI interactions."}, 'zh': {'title': '提升GUI交互的原生推理能力', 'desc': '本文介绍了一种名为InfiGUIAgent的图形用户界面(GUI)代理,它基于多模态大型语言模型(MLLM)进行任务自动化。InfiGUIAgent通过两阶段的监督微调流程进行训练,第一阶段提升了GUI理解和基础技能,第二阶段则通过合成数据整合了层次推理和期望反思推理能力。该代理在多个GUI基准测试中表现出色,显示了原生推理能力在增强GUI交互中的重要性。此研究为提高计算设备上的自动化任务提供了新的思路和方法。'}}}, {'id': 'https://huggingface.co/papers/2501.02772', 'title': 'GeAR: Generation Augmented Retrieval', 'url': 'https://huggingface.co/papers/2501.02772', 'abstract': 'Document retrieval techniques form the foundation for the development of large-scale information systems. The prevailing methodology is to construct a bi-encoder and compute the semantic similarity. However, such scalar similarity is difficult to reflect enough information and impedes our comprehension of the retrieval results. In addition, this computational process mainly emphasizes the global semantics and ignores the fine-grained semantic relationship between the query and the complex text in the document. In this paper, we propose a new method called Generation Augmented Retrieval (GeAR) that incorporates well-designed fusion and decoding modules. This enables GeAR to generate the relevant text from documents based on the fused representation of the query and the document, thus learning to "focus on" the fine-grained information. Also when used as a retriever, GeAR does not add any computational burden over bi-encoders. To support the training of the new framework, we have introduced a pipeline to efficiently synthesize high-quality data by utilizing large language models. GeAR exhibits competitive retrieval and localization performance across diverse scenarios and datasets. Moreover, the qualitative analysis and the results generated by GeAR provide novel insights into the interpretation of retrieval results. The code, data, and models will be released after completing technical review to facilitate future research.', 'score': 11, 'issue_id': 1572, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'dafa87428ce906b5', 'authors': ['Haoyu Liu', 'Shaohan Huang', 'Jianfeng Liu', 'Yuefeng Zhan', 'Hao Sun', 'Weiwei Deng', 'Feng Sun', 'Furu Wei', 'Qi Zhang'], 'affiliations': ['Microsoft Corporation'], 'pdf_title_img': 'assets/pdf/title_img/2501.02772.jpg', 'data': {'categories': ['#interpretability', '#data', '#rag', '#synthetic', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'GeAR: Новый взгляд на извлечение документов через генерацию', 'desc': 'Статья предлагает новый метод извлечения документов под названием Generation Augmented Retrieval (GeAR). 
В отличие от традиционных би-энкодеров, GeAR использует модули слияния и декодирования для генерации релевантного текста на основе запроса и документа. Это позволяет модели фокусироваться на детальной информации, не увеличивая вычислительную нагрузку. Авторы также разработали конвейер для синтеза качественных данных с помощью больших языковых моделей для обучения GeAR.'}, 'en': {'title': 'GeAR: Enhancing Document Retrieval with Fine-Grained Semantic Focus', 'desc': 'This paper introduces a new method called Generation Augmented Retrieval (GeAR) that enhances document retrieval techniques by focusing on fine-grained semantic relationships. Unlike traditional bi-encoders that primarily assess global semantics, GeAR generates relevant text from documents by fusing the query and document representations. This approach allows for a deeper understanding of retrieval results without increasing computational costs. Additionally, the authors provide a pipeline for synthesizing high-quality training data using large language models, leading to improved performance across various datasets.'}, 'zh': {'title': '生成增强检索:关注细粒度信息的创新方法', 'desc': '本文提出了一种新的文档检索方法,称为生成增强检索(GeAR)。GeAR通过融合查询和文档的表示,生成相关文本,从而关注细粒度信息。与传统的双编码器方法相比,GeAR在检索时不会增加计算负担,同时在多种场景和数据集上表现出竞争力的检索和定位性能。该方法还通过利用大型语言模型合成高质量数据,支持新框架的训练。'}}}, {'id': 'https://huggingface.co/papers/2501.04144', 'title': 'Chirpy3D: Continuous Part Latents for Creative 3D Bird Generation', 'url': 'https://huggingface.co/papers/2501.04144', 'abstract': 'In this paper, we push the boundaries of fine-grained 3D generation into truly creative territory. Current methods either lack intricate details or simply mimic existing objects -- we enable both. By lifting 2D fine-grained understanding into 3D through multi-view diffusion and modeling part latents as continuous distributions, we unlock the ability to generate entirely new, yet plausible parts through interpolation and sampling. A self-supervised feature consistency loss further ensures stable generation of these unseen parts. The result is the first system capable of creating novel 3D objects with species-specific details that transcend existing examples. While we demonstrate our approach on birds, the underlying framework extends beyond things that can chirp! Code will be released at https://github.com/kamwoh/chirpy3d.', 'score': 9, 'issue_id': 1578, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '89e2fad397bf0684', 'authors': ['Kam Woh Ng', 'Jing Yang', 'Jia Wei Sii', 'Jiankang Deng', 'Chee Seng Chan', 'Yi-Zhe Song', 'Tao Xiang', 'Xiatian Zhu'], 'affiliations': ['Imperial College London', 'Universiti Malaya', 'University of Cambridge', 'University of Surrey'], 'pdf_title_img': 'assets/pdf/title_img/2501.04144.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🐦', 'ru': {'title': 'Генерация креативных 3D-моделей с беспрецедентной детализацией', 'desc': 'Эта статья представляет новый метод генерации детализированных 3D-объектов, выходящий за рамки простого копирования существующих примеров. Авторы используют мультиракурсную диффузию и моделирование латентных представлений частей объекта как непрерывных распределений. Это позволяет создавать совершенно новые, но правдоподобные части объектов путем интерполяции и сэмплирования. 
Самоконтролируемая функция потерь обеспечивает стабильную генерацию этих невиданных ранее частей.'}, 'en': {'title': 'Unlocking Creative 3D Generation with Fine-Grained Detail', 'desc': 'This paper introduces a novel approach to generating detailed 3D objects that are not just replicas of existing items. By utilizing multi-view diffusion and treating part latents as continuous distributions, the authors enable the creation of new and realistic 3D parts through interpolation and sampling techniques. A self-supervised feature consistency loss is implemented to maintain stability in generating these novel parts. The system is demonstrated on birds, showcasing its ability to produce unique species-specific details, while the framework is applicable to a broader range of objects.'}, 'zh': {'title': '突破性细粒度3D生成,创造全新物体!', 'desc': '本文提出了一种创新的细粒度3D生成方法,能够创造出全新的3D物体,而不仅仅是模仿现有物体。我们通过多视角扩散将2D细粒度理解提升到3D,并将部分潜变量建模为连续分布,从而实现了新部件的插值和采样生成。自监督特征一致性损失确保了这些未见部件的稳定生成。我们的系统能够生成具有特定物种细节的全新3D对象,超越了现有的示例。'}}}, {'id': 'https://huggingface.co/papers/2501.04689', 'title': 'SPAR3D: Stable Point-Aware Reconstruction of 3D Objects from Single Images', 'url': 'https://huggingface.co/papers/2501.04689', 'abstract': 'We study the problem of single-image 3D object reconstruction. Recent works have diverged into two directions: regression-based modeling and generative modeling. Regression methods efficiently infer visible surfaces, but struggle with occluded regions. Generative methods handle uncertain regions better by modeling distributions, but are computationally expensive and the generation is often misaligned with visible surfaces. In this paper, we present SPAR3D, a novel two-stage approach aiming to take the best of both directions. The first stage of SPAR3D generates sparse 3D point clouds using a lightweight point diffusion model, which has a fast sampling speed. The second stage uses both the sampled point cloud and the input image to create highly detailed meshes. Our two-stage design enables probabilistic modeling of the ill-posed single-image 3D task while maintaining high computational efficiency and great output fidelity. Using point clouds as an intermediate representation further allows for interactive user edits. Evaluated on diverse datasets, SPAR3D demonstrates superior performance over previous state-of-the-art methods, at an inference speed of 0.7 seconds. Project page with code and model: https://spar3d.github.io', 'score': 9, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '00474027a65aa27c', 'authors': ['Zixuan Huang', 'Mark Boss', 'Aaryaman Vasishta', 'James M. Rehg', 'Varun Jampani'], 'affiliations': ['Stability AI', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.04689.jpg', 'data': {'categories': ['#3d'], 'emoji': '🧊', 'ru': {'title': 'SPAR3D: Эффективная реконструкция 3D-объектов с использованием облаков точек', 'desc': 'В статье представлен новый двухэтапный подход SPAR3D для реконструкции 3D-объектов по одному изображению. На первом этапе генерируется разреженное облако точек с помощью легковесной модели диффузии точек. На втором этапе используются сгенерированное облако точек и исходное изображение для создания детализированных 3D-моделей. 
Этот метод сочетает преимущества регрессионного и генеративного моделирования, обеспечивая высокую вычислительную эффективность и качество результатов.'}, 'en': {'title': 'SPAR3D: Efficient and Detailed 3D Reconstruction from a Single Image', 'desc': 'This paper introduces SPAR3D, a new method for reconstructing 3D objects from a single image. It combines regression and generative modeling to efficiently create 3D point clouds and detailed meshes. The first stage generates sparse point clouds quickly, while the second stage refines these into high-quality meshes using the input image. SPAR3D achieves high fidelity and speed, outperforming existing methods and allowing for user interaction with the 3D output.'}, 'zh': {'title': 'SPAR3D:高效的单图像三维重建新方法', 'desc': '我们研究了单幅图像的三维物体重建问题。最近的研究分为两种方向:基于回归的建模和生成建模。回归方法能够有效推断可见表面,但在处理遮挡区域时表现不佳;而生成方法通过建模分布更好地处理不确定区域,但计算开销大且生成结果常常与可见表面不对齐。本文提出了SPAR3D,这是一种新颖的两阶段方法,旨在结合两种方法的优点,快速生成稀疏的三维点云,并利用输入图像创建高细节的网格。'}}}, {'id': 'https://huggingface.co/papers/2501.03271', 'title': 'DPO Kernels: A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich Paradigm for Direct Preference Optimization', 'url': 'https://huggingface.co/papers/2501.03271', 'abstract': 'The rapid rise of large language models (LLMs) has unlocked many applications but also underscores the challenge of aligning them with diverse values and preferences. Direct Preference Optimization (DPO) is central to alignment but constrained by fixed divergences and limited feature transformations. We propose DPO-Kernels, which integrates kernel methods to address these issues through four key contributions: (i) Kernelized Representations with polynomial, RBF, Mahalanobis, and spectral kernels for richer transformations, plus a hybrid loss combining embedding-based and probability-based objectives; (ii) Divergence Alternatives (Jensen-Shannon, Hellinger, Renyi, Bhattacharyya, Wasserstein, and f-divergences) for greater stability; (iii) Data-Driven Selection metrics that automatically choose the best kernel-divergence pair; and (iv) a Hierarchical Mixture of Kernels for both local precision and global modeling. Evaluations on 12 datasets demonstrate state-of-the-art performance in factuality, safety, reasoning, and instruction following. Grounded in Heavy-Tailed Self-Regularization, DPO-Kernels maintains robust generalization for LLMs, offering a comprehensive resource for further alignment research.', 'score': 5, 'issue_id': 1576, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '33d1640aee045ed5', 'authors': ['Amitava Das', 'Suranjana Trivedy', 'Danush Khanna', 'Rajarshi Roy', 'Gurpreet Singh', 'Basab Ghosh', 'Yaswanth Narsupalli', 'Vinija Jain', 'Vasu Sharma', 'Aishwarya Naresh Reganti', 'Aman Chadha'], 'affiliations': ['Amazon AI, USA', 'Artificial Intelligence Institute, University of South Carolina, USA', 'Meta AI, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03271.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#reasoning', '#dataset', '#training'], 'emoji': '🧠', 'ru': {'title': 'DPO-Kernels: Новый подход к выравниванию языковых моделей', 'desc': 'Статья представляет новый метод под названием DPO-Kernels для улучшения выравнивания больших языковых моделей (LLM) с различными ценностями и предпочтениями. Авторы предлагают использовать методы ядер для расширения возможностей прямой оптимизации предпочтений (DPO), включая кернелизованные представления, альтернативные дивергенции и data-driven выбор наилучшей комбинации ядра и дивергенции. 
DPO-Kernels демонстрирует улучшенные результаты в задачах фактологичности, безопасности, рассуждений и следования инструкциям на 12 наборах данных. Метод основан на саморегуляризации с тяжелыми хвостами и обеспечивает надежную генерализацию для LLM.'}, 'en': {'title': 'Enhancing LLM Alignment with DPO-Kernels', 'desc': 'This paper introduces DPO-Kernels, a method designed to improve the alignment of large language models (LLMs) with diverse user values. It enhances Direct Preference Optimization (DPO) by incorporating kernel methods, allowing for more flexible feature transformations and better divergence measures. The approach includes a hybrid loss function, various divergence alternatives, and data-driven selection metrics to optimize performance. Evaluations show that DPO-Kernels achieves state-of-the-art results in key areas such as factuality and safety across multiple datasets.'}, 'zh': {'title': 'DPO-Kernels:提升大型语言模型对齐的创新方法', 'desc': '大型语言模型(LLMs)的快速发展带来了许多应用,但也突显了与多样化价值观和偏好对齐的挑战。直接偏好优化(DPO)是对齐的核心,但受到固定散度和有限特征变换的限制。我们提出了DPO-Kernels,通过四个关键贡献来解决这些问题,包括使用多项式、RBF、Mahalanobis和谱核的核化表示,以及结合嵌入基础和基于概率的目标的混合损失。我们的评估在12个数据集上展示了在事实性、安全性、推理和指令遵循方面的最先进性能,DPO-Kernels为进一步的对齐研究提供了全面的资源。'}}}, {'id': 'https://huggingface.co/papers/2501.04694', 'title': 'EpiCoder: Encompassing Diversity and Complexity in Code Generation', 'url': 'https://huggingface.co/papers/2501.04694', 'abstract': 'Effective instruction tuning is indispensable for optimizing code LLMs, aligning model behavior with user expectations and enhancing model performance in real-world applications. However, most existing methods focus on code snippets, which are limited to specific functionalities and rigid structures, restricting the complexity and diversity of the synthesized data. To address these limitations, we introduce a novel feature tree-based synthesis framework inspired by Abstract Syntax Trees (AST). Unlike AST, which captures syntactic structure of code, our framework models semantic relationships between code elements, enabling the generation of more nuanced and diverse data. The feature tree is constructed from raw data and refined iteratively to increase the quantity and diversity of the extracted features. This process enables the identification of more complex patterns and relationships within the code. By sampling subtrees with controlled depth and breadth, our framework allows precise adjustments to the complexity of the generated code, supporting a wide range of tasks from simple function-level operations to intricate multi-file scenarios. We fine-tuned widely-used base models to create the EpiCoder series, achieving state-of-the-art performance at both the function and file levels across multiple benchmarks. Notably, empirical evidence indicates that our approach shows significant potential in synthesizing highly complex repository-level code data. 
Further analysis elucidates the merits of this approach by rigorously assessing data complexity and diversity through software engineering principles and LLM-as-a-judge method.', 'score': 4, 'issue_id': 1581, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c1ef93cdfc23c2f', 'authors': ['Yaoxiang Wang', 'Haoling Li', 'Xin Zhang', 'Jie Wu', 'Xiao Liu', 'Wenxiang Hu', 'Zhongxin Guo', 'Yangyu Huang', 'Ying Xin', 'Yujiu Yang', 'Jinsong Su', 'Qi Chen', 'Scarlett Li'], 'affiliations': ['Microsoft', 'Tsinghua University', 'Xiamen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04694.jpg', 'data': {'categories': ['#dataset', '#data', '#synthetic', '#training', '#optimization', '#alignment', '#architecture'], 'emoji': '🌳', 'ru': {'title': 'Дерево признаков: новый путь к улучшению языковых моделей для кода', 'desc': 'Статья представляет новый подход к улучшению языковых моделей для программирования с использованием дерева признаков, вдохновленного абстрактными синтаксическими деревьями. Этот метод позволяет генерировать более сложные и разнообразные обучающие данные, моделируя семантические связи между элементами кода. Авторы создали серию моделей EpiCoder, достигших высоких результатов в нескольких бенчмарках. Эмпирические данные показывают потенциал метода для синтеза сложных репозиториев кода.'}, 'en': {'title': 'Unlocking Code Complexity with Feature Trees', 'desc': 'This paper presents a new framework for instruction tuning in code language models (LLMs) that enhances their performance by generating more complex and diverse code data. The proposed feature tree-based synthesis framework goes beyond traditional code snippet methods by modeling semantic relationships between code elements, inspired by Abstract Syntax Trees (AST). By iteratively refining the feature tree, the framework captures intricate patterns and relationships, allowing for the generation of code that ranges from simple functions to complex multi-file scenarios. The authors demonstrate that their fine-tuned EpiCoder models achieve state-of-the-art results across various benchmarks, highlighting the effectiveness of their approach in synthesizing complex repository-level code data.'}, 'zh': {'title': '特征树框架:提升代码生成的复杂性与多样性', 'desc': '本论文提出了一种新的特征树合成框架,用于优化代码大语言模型(LLMs)的指令调优。该框架通过建模代码元素之间的语义关系,克服了现有方法在功能和结构上的局限性,从而生成更复杂和多样化的数据。特征树从原始数据构建,并通过迭代精炼,增加提取特征的数量和多样性。最终,我们通过微调广泛使用的基础模型,创建了EpiCoder系列,在多个基准测试中实现了函数和文件级别的最先进性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04652', 'title': 'Multi-task retriever fine-tuning for domain-specific and efficient RAG', 'url': 'https://huggingface.co/papers/2501.04652', 'abstract': 'Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying Large Language Models (LLMs), as it can address typical limitations such as generating hallucinated or outdated information. However, when building real-world RAG applications, practical issues arise. First, the retrieved information is generally domain-specific. Since it is computationally expensive to fine-tune LLMs, it is more feasible to fine-tune the retriever to improve the quality of the data included in the LLM input. Second, as more applications are deployed in the same real-world system, one cannot afford to deploy separate retrievers. Moreover, these RAG applications normally retrieve different kinds of data. 
Our solution is to instruction fine-tune a small retriever encoder on a variety of domain-specific tasks to allow us to deploy one encoder that can serve many use cases, thereby achieving low-cost, scalability, and speed. We show how this encoder generalizes to out-of-domain settings as well as to an unseen retrieval task on real-world enterprise use cases.', 'score': 1, 'issue_id': 1584, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c906eb3ec9e3da5', 'authors': ['Patrice Béchard', 'Orlando Marquez Ayala'], 'affiliations': ['ServiceNow'], 'pdf_title_img': 'assets/pdf/title_img/2501.04652.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#hallucinations', '#rag', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Универсальный извлекатель информации для эффективного RAG', 'desc': 'Данная статья представляет новый подход к улучшению систем извлечения информации для крупных языковых моделей. Авторы предлагают дообучать небольшой энкодер для извлечения информации на различных доменно-специфичных задачах. Это позволяет использовать один энкодер для множества приложений, обеспечивая масштабируемость и эффективность. Исследование показывает, что такой подход хорошо обобщается на новые домены и задачи извлечения информации в реальных корпоративных сценариях.'}, 'en': {'title': 'One Retriever to Rule Them All: Scalable RAG Solutions', 'desc': 'This paper discusses the challenges of using Retrieval-Augmented Generation (RAG) with Large Language Models (LLMs), particularly the issues of domain-specific information retrieval and the high cost of fine-tuning LLMs. The authors propose a solution that involves instruction fine-tuning a small retriever encoder on multiple domain-specific tasks, allowing it to serve various applications without needing separate retrievers. This approach enhances the quality of data fed into the LLM while maintaining low costs and scalability. The results demonstrate that the fine-tuned encoder can effectively generalize to new, unseen tasks in real-world scenarios.'}, 'zh': {'title': '一个编码器,多种应用,低成本高效能', 'desc': '检索增强生成(RAG)在部署大型语言模型(LLM)时变得非常普遍,因为它可以解决生成虚假或过时信息的典型问题。本文提出了一种解决方案,通过对小型检索器编码器进行指令微调,使其能够在多种特定领域任务上工作,从而实现一个编码器服务多个用例。这样可以降低成本,提高可扩展性和速度,同时避免为每个应用程序部署单独的检索器。我们的实验表明,该编码器在不同领域设置和未见过的检索任务中也能很好地泛化。'}}}, {'id': 'https://huggingface.co/papers/2501.05874', 'title': 'VideoRAG: Retrieval-Augmented Generation over Video Corpus', 'url': 'https://huggingface.co/papers/2501.05874', 'abstract': 'Retrieval-Augmented Generation (RAG) is a powerful strategy to address the issue of generating factually incorrect outputs in foundation models by retrieving external knowledge relevant to queries and incorporating it into their generation process. However, existing RAG approaches have primarily focused on textual information, with some recent advancements beginning to consider images, and they largely overlook videos, a rich source of multimodal knowledge capable of representing events, processes, and contextual details more effectively than any other modality. While a few recent studies explore the integration of videos in the response generation process, they either predefine query-associated videos without retrieving them according to queries, or convert videos into the textual descriptions without harnessing their multimodal richness. 
To tackle these, we introduce VideoRAG, a novel framework that not only dynamically retrieves relevant videos based on their relevance with queries but also utilizes both visual and textual information of videos in the output generation. Further, to operationalize this, our method revolves around the recent advance of Large Video Language Models (LVLMs), which enable the direct processing of video content to represent it for retrieval and seamless integration of the retrieved videos jointly with queries. We experimentally validate the effectiveness of VideoRAG, showcasing that it is superior to relevant baselines.', 'score': 39, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'a6a86d4d49a42b4d', 'authors': ['Soyeong Jeong', 'Kangsan Kim', 'Jinheon Baek', 'Sung Ju Hwang'], 'affiliations': ['DeepAuto.ai', 'KAIST'], 'pdf_title_img': 'assets/pdf/title_img/2501.05874.jpg', 'data': {'categories': ['#multimodal', '#rag', '#interpretability', '#hallucinations', '#video'], 'emoji': '🎥', 'ru': {'title': 'VideoRAG: Обогащение генерации ответов с помощью видеоконтента', 'desc': 'VideoRAG - это новая система для улучшения генерации ответов с использованием видеоконтента. В отличие от существующих подходов, она динамически извлекает релевантные видео и использует как визуальную, так и текстовую информацию из них. VideoRAG основан на Больших Видеоязыковых Моделях (LVLM), которые позволяют напрямую обрабатывать видеоконтент. Экспериментальные результаты показывают превосходство VideoRAG над существующими методами.'}, 'en': {'title': 'Enhancing Generation with Dynamic Video Retrieval', 'desc': "This paper presents VideoRAG, a new framework that enhances the Retrieval-Augmented Generation (RAG) approach by incorporating video content into the generation process. Unlike previous methods that primarily focused on text or predefined videos, VideoRAG dynamically retrieves relevant videos based on the user's query. It leverages both visual and textual information from the videos, allowing for a richer and more accurate output generation. The framework utilizes Large Video Language Models (LVLMs) to effectively process and integrate video content, demonstrating superior performance compared to existing methods."}, 'zh': {'title': '视频检索增强生成:提升多模态知识的利用', 'desc': '检索增强生成(RAG)是一种强大的策略,用于解决基础模型生成事实不准确输出的问题。现有的RAG方法主要集中在文本信息上,最近的一些进展开始考虑图像,但大多数忽视了视频这一丰富的多模态知识源。我们提出了VideoRAG框架,它不仅根据查询动态检索相关视频,还利用视频的视觉和文本信息进行输出生成。实验结果验证了VideoRAG的有效性,显示其优于相关基线。'}}}, {'id': 'https://huggingface.co/papers/2501.03841', 'title': 'OmniManip: Towards General Robotic Manipulation via Object-Centric Interaction Primitives as Spatial Constraints', 'url': 'https://huggingface.co/papers/2501.03841', 'abstract': "The development of general robotic systems capable of manipulating in unstructured environments is a significant challenge. While Vision-Language Models(VLM) excel in high-level commonsense reasoning, they lack the fine-grained 3D spatial understanding required for precise manipulation tasks. Fine-tuning VLM on robotic datasets to create Vision-Language-Action Models(VLA) is a potential solution, but it is hindered by high data collection costs and generalization issues. To address these challenges, we propose a novel object-centric representation that bridges the gap between VLM's high-level reasoning and the low-level precision required for manipulation. 
Our key insight is that an object's canonical space, defined by its functional affordances, provides a structured and semantically meaningful way to describe interaction primitives, such as points and directions. These primitives act as a bridge, translating VLM's commonsense reasoning into actionable 3D spatial constraints. In this context, we introduce a dual closed-loop, open-vocabulary robotic manipulation system: one loop for high-level planning through primitive resampling, interaction rendering and VLM checking, and another for low-level execution via 6D pose tracking. This design ensures robust, real-time control without requiring VLM fine-tuning. Extensive experiments demonstrate strong zero-shot generalization across diverse robotic manipulation tasks, highlighting the potential of this approach for automating large-scale simulation data generation.", 'score': 37, 'issue_id': 1628, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c2dc8cc20b9b990a', 'authors': ['Mingjie Pan', 'Jiyao Zhang', 'Tianshu Wu', 'Yinghao Zhao', 'Wenlong Gao', 'Hao Dong'], 'affiliations': ['AgiBot', 'CFCS, School of CS, Peking University', 'PKU-AgiBot Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.03841.jpg', 'data': {'categories': ['#agents', '#reasoning', '#robotics', '#3d', '#transfer_learning', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Объектно-ориентированный подход к роботизированной манипуляции с использованием VLM', 'desc': 'Статья представляет новый подход к робототехнике, объединяющий возможности моделей визуального языка (VLM) с точным 3D-пониманием, необходимым для манипуляций. Авторы предлагают объектно-ориентированное представление, использующее каноническое пространство объекта для описания примитивов взаимодействия. Система включает два цикла: планирование высокого уровня с использованием VLM и низкоуровневое выполнение с отслеживанием 6D-позы. Эксперименты показывают сильную обобщающую способность в различных задачах робототехнической манипуляции.'}, 'en': {'title': 'Bridging High-Level Reasoning and Low-Level Manipulation in Robotics', 'desc': "This paper addresses the challenge of enabling robots to manipulate objects in unpredictable environments by enhancing Vision-Language Models (VLM) with a new approach. The authors propose a Vision-Language-Action Model (VLA) that utilizes an object-centric representation, focusing on an object's canonical space defined by its functional affordances. This representation helps translate high-level reasoning from VLM into specific 3D spatial actions needed for manipulation tasks. The proposed dual closed-loop system allows for effective planning and execution without the need for extensive fine-tuning, demonstrating strong performance in various robotic tasks."}, 'zh': {'title': '打破高层推理与低层操作的壁垒', 'desc': '本论文探讨了在非结构化环境中操作的通用机器人系统的开发挑战。虽然视觉-语言模型(VLM)在高层次的常识推理方面表现出色,但缺乏精细的三维空间理解能力。我们提出了一种新颖的以对象为中心的表示方法,旨在弥合VLM的高层推理与操作所需的低层精度之间的差距。通过引入双闭环、开放词汇的机器人操作系统,我们实现了高效的实时控制,且无需对VLM进行微调。'}}}, {'id': 'https://huggingface.co/papers/2501.06186', 'title': 'LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs', 'url': 'https://huggingface.co/papers/2501.06186', 'abstract': "Reasoning is a fundamental capability for solving complex multi-step problems, particularly in visual contexts where sequential step-wise understanding is essential. Existing approaches lack a comprehensive framework for evaluating visual reasoning and do not emphasize step-wise problem-solving. 
To this end, we propose a comprehensive framework for advancing step-by-step visual reasoning in large language models (LMMs) through three key contributions. First, we introduce a visual reasoning benchmark specifically designed to evaluate multi-step reasoning tasks. The benchmark presents a diverse set of challenges with eight different categories ranging from complex visual perception to scientific reasoning with over 4k reasoning steps in total, enabling robust evaluation of LLMs' abilities to perform accurate and interpretable visual reasoning across multiple steps. Second, we propose a novel metric that assesses visual reasoning quality at the granularity of individual steps, emphasizing both correctness and logical coherence. The proposed metric offers deeper insights into reasoning performance compared to traditional end-task accuracy metrics. Third, we present a new multimodal visual reasoning model, named LlamaV-o1, trained using a multi-step curriculum learning approach, where tasks are progressively organized to facilitate incremental skill acquisition and problem-solving. The proposed LlamaV-o1 is designed for multi-step reasoning and learns step-by-step through a structured training paradigm. Extensive experiments show that our LlamaV-o1 outperforms existing open-source models and performs favorably against close-source proprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an average score of 67.3 with an absolute gain of 3.8\\% across six benchmarks while being 5 times faster during inference scaling. Our benchmark, model, and code are publicly available.", 'score': 31, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '40e1a0d2c562cda5', 'authors': ['Omkar Thawakar', 'Dinura Dissanayake', 'Ketan More', 'Ritesh Thawkar', 'Ahmed Heakl', 'Noor Ahsan', 'Yuhao Li', 'Mohammed Zumri', 'Jean Lahoud', 'Rao Muhammad Anwer', 'Hisham Cholakkal', 'Ivan Laptev', 'Mubarak Shah', 'Fahad Shahbaz Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linköping University', 'Mohamed bin Zayed University of AI', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2501.06186.jpg', 'data': {'categories': ['#cv', '#benchmark', '#training', '#multimodal', '#open_source', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к совершенному визуальному рассуждению', 'desc': 'Статья представляет комплексный подход к улучшению пошагового визуального рассуждения в больших языковых моделях (LLM). Авторы вводят новый бенчмарк для оценки многошаговых задач визуального рассуждения и метрику для оценки качества рассуждения на уровне отдельных шагов. Они также предлагают новую мультимодальную модель визуального рассуждения LlamaV-o1, обученную с использованием подхода многоступенчатого куррикулярного обучения. Эксперименты показывают, что LlamaV-o1 превосходит существующие модели с открытым исходным кодом и демонстрирует хорошие результаты по сравнению с проприетарными моделями.'}, 'en': {'title': 'Advancing Step-by-Step Visual Reasoning in LLMs', 'desc': "This paper introduces a new framework to enhance visual reasoning in large language models (LLMs) by focusing on step-by-step problem-solving. It presents a visual reasoning benchmark with over 4,000 reasoning steps across eight categories, allowing for thorough evaluation of LLMs' multi-step reasoning capabilities. 
Additionally, a novel metric is proposed to assess the quality of visual reasoning at each step, providing insights beyond traditional accuracy measures. The authors also introduce LlamaV-o1, a multimodal model trained with a curriculum learning approach, which shows significant performance improvements over existing models."}, 'zh': {'title': '提升视觉推理能力的全新框架', 'desc': '本论文提出了一种新的框架,旨在提升大型语言模型(LLMs)在视觉推理中的逐步推理能力。我们设计了一个视觉推理基准,包含多达4000个推理步骤,涵盖复杂的视觉感知和科学推理等八个类别,以便全面评估模型的推理能力。我们还提出了一种新颖的度量标准,专注于逐步推理的正确性和逻辑一致性,提供比传统的任务准确率更深入的洞察。最后,我们介绍了名为LlamaV-o1的多模态视觉推理模型,通过逐步课程学习的方法进行训练,显著提升了推理性能。'}}}, {'id': 'https://huggingface.co/papers/2501.05510', 'title': 'OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding?', 'url': 'https://huggingface.co/papers/2501.05510', 'abstract': 'Temporal Awareness, the ability to reason dynamically based on the timestamp when a question is raised, is the key distinction between offline and online video LLMs. Unlike offline models, which rely on complete videos for static, post hoc analysis, online models process video streams incrementally and dynamically adapt their responses based on the timestamp at which the question is posed. Despite its significance, temporal awareness has not been adequately evaluated in existing benchmarks. To fill this gap, we present OVO-Bench (Online-VideO-Benchmark), a novel video benchmark that emphasizes the importance of timestamps for advanced online video understanding capability benchmarking. OVO-Bench evaluates the ability of video LLMs to reason and respond to events occurring at specific timestamps under three distinct scenarios: (1) Backward tracing: trace back to past events to answer the question. (2) Real-time understanding: understand and respond to events as they unfold at the current timestamp. (3) Forward active responding: delay the response until sufficient future information becomes available to answer the question accurately. OVO-Bench comprises 12 tasks, featuring 644 unique videos and approximately human-curated 2,800 fine-grained meta-annotations with precise timestamps. We combine automated generation pipelines with human curation. With these high-quality samples, we further developed an evaluation pipeline to systematically query video LLMs along the video timeline. Evaluations of nine Video-LLMs reveal that, despite advancements on traditional benchmarks, current models struggle with online video understanding, showing a significant gap compared to human agents. We hope OVO-Bench will drive progress in video LLMs and inspire future research in online video reasoning. 
Our benchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.', 'score': 26, 'issue_id': 1631, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '6f833a01519603d5', 'authors': ['Yifei Li', 'Junbo Niu', 'Ziyang Miao', 'Chunjiang Ge', 'Yuanhang Zhou', 'Qihao He', 'Xiaoyi Dong', 'Haodong Duan', 'Shuangrui Ding', 'Rui Qian', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Conghui He', 'Jiaqi Wang'], 'affiliations': ['Beihang University', 'Communication University of China', 'SenseTime Group', 'Shanghai Artificial Intelligence Laboratory', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05510.jpg', 'data': {'categories': ['#benchmark', '#survey', '#video', '#reasoning'], 'emoji': '⏱️', 'ru': {'title': 'Временная осведомленность как ключ к онлайн-анализу видео для LLM', 'desc': 'Статья представляет новый бенчмарк OVO-Bench для оценки способности видео-LLM моделей к онлайн-анализу видео с учетом временных меток. Бенчмарк включает 12 задач, 644 уникальных видео и около 2800 мета-аннотаций с точными временными метками. OVO-Bench оценивает три сценария: обратное отслеживание, понимание в реальном времени и активное реагирование на будущие события. Результаты тестирования девяти видео-LLM моделей показывают значительное отставание от человеческих возможностей в онлайн-анализе видео.'}, 'en': {'title': 'Enhancing Online Video Understanding with Temporal Awareness', 'desc': 'This paper introduces OVO-Bench, a new benchmark designed to evaluate the temporal awareness of online video language models (LLMs). Unlike offline models that analyze complete videos, online models must dynamically respond to questions based on the specific timestamp of the inquiry. OVO-Bench assesses video LLMs through three scenarios: backward tracing, real-time understanding, and forward active responding, using a dataset of 644 videos and 2,800 meta-annotations. The findings indicate that current video LLMs still lag behind human performance in understanding and reasoning about events in real-time video streams.'}, 'zh': {'title': '提升视频理解能力的时间意识基准', 'desc': '本文提出了OVO-Bench,这是一个新的视频基准,旨在评估视频大语言模型(LLMs)在时间意识方面的能力。时间意识是指模型根据提问时的时间戳动态推理的能力,这与传统的离线模型不同,后者依赖于完整视频进行静态分析。OVO-Bench包含12个任务,使用644个独特视频和约2800个精细的元注释,强调了时间戳在在线视频理解中的重要性。通过对九个视频LLMs的评估,结果显示当前模型在在线视频理解方面仍存在显著差距,远不及人类代理。'}}}, {'id': 'https://huggingface.co/papers/2501.05727', 'title': 'Enabling Scalable Oversight via Self-Evolving Critic', 'url': 'https://huggingface.co/papers/2501.05727', 'abstract': "Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. 
Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3\\% improvement on critique-correction and error identification benchmarks. Our analysis reveals that SCRIT's performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component.", 'score': 17, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '5a9e3b95b6aa1312', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc., Beijing, China', 'Shenzhen Research Institute of Big Data, Shenzhen, China', 'The Chinese University of Hong Kong, Shenzhen, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.05727.jpg', 'data': {'categories': ['#training', '#benchmark', '#optimization', '#rlhf', '#synthetic'], 'emoji': '🔬', 'ru': {'title': 'SCRIT: Самосовершенствующийся критик для LLM', 'desc': 'SCRIT - это новая система для улучшения способностей больших языковых моделей (LLM) к самокритике без внешнего надзора. Она использует синтетические данные, созданные с помощью самокритика на основе контрастного обучения и механизма самопроверки. Реализованная на базе Qwen2.5-72B-Instruct, SCRIT демонстрирует значительное улучшение в задачах критики-коррекции и идентификации ошибок. Анализ показывает, что производительность SCRIT растет с увеличением объема данных и размера модели.'}, 'en': {'title': 'Empowering LLMs with Self-Evolving Critique', 'desc': 'This paper addresses the challenge of providing effective feedback for Large Language Models (LLMs) in tasks where human evaluation is difficult. It introduces SCRIT (Self-evolving CRITic), a framework that enhances the critique capabilities of LLMs without relying on external supervision. SCRIT utilizes synthetic data generated by a contrastive-based self-critic and incorporates a self-validation mechanism to ensure the quality of critiques. The results show that SCRIT significantly improves critique-correction and error identification benchmarks, demonstrating its effectiveness as LLMs scale in size and data.'}, 'zh': {'title': '自我进化,提升批评能力!', 'desc': '尽管大型语言模型(LLMs)表现出色,但在可扩展监督方面面临挑战,特别是在难以进行人类评估的任务中。本文提出了SCRIT(自我进化批评者)框架,旨在提升模型的自我批评能力。SCRIT通过对比自我批评生成合成数据,并利用自我验证机制确保批评质量,从而实现自我改进。实验结果表明,SCRIT在批评纠正和错误识别基准上提高了10.3%的性能,且其表现随着数据和模型规模的增加而提升。'}}}, {'id': 'https://huggingface.co/papers/2501.05452', 'title': 'ReFocus: Visual Editing as a Chain of Thought for Structured Image Understanding', 'url': 'https://huggingface.co/papers/2501.05452', 'abstract': 'Structured image understanding, such as interpreting tables and charts, requires strategically refocusing across various structures and texts within an image, forming a reasoning sequence to arrive at the final answer. However, current multimodal large language models (LLMs) lack this multihop selective attention capability. In this work, we introduce ReFocus, a simple yet effective framework that equips multimodal LLMs with the ability to generate "visual thoughts" by performing visual editing on the input image through code, shifting and refining their visual focuses. Specifically, ReFocus enables multimodal LLMs to generate Python codes to call tools and modify the input image, sequentially drawing boxes, highlighting sections, and masking out areas, thereby enhancing the visual reasoning process. 
We experiment upon a wide range of structured image understanding tasks involving tables and charts. ReFocus largely improves performance on all tasks over GPT-4o without visual editing, yielding an average gain of 11.0% on table tasks and 6.8% on chart tasks. We present an in-depth analysis of the effects of different visual edits, and reasons why ReFocus can improve the performance without introducing additional information. Further, we collect a 14k training set using ReFocus, and prove that such visual chain-of-thought with intermediate information offers a better supervision than standard VQA data, reaching a 8.0% average gain over the same model trained with QA pairs and 2.6% over CoT.', 'score': 7, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '28a63b60414f99da', 'authors': ['Xingyu Fu', 'Minqian Liu', 'Zhengyuan Yang', 'John Corring', 'Yijuan Lu', 'Jianwei Yang', 'Dan Roth', 'Dinei Florencio', 'Cha Zhang'], 'affiliations': ['Microsoft', 'University of Pennsylvania', 'Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.05452.jpg', 'data': {'categories': ['#multimodal', '#interpretability', '#dataset', '#reasoning', '#training', '#cv'], 'emoji': '🔍', 'ru': {'title': 'ReFocus: Улучшение визуального понимания LLM через управляемое редактирование изображений', 'desc': "Статья представляет ReFocus - фреймворк, который наделяет мультимодальные большие языковые модели (LLM) способностью генерировать 'визуальные мысли' путем редактирования входного изображения с помощью кода. ReFocus позволяет LLM последовательно рисовать рамки, выделять секции и маскировать области, улучшая процесс визуального рассуждения. Эксперименты показывают значительное улучшение производительности на задачах понимания структурированных изображений, таких как таблицы и диаграммы. Авторы также доказывают, что визуальная цепочка рассуждений с промежуточной информацией обеспечивает лучшее обучение, чем стандартные данные VQA."}, 'en': {'title': 'Enhancing Visual Reasoning with ReFocus', 'desc': "This paper presents ReFocus, a framework designed to enhance the capabilities of multimodal large language models (LLMs) in structured image understanding tasks, such as interpreting tables and charts. ReFocus allows these models to generate 'visual thoughts' by performing visual edits on input images, which helps them focus on relevant areas and improve their reasoning processes. The framework enables the generation of Python code to manipulate images, such as drawing boxes and highlighting sections, which significantly boosts performance on various tasks. Experimental results show that ReFocus achieves notable improvements over existing models, demonstrating the effectiveness of visual editing in enhancing visual reasoning without adding new information."}, 'zh': {'title': 'ReFocus:提升多模态模型的视觉推理能力', 'desc': '本论文提出了一种名为ReFocus的框架,旨在提升多模态大语言模型在结构化图像理解任务中的表现。ReFocus通过生成Python代码对输入图像进行视觉编辑,使模型能够逐步调整视觉焦点,从而形成更有效的推理过程。实验结果表明,ReFocus在表格和图表任务上显著提高了性能,平均提升分别为11.0%和6.8%。此外,研究还表明,使用ReFocus生成的视觉链式思维提供了比标准问答数据更好的监督效果。'}}}, {'id': 'https://huggingface.co/papers/2501.04698', 'title': 'ConceptMaster: Multi-Concept Video Customization on Diffusion Transformer Models Without Test-Time Tuning', 'url': 'https://huggingface.co/papers/2501.04698', 'abstract': 'Text-to-video generation has made remarkable advancements through diffusion models. However, Multi-Concept Video Customization (MCVC) remains a significant challenge. 
We identify two key challenges in this task: 1) the identity decoupling problem, where directly adopting existing customization methods inevitably mix attributes when handling multiple concepts simultaneously, and 2) the scarcity of high-quality video-entity pairs, which is crucial for training such a model that represents and decouples various concepts well. To address these challenges, we introduce ConceptMaster, an innovative framework that effectively tackles the critical issues of identity decoupling while maintaining concept fidelity in customized videos. Specifically, we introduce a novel strategy of learning decoupled multi-concept embeddings that are injected into the diffusion models in a standalone manner, which effectively guarantees the quality of customized videos with multiple identities, even for highly similar visual concepts. To further overcome the scarcity of high-quality MCVC data, we carefully establish a data construction pipeline, which enables systematic collection of precise multi-concept video-entity data across diverse concepts. A comprehensive benchmark is designed to validate the effectiveness of our model from three critical dimensions: concept fidelity, identity decoupling ability, and video generation quality across six different concept composition scenarios. Extensive experiments demonstrate that our ConceptMaster significantly outperforms previous approaches for this task, paving the way for generating personalized and semantically accurate videos across multiple concepts.', 'score': 6, 'issue_id': 1631, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '6e82dc0b883c447a', 'authors': ['Yuzhou Huang', 'Ziyang Yuan', 'Quande Liu', 'Qiulin Wang', 'Xintao Wang', 'Ruimao Zhang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai'], 'affiliations': ['Kuaishou Technology', 'Sun Yat-sen University', 'The Chinese University of Hong Kong, Shenzhen', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04698.jpg', 'data': {'categories': ['#diffusion', '#benchmark', '#data', '#video', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'ConceptMaster: новый уровень персонализации в генерации видео', 'desc': 'Статья представляет ConceptMaster - новую систему для генерации видео с множественными персонализированными концептами. Авторы решают проблему смешивания атрибутов при одновременной работе с несколькими концептами, предлагая метод обучения раздельных мультиконцептуальных эмбеддингов. Для преодоления нехватки качественных данных разработан специальный конвейер сбора видео-сущностных пар. Эксперименты показывают превосходство ConceptMaster над существующими подходами в точности концептов, способности разделения идентичностей и качестве генерации видео.'}, 'en': {'title': 'Mastering Multi-Concept Video Customization with ConceptMaster', 'desc': "This paper presents ConceptMaster, a new framework for Multi-Concept Video Customization (MCVC) that addresses two main challenges: identity decoupling and the lack of high-quality video-entity pairs. The identity decoupling problem arises when existing methods mix attributes from different concepts, leading to poor customization results. ConceptMaster introduces a novel approach to learn decoupled multi-concept embeddings, which are integrated into diffusion models to ensure high-quality video outputs with distinct identities. 
Additionally, the authors establish a data construction pipeline to systematically gather diverse multi-concept video-entity data, and they validate their model's effectiveness through comprehensive benchmarks across various scenarios."}, 'zh': {'title': 'ConceptMaster:多概念视频定制的新突破', 'desc': '本文介绍了一种名为ConceptMaster的创新框架,旨在解决多概念视频定制中的身份解耦问题和高质量视频实体对的稀缺性。我们提出了一种新的学习策略,通过独立注入解耦的多概念嵌入到扩散模型中,从而保证定制视频的质量。为了克服高质量MCVC数据的不足,我们建立了一个数据构建管道,系统性地收集多概念视频实体数据。实验结果表明,ConceptMaster在概念保真度、身份解耦能力和视频生成质量等方面显著优于之前的方法。'}}}, {'id': 'https://huggingface.co/papers/2501.05707', 'title': 'Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains', 'url': 'https://huggingface.co/papers/2501.05707', 'abstract': 'Large language models (LLMs) have achieved remarkable performance in recent years but are fundamentally limited by the underlying training data. To improve models beyond the training data, recent works have explored how LLMs can be used to generate synthetic data for autonomous self-improvement. However, successive steps of self-improvement can reach a point of diminishing returns. In this work, we propose a complementary approach towards self-improvement where finetuning is applied to a multiagent society of language models. A group of language models, all starting from the same base model, are independently specialized by updating each one using data generated through multiagent interactions among the models. By training each model on independent sets of data, we illustrate how this approach enables specialization across models and diversification over the set of models. As a result, our overall system is able to preserve diverse reasoning chains and autonomously improve over many more rounds of fine-tuning than single-agent self-improvement methods. We quantitatively illustrate the efficacy of the approach across a wide suite of reasoning tasks.', 'score': 5, 'issue_id': 1629, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '3d75785114d08414', 'authors': ['Vighnesh Subramaniam', 'Yilun Du', 'Joshua B. Tenenbaum', 'Antonio Torralba', 'Shuang Li', 'Igor Mordatch'], 'affiliations': ['Google Deepmind', 'Harvard University', 'MIT CSAIL', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05707.jpg', 'data': {'categories': ['#synthetic', '#reasoning', '#training', '#agents'], 'emoji': '🤖', 'ru': {'title': 'Мультиагентное обучение: новый путь к улучшению языковых моделей', 'desc': 'Эта статья представляет новый подход к улучшению больших языковых моделей (LLM) с помощью мультиагентного обучения. Авторы предлагают создать группу моделей, которые взаимодействуют друг с другом для генерации синтетических данных. Каждая модель специализируется на своем наборе данных, что позволяет сохранить разнообразие рассуждений. Этот метод показывает лучшие результаты по сравнению с одноагентными подходами к самоулучшению на различных задачах рассуждения.'}, 'en': {'title': 'Empowering Language Models through Multiagent Self-Improvement', 'desc': 'This paper discusses a new method for improving large language models (LLMs) by using a multiagent system. Instead of relying solely on the original training data, the authors propose that multiple LLMs can interact and generate their own synthetic data, which they then use to fine-tune themselves. This approach allows each model to specialize in different areas, leading to a more diverse set of reasoning capabilities. 
The results show that this multiagent fine-tuning method can enhance performance over many iterations, surpassing traditional single-agent self-improvement techniques.'}, 'zh': {'title': '多智能体模型的自我改进新方法', 'desc': '大型语言模型(LLMs)在最近几年取得了显著的性能,但其根本上受到训练数据的限制。为了超越训练数据,最近的研究探索了如何利用LLMs生成合成数据以实现自主自我改进。本文提出了一种补充的方法,通过在多智能体语言模型的社会中进行微调,来实现自我改进。通过独立训练每个模型,利用模型之间的多智能体交互生成的数据,我们展示了这种方法如何实现模型的专业化和多样化,从而在多个微调轮次中保持多样的推理链。'}}}, {'id': 'https://huggingface.co/papers/2501.04961', 'title': 'Demystifying Domain-adaptive Post-training for Financial LLMs', 'url': 'https://huggingface.co/papers/2501.04961', 'abstract': 'Domain-adaptive post-training of large language models (LLMs) has emerged as a promising approach for specialized domains such as medicine and finance. However, significant challenges remain in identifying optimal adaptation criteria and training strategies across varying data and model configurations. To address these challenges, we introduce FINDAP, a systematic and fine-grained investigation into domain-adaptive post-training of LLMs for the finance domain. Our approach begins by identifying the core capabilities required for the target domain and designing a comprehensive evaluation suite aligned with these needs. We then analyze the effectiveness of key post-training stages, including continual pretraining, instruction tuning, and preference alignment. Building on these insights, we propose an effective training recipe centered on a novel preference data distillation method, which leverages process signals from a generative reward model. The resulting model, Llama-Fin, achieves state-of-the-art performance across a wide range of financial tasks. Our analysis also highlights how each post-training stage contributes to distinct capabilities, uncovering specific challenges and effective solutions, providing valuable insights for domain adaptation of LLMs. Project page: https://github.com/SalesforceAIResearch/FinDap', 'score': 4, 'issue_id': 1642, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ade3590d1cc29d47', 'authors': ['Zixuan Ke', 'Yifei Ming', 'Xuan-Phi Nguyen', 'Caiming Xiong', 'Shafiq Joty'], 'affiliations': ['Salesforce AI Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.04961.jpg', 'data': {'categories': ['#optimization', '#rlhf', '#healthcare', '#transfer_learning', '#training'], 'emoji': '💹', 'ru': {'title': 'Оптимизация LLM для финансов: от анализа до совершенства', 'desc': 'Статья представляет FINDAP - систематический подход к доменно-адаптивному постобучению больших языковых моделей (LLM) для финансовой сферы. Авторы разработали комплексный набор оценок, анализирующий эффективность ключевых этапов постобучения, включая продолжающееся предобучение, инструктивную настройку и выравнивание предпочтений. Предложен эффективный рецепт обучения, основанный на новом методе дистилляции данных предпочтений. Результирующая модель Llama-Fin достигает передовых результатов в широком спектре финансовых задач.'}, 'en': {'title': 'FINDAP: Tailoring LLMs for Finance Excellence', 'desc': 'This paper presents FINDAP, a method for improving large language models (LLMs) specifically for the finance sector through domain-adaptive post-training. It identifies essential capabilities needed for financial tasks and creates a tailored evaluation suite to measure these capabilities. The study examines various post-training techniques, such as continual pretraining and instruction tuning, to determine their effectiveness. 
Ultimately, the authors introduce Llama-Fin, a model that utilizes a novel preference data distillation method, achieving top performance in financial applications while providing insights into the adaptation process.'}, 'zh': {'title': '金融领域的智能适应训练', 'desc': '本文介绍了一种针对金融领域的大型语言模型(LLM)进行领域自适应后训练的方法,称为FINDAP。我们首先识别目标领域所需的核心能力,并设计了与这些需求相一致的综合评估套件。接着,我们分析了关键后训练阶段的有效性,包括持续预训练、指令调优和偏好对齐。最终,我们提出了一种基于新颖偏好数据蒸馏方法的有效训练方案,所得到的模型Llama-Fin在多种金融任务中达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.06187', 'title': 'Multi-subject Open-set Personalization in Video Generation', 'url': 'https://huggingface.co/papers/2501.06187', 'abstract': 'Video personalization methods allow us to synthesize videos with specific concepts such as people, pets, and places. However, existing methods often focus on limited domains, require time-consuming optimization per subject, or support only a single subject. We present Video Alchemist - a video model with built-in multi-subject, open-set personalization capabilities for both foreground objects and background, eliminating the need for time-consuming test-time optimization. Our model is built on a new Diffusion Transformer module that fuses each conditional reference image and its corresponding subject-level text prompt with cross-attention layers. Developing such a large model presents two main challenges: dataset and evaluation. First, as paired datasets of reference images and videos are extremely hard to collect, we sample selected video frames as reference images and synthesize a clip of the target video. However, while models can easily denoise training videos given reference frames, they fail to generalize to new contexts. To mitigate this issue, we design a new automatic data construction pipeline with extensive image augmentations. Second, evaluating open-set video personalization is a challenge in itself. To address this, we introduce a personalization benchmark that focuses on accurate subject fidelity and supports diverse personalization scenarios. Finally, our extensive experiments show that our method significantly outperforms existing personalization methods in both quantitative and qualitative evaluations.', 'score': 4, 'issue_id': 1631, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'fcf16f5f8fe9047a', 'authors': ['Tsai-Shien Chen', 'Aliaksandr Siarohin', 'Willi Menapace', 'Yuwei Fang', 'Kwot Sin Lee', 'Ivan Skorokhodov', 'Kfir Aberman', 'Jun-Yan Zhu', 'Ming-Hsuan Yang', 'Sergey Tulyakov'], 'affiliations': ['CMU', 'Snap Inc.', 'UC Merced'], 'pdf_title_img': 'assets/pdf/title_img/2501.06187.jpg', 'data': {'categories': ['#diffusion', '#synthetic', '#benchmark', '#data', '#optimization', '#video', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'Универсальная персонализация видео без длительной оптимизации', 'desc': 'Статья представляет Video Alchemist - новую модель для персонализации видео с возможностью работы с несколькими объектами. Модель использует новый модуль Diffusion Transformer, который объединяет условные референсные изображения и текстовые промпты. Авторы разработали автоматический конвейер для создания данных с обширными аугментациями изображений. 
Также был создан новый бенчмарк для оценки персонализации видео в открытом наборе.'}, 'en': {'title': 'Revolutionizing Video Personalization with Video Alchemist', 'desc': "The paper introduces Video Alchemist, a novel video personalization model that allows for the synthesis of videos featuring multiple subjects without the need for extensive optimization. It utilizes a Diffusion Transformer module that integrates reference images and text prompts through cross-attention layers, enabling effective personalization for both foreground and background elements. The authors tackle challenges related to dataset creation by employing a new automatic data construction pipeline with image augmentations, which helps improve generalization to new contexts. Additionally, they propose a personalization benchmark to evaluate the model's performance in diverse scenarios, demonstrating that Video Alchemist outperforms existing methods in both quantitative and qualitative assessments."}, 'zh': {'title': '视频个性化的新突破', 'desc': '视频个性化方法可以合成特定概念的视频,如人物、宠物和地点。然而,现有方法通常只关注有限的领域,且每个主题需要耗时的优化,或者仅支持单一主题。我们提出了视频炼金术师(Video Alchemist),这是一种具有内置多主题、开放集个性化能力的视频模型,能够处理前景物体和背景,消除了耗时的测试时间优化需求。我们的模型基于新的扩散变换器模块,结合条件参考图像和相应的主题级文本提示,通过交叉注意力层进行融合。'}}}, {'id': 'https://huggingface.co/papers/2501.05542', 'title': 'Infecting Generative AI With Viruses', 'url': 'https://huggingface.co/papers/2501.05542', 'abstract': 'This study demonstrates a novel approach to testing the security boundaries of Vision-Large Language Model (VLM/ LLM) using the EICAR test file embedded within JPEG images. We successfully executed four distinct protocols across multiple LLM platforms, including OpenAI GPT-4o, Microsoft Copilot, Google Gemini 1.5 Pro, and Anthropic Claude 3.5 Sonnet. The experiments validated that a modified JPEG containing the EICAR signature could be uploaded, manipulated, and potentially executed within LLM virtual workspaces. Key findings include: 1) consistent ability to mask the EICAR string in image metadata without detection, 2) successful extraction of the test file using Python-based manipulation within LLM environments, and 3) demonstration of multiple obfuscation techniques including base64 encoding and string reversal. This research extends Microsoft Research\'s "Penetration Testing Rules of Engagement" framework to evaluate cloud-based generative AI and LLM security boundaries, particularly focusing on file handling and execution capabilities within containerized environments.', 'score': 4, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ac21f1bae807486e', 'authors': ['David Noever', 'Forrest McKee'], 'affiliations': ['PeopleTec, Inc., Huntsville, AL'], 'pdf_title_img': 'assets/pdf/title_img/2501.05542.jpg', 'data': {'categories': ['#cv', '#benchmark', '#data', '#security'], 'emoji': '🛡️', 'ru': {'title': 'Новые горизонты в тестировании безопасности VLM/LLM с помощью EICAR', 'desc': 'Это исследование демонстрирует новый подход к тестированию границ безопасности моделей типа Vision-Large Language Model (VLM/LLM) с использованием тестового файла EICAR, встроенного в изображения JPEG. Эксперименты проводились на нескольких платформах LLM, включая OpenAI GPT-4, Microsoft Copilot, Google Gemini 1.5 Pro и Anthropic Claude 3.5 Sonnet. Ключевые результаты включают успешную маскировку строки EICAR в метаданных изображения, извлечение тестового файла с помощью Python в среде LLM и демонстрацию различных методов обфускации. 
Исследование расширяет рамки оценки безопасности облачных генеративных ИИ и LLM, особенно в отношении обработки файлов и возможностей выполнения в контейнеризированных средах.'}, 'en': {'title': 'Testing Security Boundaries of LLMs with EICAR in JPEGs', 'desc': 'This paper presents a new method for testing the security limits of Vision-Large Language Models (VLMs/LLMs) by embedding the EICAR test file in JPEG images. The authors conducted experiments on various LLM platforms, revealing that modified JPEGs containing the EICAR signature could be uploaded and manipulated without detection. They demonstrated the ability to extract the EICAR file using Python scripts and employed several obfuscation techniques to hide the EICAR string. This research enhances existing security frameworks by focusing on the file handling and execution capabilities of cloud-based generative AI systems.'}, 'zh': {'title': '测试大型语言模型的安全边界新方法', 'desc': '本研究展示了一种新颖的方法,用于测试视觉大型语言模型(VLM/LLM)的安全边界,使用嵌入在JPEG图像中的EICAR测试文件。我们在多个LLM平台上成功执行了四种不同的协议,包括OpenAI GPT-4o、Microsoft Copilot、Google Gemini 1.5 Pro和Anthropic Claude 3.5 Sonnet。实验验证了修改后的JPEG图像可以在LLM虚拟工作区中上传、操控并可能执行。研究的关键发现包括:在图像元数据中无检测地掩盖EICAR字符串、在LLM环境中成功提取测试文件,以及展示多种混淆技术,如base64编码和字符串反转。'}}}, {'id': 'https://huggingface.co/papers/2501.08828', 'title': 'MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents', 'url': 'https://huggingface.co/papers/2501.08828', 'abstract': 'Multi-modal document retrieval is designed to identify and retrieve various forms of multi-modal content, such as figures, tables, charts, and layout information from extensive documents. Despite its significance, there is a notable lack of a robust benchmark to effectively evaluate the performance of systems in multi-modal document retrieval. To address this gap, this work introduces a new benchmark, named MMDocIR, encompassing two distinct tasks: page-level and layout-level retrieval. The former focuses on localizing the most relevant pages within a long document, while the latter targets the detection of specific layouts, offering a more fine-grained granularity than whole-page analysis. A layout can refer to a variety of elements such as textual paragraphs, equations, figures, tables, or charts. The MMDocIR benchmark comprises a rich dataset featuring expertly annotated labels for 1,685 questions and bootstrapped labels for 173,843 questions, making it a pivotal resource for advancing multi-modal document retrieval for both training and evaluation. Through rigorous experiments, we reveal that (i) visual retrievers significantly outperform their text counterparts, (ii) the MMDocIR train set can effectively benefit the training process of multi-modal document retrieval, and (iii) text retrievers leveraging VLM-text perform much better than those using OCR-text. These findings underscore the potential advantages of integrating visual elements for multi-modal document retrieval.', 'score': 17, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'bf9a6df8fecd4ec1', 'authors': ['Kuicai Dong', 'Yujing Chang', 'Xin Deik Goh', 'Dexun Li', 'Ruiming Tang', 'Yong Liu'], 'affiliations': ['Noahs Ark Lab, Huawei'], 'pdf_title_img': 'assets/pdf/title_img/2501.08828.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'MMDocIR: Новый стандарт для мультимодального поиска документов', 'desc': 'Статья представляет новый бенчмарк MMDocIR для оценки систем мультимодального поиска документов.
Бенчмарк включает две задачи: поиск на уровне страниц и на уровне макетов. Датасет содержит экспертные аннотации для 1,685 вопросов и автоматически сгенерированные метки для 173,843 вопросов. Эксперименты показали, что визуальные ретриверы превосходят текстовые, а использование визуально-языковых моделей дает лучшие результаты, чем OCR-текст.'}, 'en': {'title': 'Unlocking Multi-Modal Document Retrieval with MMDocIR', 'desc': 'This paper addresses the challenge of multi-modal document retrieval, which involves finding various types of content like figures and tables in large documents. It introduces a new benchmark called MMDocIR, which includes two tasks: page-level retrieval for finding relevant pages and layout-level retrieval for identifying specific layouts within those pages. The benchmark is supported by a comprehensive dataset with thousands of annotated questions, facilitating better training and evaluation of retrieval systems. The results show that visual retrieval methods outperform text-based methods, highlighting the importance of incorporating visual information in multi-modal retrieval tasks.'}, 'zh': {'title': '多模态文档检索的新基准MMDocIR', 'desc': '多模态文档检索旨在从大量文档中识别和提取各种形式的内容,如图形、表格、图表和布局信息。尽管其重要性显著,但目前缺乏有效评估多模态文档检索系统性能的基准。为了解决这一问题,本文提出了一个新的基准MMDocIR,包含页面级和布局级检索两个任务。通过严格的实验,我们发现视觉检索器的表现显著优于文本检索器,且MMDocIR训练集能有效促进多模态文档检索的训练过程。'}}}, {'id': 'https://huggingface.co/papers/2501.08365', 'title': 'Towards Best Practices for Open Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08365', 'abstract': 'Many AI companies are training their large language models (LLMs) on data without the permission of the copyright owners. The permissibility of doing so varies by jurisdiction: in countries like the EU and Japan, this is allowed under certain restrictions, while in the United States, the legal landscape is more ambiguous. Regardless of the legal status, concerns from creative producers have led to several high-profile copyright lawsuits, and the threat of litigation is commonly cited as a reason for the recent trend towards minimizing the information shared about training datasets by both corporate and public interest actors. This trend in limiting data information causes harm by hindering transparency, accountability, and innovation in the broader ecosystem by denying researchers, auditors, and impacted individuals access to the information needed to understand AI models. While this could be mitigated by training language models on open access and public domain data, at the time of writing, there are no such models (trained at a meaningful scale) due to the substantial technical and sociological challenges in assembling the necessary corpus. These challenges include incomplete and unreliable metadata, the cost and complexity of digitizing physical records, and the diverse set of legal and technical skills required to ensure relevance and responsibility in a quickly changing landscape. 
Building towards a future where AI systems can be trained on openly licensed data that is responsibly curated and governed requires collaboration across legal, technical, and policy domains, along with investments in metadata standards, digitization, and fostering a culture of openness.', 'score': 16, 'issue_id': 1702, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '90686080aa439157', 'authors': ['Stefan Baack', 'Stella Biderman', 'Kasia Odrozek', 'Aviya Skowron', 'Ayah Bdeir', 'Jillian Bommarito', 'Jennifer Ding', 'Maximilian Gahntz', 'Paul Keller', 'Pierre-Carl Langlais', 'Greg Lindahl', 'Sebastian Majstorovic', 'Nik Marda', 'Guilherme Penedo', 'Maarten Van Segbroeck', 'Jennifer Wang', 'Leandro von Werra', 'Mitchell Baker', 'Julie Belião', 'Kasia Chmielinski', 'Marzieh Fadaee', 'Lisa Gutermuth', 'Hynek Kydlíček', 'Greg Leppert', 'EM Lewis-Jong', 'Solana Larsen', 'Shayne Longpre', 'Angela Oduor Lungati', 'Cullen Miller', 'Victor Miller', 'Max Ryabinin', 'Kathleen Siminyu', 'Andrew Strait', 'Mark Surman', 'Anna Tumadóttir', 'Maurice Weber', 'Rebecca Weiss', 'Lee White', 'Thomas Wolf'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08365.jpg', 'data': {'categories': ['#open_source', '#ethics', '#data', '#dataset'], 'emoji': '📚', 'ru': {'title': 'Открытые данные для ответственного ИИ: вызовы и перспективы', 'desc': 'Статья рассматривает проблему обучения больших языковых моделей (LLM) на данных без разрешения правообладателей. Анализируются юридические аспекты этой практики в разных странах и связанные с ней судебные иски. Отмечается тенденция к ограничению информации о наборах данных для обучения, что негативно влияет на прозрачность и подотчетность в сфере ИИ. Обсуждаются вызовы создания моделей на основе открытых данных, включая технические и социологические аспекты.'}, 'en': {'title': 'Towards Transparent AI: The Need for Open Data Collaboration', 'desc': 'This paper discusses the legal and ethical challenges surrounding the training of large language models (LLMs) using copyrighted data without permission. It highlights the varying legal frameworks across different countries, particularly the ambiguity in the United States compared to more defined rules in the EU and Japan. The authors argue that the trend of limiting information about training datasets undermines transparency and innovation in AI, making it difficult for researchers and stakeholders to understand the models. They propose that a shift towards using open access and public domain data is necessary, but emphasize the need for collaboration and investment in infrastructure to overcome the technical and sociological barriers involved.'}, 'zh': {'title': '推动开放许可数据的AI训练未来', 'desc': '许多人工智能公司在没有版权拥有者许可的情况下训练大型语言模型(LLMs)。不同国家对这种做法的合法性有不同的规定,欧盟和日本在某些限制下允许,而美国的法律环境则较为模糊。这种限制数据共享的信息趋势,妨碍了透明度、问责制和创新,影响了研究人员和受影响个体获取理解AI模型所需的信息。为了实现未来能够在开放许可数据上训练AI系统,需要在法律、技术和政策领域进行合作,并投资于元数据标准和数字化。'}}}, {'id': 'https://huggingface.co/papers/2501.08983', 'title': 'CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities', 'url': 'https://huggingface.co/papers/2501.08983', 'abstract': '3D scene generation has garnered growing attention in recent years and has made significant progress. Generating 4D cities is more challenging than 3D scenes due to the presence of structurally complex, visually diverse objects like buildings and vehicles, and heightened human sensitivity to distortions in urban environments. 
To tackle these issues, we propose CityDreamer4D, a compositional generative model specifically tailored for generating unbounded 4D cities. Our main insights are 1) 4D city generation should separate dynamic objects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2) all objects in the 4D scene should be composed of different types of neural fields for buildings, vehicles, and background stuff. Specifically, we propose Traffic Scenario Generator and Unbounded Layout Generator to produce dynamic traffic scenarios and static city layouts using a highly compact BEV representation. Objects in 4D cities are generated by combining stuff-oriented and instance-oriented neural fields for background stuff, buildings, and vehicles. To suit the distinct characteristics of background stuff and instances, the neural fields employ customized generative hash grids and periodic positional embeddings as scene parameterizations. Furthermore, we offer a comprehensive suite of datasets for city generation, including OSM, GoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world city layouts, while the Google Earth and CityTopia datasets deliver large-scale, high-quality city imagery complete with 3D instance annotations. Leveraging its compositional design, CityDreamer4D supports a range of downstream applications, such as instance editing, city stylization, and urban simulation, while delivering state-of-the-art performance in generating realistic 4D cities.', 'score': 11, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '39cd0826d4232170', 'authors': ['Haozhe Xie', 'Zhaoxi Chen', 'Fangzhou Hong', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore 637335'], 'pdf_title_img': 'assets/pdf/title_img/2501.08983.jpg', 'data': {'categories': ['#3d', '#dataset'], 'emoji': '🏙️', 'ru': {'title': 'Композиционная генерация 4D-городов с разделением динамики и статики', 'desc': 'CityDreamer4D - это генеративная модель для создания неограниченных 4D-городов. Она разделяет генерацию динамических объектов (например, транспорта) и статических сцен (зданий, дорог). Модель использует разные типы нейронных полей для зданий, транспорта и фона, применяя специализированные генеративные хеш-сетки и периодические позиционные эмбеддинги. CityDreamer4D демонстрирует передовые результаты в генерации реалистичных 4D-городов и поддерживает различные приложения, включая редактирование объектов и городское моделирование.'}, 'en': {'title': 'Revolutionizing Urban Landscapes: CityDreamer4D for Dynamic City Generation', 'desc': "This paper introduces CityDreamer4D, a generative model designed for creating unbounded 4D cities, which include both static and dynamic elements. The model distinguishes between dynamic objects like vehicles and static structures such as buildings, using specialized neural fields for each type. It employs a compact bird's-eye view (BEV) representation to generate realistic traffic scenarios and city layouts. 
Additionally, the paper provides extensive datasets for training, enabling various applications like instance editing and urban simulation while achieving high-quality results in 4D city generation."}, 'zh': {'title': 'CityDreamer4D:无限4D城市生成的新突破', 'desc': '近年来,3D场景生成受到了越来越多的关注,并取得了显著进展。生成4D城市比3D场景更具挑战性,因为城市环境中存在结构复杂、视觉多样的物体,如建筑和车辆。为了解决这些问题,我们提出了CityDreamer4D,这是一种专门用于生成无限4D城市的组合生成模型。该模型通过将动态物体与静态场景分离,并使用不同类型的神经场来组合城市中的所有物体,从而实现高质量的城市生成。'}}}, {'id': 'https://huggingface.co/papers/2501.08994', 'title': 'RepVideo: Rethinking Cross-Layer Representation for Video Generation', 'url': 'https://huggingface.co/papers/2501.08994', 'abstract': 'Video generation has achieved remarkable progress with the introduction of diffusion models, which have significantly improved the quality of generated videos. However, recent research has primarily focused on scaling up model training, while offering limited insights into the direct impact of representations on the video generation process. In this paper, we initially investigate the characteristics of features in intermediate layers, finding substantial variations in attention maps across different layers. These variations lead to unstable semantic representations and contribute to cumulative differences between features, which ultimately reduce the similarity between adjacent frames and negatively affect temporal coherence. To address this, we propose RepVideo, an enhanced representation framework for text-to-video diffusion models. By accumulating features from neighboring layers to form enriched representations, this approach captures more stable semantic information. These enhanced representations are then used as inputs to the attention mechanism, thereby improving semantic expressiveness while ensuring feature consistency across adjacent frames. Extensive experiments demonstrate that our RepVideo not only significantly enhances the ability to generate accurate spatial appearances, such as capturing complex spatial relationships between multiple objects, but also improves temporal consistency in video generation.', 'score': 10, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '0d164d45ba2a5c71', 'authors': ['Chenyang Si', 'Weichen Fan', 'Zhengyao Lv', 'Ziqi Huang', 'Yu Qiao', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore, 639798', 'Shanghai Artificial Intelligence Laboratory, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08994.jpg', 'data': {'categories': ['#video', '#diffusion', '#architecture'], 'emoji': '🎬', 'ru': {'title': 'RepVideo: стабильные представления для качественной генерации видео', 'desc': 'Статья представляет RepVideo - улучшенную систему представлений для диффузионных моделей генерации видео на основе текста. Авторы обнаружили, что вариации в картах внимания между слоями приводят к нестабильным семантическим представлениям и снижают согласованность соседних кадров. RepVideo решает эту проблему путем накопления признаков из соседних слоев для создания обогащенных представлений. Эксперименты показывают, что RepVideo значительно улучшает способность генерировать точные пространственные образы и повышает временную согласованность при генерации видео.'}, 'en': {'title': 'Enhancing Video Generation with Stable Representations', 'desc': "This paper presents RepVideo, a new framework designed to improve video generation using text-to-video diffusion models. 
It identifies issues with unstable semantic representations caused by variations in attention maps across different layers of the model. By accumulating features from neighboring layers, RepVideo creates more stable and enriched representations that enhance the model's ability to maintain consistency between adjacent frames. The results show that RepVideo significantly improves both the spatial accuracy of generated videos and their temporal coherence, leading to more realistic video outputs."}, 'zh': {'title': '提升视频生成质量的RepVideo框架', 'desc': '本论文探讨了扩散模型在视频生成中的应用,提出了RepVideo框架以改善视频生成的质量。研究发现中间层特征的注意力图存在显著差异,这导致语义表示的不稳定性,进而影响相邻帧之间的相似性和时间一致性。RepVideo通过从相邻层累积特征,形成更丰富的表示,从而捕捉更稳定的语义信息。实验结果表明,RepVideo显著提高了生成视频的空间表现能力和时间一致性。'}}}, {'id': 'https://huggingface.co/papers/2501.07783', 'title': 'Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal Understanding', 'url': 'https://huggingface.co/papers/2501.07783', 'abstract': 'Image pyramids are widely adopted in top-performing methods to obtain multi-scale features for precise visual perception and understanding. However, current image pyramids use the same large-scale model to process multiple resolutions of images, leading to significant computational cost. To address this challenge, we propose a novel network architecture, called Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses pretrained models (ViTs or CNNs) as branches to process multi-scale images, where images of higher resolutions are processed by smaller network branches to balance computational cost and performance. To integrate information from different spatial scales, we further propose a novel cross-branch feature interaction mechanism. To validate PIIP, we apply it to various perception models and a representative multimodal large language model called LLaVA, and conduct extensive experiments on various tasks such as object detection, segmentation, image classification and multimodal understanding. PIIP achieves superior performance compared to single-branch and existing multi-resolution approaches with lower computational cost. When applied to InternViT-6B, a large-scale vision foundation model, PIIP can improve its performance by 1%-2% on detection and segmentation with only 40%-60% of the original computation, finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and 74.5% on MMBench with only 2.8M training data. Our code is released at https://github.com/OpenGVLab/PIIP.', 'score': 5, 'issue_id': 1701, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '87295e912b5b0670', 'authors': ['Zhaokai Wang', 'Xizhou Zhu', 'Xue Yang', 'Gen Luo', 'Hao Li', 'Changyao Tian', 'Wenhan Dou', 'Junqi Ge', 'Lewei Lu', 'Yu Qiao', 'Jifeng Dai'], 'affiliations': ['Sensetime', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07783.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Эффективные многомасштабные сети для точного визуального восприятия', 'desc': 'Статья представляет новую архитектуру нейронных сетей под названием Parameter-Inverted Image Pyramid Networks (PIIP). 
PIIP использует предобученные модели (ViT или CNN) в качестве ветвей для обработки многомасштабных изображений, где изображения с более высоким разрешением обрабатываются меньшими сетевыми ветвями для баланса вычислительных затрат и производительности. Авторы также предлагают новый механизм взаимодействия признаков между ветвями. PIIP демонстрирует превосходную производительность по сравнению с одноветвенными и существующими многоразрешающими подходами при меньших вычислительных затратах в задачах обнаружения объектов, сегментации, классификации изображений и мультимодального понимания.'}, 'en': {'title': 'Efficient Multi-Scale Processing with PIIP Networks', 'desc': 'This paper introduces Parameter-Inverted Image Pyramid Networks (PIIP), a new architecture designed to efficiently process multi-scale images for visual tasks. Unlike traditional methods that use a single large model for all resolutions, PIIP employs smaller branches for higher resolution images, reducing computational costs while maintaining performance. The architecture also features a unique cross-branch interaction mechanism to enhance feature integration across different scales. Experimental results demonstrate that PIIP outperforms existing methods in various tasks, achieving significant accuracy improvements with lower resource usage.'}, 'zh': {'title': '高效多尺度图像处理的新方法', 'desc': '本文提出了一种新的网络架构,称为参数反转图像金字塔网络(PIIP),旨在提高多尺度图像处理的效率。PIIP利用预训练模型作为分支,处理不同分辨率的图像,从而在性能和计算成本之间取得平衡。通过引入跨分支特征交互机制,PIIP能够有效整合来自不同空间尺度的信息。实验结果表明,PIIP在目标检测、分割和多模态理解等任务上表现优于现有方法,同时显著降低了计算成本。'}}}, {'id': 'https://huggingface.co/papers/2501.09012', 'title': 'Multimodal LLMs Can Reason about Aesthetics in Zero-Shot', 'url': 'https://huggingface.co/papers/2501.09012', 'abstract': "We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability shall be elicited to evaluate the aesthetics of artworks. To facilitate this investigation, we construct MM-StyleBench, a novel high-quality dataset for benchmarking artistic stylization. We then develop a principled method for human preference modeling and perform a systematic correlation analysis between MLLMs' responses and human preference. Our experiments reveal an inherent hallucination issue of MLLMs in art evaluation, associated with response subjectivity. ArtCoT is proposed, demonstrating that art-specific task decomposition and the use of concrete language boost MLLMs' reasoning ability for aesthetics. Our findings offer valuable insights into MLLMs for art and can benefit a wide range of downstream applications, such as style transfer and artistic image generation. Code available at https://github.com/songrise/MLLM4Art.", 'score': 5, 'issue_id': 1699, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'e516a920b6534cc0', 'authors': ['Ruixiang Jiang', 'Changwen Chen'], 'affiliations': ['The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09012.jpg', 'data': {'categories': ['#artificial intelligence', '#reasoning', '#hallucinations', '#multimodal', '#benchmark', '#dataset'], 'emoji': '🎨', 'ru': {'title': 'Искусственный интеллект учится оценивать искусство', 'desc': 'Исследование посвящено использованию мультимодальных языковых моделей (MLLM) для оценки эстетики произведений искусства. Авторы создали набор данных MM-StyleBench для тестирования художественной стилизации и разработали метод моделирования человеческих предпочтений. 
Эксперименты выявили проблему галлюцинаций MLLM при оценке искусства, связанную с субъективностью ответов. Предложенный метод ArtCoT улучшает способность MLLM к рассуждениям об эстетике путем декомпозиции задач и использования конкретного языка.'}, 'en': {'title': 'Enhancing MLLMs for Art Evaluation through Structured Reasoning', 'desc': "This paper investigates how Multimodal Large Language Models (MLLMs) can assess the aesthetics of artworks. The authors introduce MM-StyleBench, a new dataset designed to benchmark artistic stylization. They also create a method for modeling human preferences and analyze the correlation between MLLMs' evaluations and human judgments. The study highlights a hallucination problem in MLLMs when evaluating art and proposes ArtCoT, which improves reasoning by using task decomposition and specific language, providing insights for applications like style transfer and artistic image generation."}, 'zh': {'title': '提升多模态大语言模型的艺术推理能力', 'desc': '本研究首次探讨了多模态大语言模型(MLLMs)在评估艺术作品美学时的推理能力。我们构建了一个新的高质量数据集MM-StyleBench,用于艺术风格化的基准测试。通过系统的相关性分析,我们发现MLLMs在艺术评估中存在固有的幻觉问题,且与人类偏好存在主观性关联。我们提出了ArtCoT方法,表明艺术特定任务分解和使用具体语言可以提升MLLMs的美学推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09019', 'title': 'Ouroboros-Diffusion: Exploring Consistent Content Generation in Tuning-free Long Video Diffusion', 'url': 'https://huggingface.co/papers/2501.09019', 'abstract': "The first-in-first-out (FIFO) video diffusion, built on a pre-trained text-to-video model, has recently emerged as an effective approach for tuning-free long video generation. This technique maintains a queue of video frames with progressively increasing noise, continuously producing clean frames at the queue's head while Gaussian noise is enqueued at the tail. However, FIFO-Diffusion often struggles to keep long-range temporal consistency in the generated videos due to the lack of correspondence modeling across frames. In this paper, we propose Ouroboros-Diffusion, a novel video denoising framework designed to enhance structural and content (subject) consistency, enabling the generation of consistent videos of arbitrary length. Specifically, we introduce a new latent sampling technique at the queue tail to improve structural consistency, ensuring perceptually smooth transitions among frames. To enhance subject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA) mechanism, which aligns subjects across frames within short segments to achieve better visual coherence. Furthermore, we introduce self-recurrent guidance. This technique leverages information from all previous cleaner frames at the front of the queue to guide the denoising of noisier frames at the end, fostering rich and contextual global information interaction. 
Extensive experiments of long video generation on the VBench benchmark demonstrate the superiority of our Ouroboros-Diffusion, particularly in terms of subject consistency, motion smoothness, and temporal consistency.", 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'c4c991699f684865', 'authors': ['Jingyuan Chen', 'Fuchen Long', 'Jie An', 'Zhaofan Qiu', 'Ting Yao', 'Jiebo Luo', 'Tao Mei'], 'affiliations': ['HiDream.ai Inc.', 'University of Rochester, Rochester, NY USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.09019.jpg', 'data': {'categories': ['#benchmark', '#video', '#long_context', '#diffusion'], 'emoji': '🐍', 'ru': {'title': 'Бесконечное видео: Ouroboros-Diffusion для непрерывной генерации согласованного контента', 'desc': 'Эта статья представляет новый метод генерации видео произвольной длины под названием Ouroboros-Diffusion. Метод улучшает структурную и сюжетную согласованность видео с помощью нового подхода к выборке латентного пространства и механизма Subject-Aware Cross-Frame Attention. Авторы также вводят самоповторяющееся руководство, использующее информацию из предыдущих очищенных кадров для улучшения шумных кадров. Эксперименты на бенчмарке VBench показывают превосходство Ouroboros-Diffusion в сохранении согласованности субъектов, плавности движения и временной согласованности.'}, 'en': {'title': 'Ouroboros-Diffusion: Enhancing Long Video Consistency and Coherence', 'desc': 'The paper introduces Ouroboros-Diffusion, a new framework for improving long video generation using a pre-trained text-to-video model. It addresses the limitations of FIFO-Diffusion, particularly in maintaining long-range temporal consistency across video frames. The proposed method enhances structural consistency through a novel latent sampling technique and improves subject consistency with a Subject-Aware Cross-Frame Attention mechanism. Additionally, self-recurrent guidance is implemented to utilize information from previous frames, resulting in videos with better visual coherence and smoother transitions.'}, 'zh': {'title': 'Ouroboros-Diffusion:提升视频生成一致性的创新框架', 'desc': 'FIFO视频扩散是一种基于预训练文本到视频模型的长视频生成方法,但在生成视频时常常缺乏长时间的一致性。本文提出了Ouroboros-Diffusion框架,通过引入新的潜在采样技术和主题感知跨帧注意机制,增强了视频的结构和内容一致性。该方法确保了帧之间的平滑过渡,并通过自递归引导技术利用前面清晰帧的信息来改善后面噪声帧的去噪效果。实验结果表明,Ouroboros-Diffusion在主题一致性、运动平滑性和时间一致性方面优于现有方法。'}}}, {'id': 'https://huggingface.co/papers/2501.08809', 'title': 'XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework', 'url': 'https://huggingface.co/papers/2501.08809', 'abstract': 'In recent years, remarkable advancements in artificial intelligence-generated content (AIGC) have been achieved in the fields of image synthesis and text generation, generating content comparable to that produced by humans. However, the quality of AI-generated music has not yet reached this standard, primarily due to the challenge of effectively controlling musical emotions and ensuring high-quality outputs. This paper presents a generalized symbolic music generation framework, XMusic, which supports flexible prompts (i.e., images, videos, texts, tags, and humming) to generate emotionally controllable and high-quality symbolic music. XMusic consists of two core components, XProjector and XComposer. XProjector parses the prompts of various modalities into symbolic music elements (i.e., emotions, genres, rhythms and notes) within the projection space to generate matching music. 
XComposer contains a Generator and a Selector. The Generator generates emotionally controllable and melodious music based on our innovative symbolic music representation, whereas the Selector identifies high-quality symbolic music by constructing a multi-task learning scheme involving quality assessment, emotion recognition, and genre recognition tasks. In addition, we build XMIDI, a large-scale symbolic music dataset that contains 108,023 MIDI files annotated with precise emotion and genre labels. Objective and subjective evaluations show that XMusic significantly outperforms the current state-of-the-art methods with impressive music quality. Our XMusic has been awarded as one of the nine Highlights of Collectibles at WAIC 2023. The project homepage of XMusic is https://xmusic-project.github.io.', 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'd4d018c9adb2579c', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#audio', '#story_generation', '#multimodal', '#dataset'], 'emoji': '🎵', 'ru': {'title': 'XMusic: ИИ-композитор нового поколения с управляемыми эмоциями', 'desc': 'Статья представляет XMusic - генерализованный фреймворк для генерации символической музыки, поддерживающий различные типы промптов. XMusic состоит из двух ключевых компонентов: XProjector для обработки промптов и XComposer для генерации музыки. Авторы также создали датасет XMIDI, содержащий более 100 тысяч MIDI-файлов с аннотациями эмоций и жанров. Согласно оценкам, XMusic значительно превосходит современные методы по качеству генерируемой музыки.'}, 'en': {'title': 'XMusic: Emotionally Controlled Music Generation Made Easy!', 'desc': 'This paper introduces XMusic, a new framework for generating symbolic music that can be controlled by emotional prompts. It includes two main components: XProjector, which converts various input types into musical elements, and XComposer, which generates and selects high-quality music. The framework uses a multi-task learning approach to ensure the generated music meets quality, emotional, and genre standards. Additionally, the authors created a large dataset, XMIDI, to support their research and demonstrate that XMusic outperforms existing methods in music generation.'}, 'zh': {'title': 'XMusic:情感可控的高质量音乐生成', 'desc': '近年来,人工智能生成内容(AIGC)在图像合成和文本生成领域取得了显著进展,但在音乐生成方面仍面临挑战。本文提出了一种通用的符号音乐生成框架XMusic,能够通过灵活的提示生成可控情感和高质量的符号音乐。XMusic由两个核心组件组成:XProjector和XComposer,前者将多种模态的提示解析为音乐元素,后者则生成和选择高质量的音乐。通过构建大规模的XMIDI数据集和多任务学习方案,XMusic在音乐质量上显著优于现有方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.08970', 'title': 'Trusted Machine Learning Models Unlock Private Inference for Problems Currently Infeasible with Cryptography', 'url': 'https://huggingface.co/papers/2501.08970', 'abstract': 'We often interact with untrusted parties. Prioritization of privacy can limit the effectiveness of these interactions, as achieving certain goals necessitates sharing private data. Traditionally, addressing this challenge has involved either seeking trusted intermediaries or constructing cryptographic protocols that restrict how much data is revealed, such as multi-party computations or zero-knowledge proofs. While significant advances have been made in scaling cryptographic approaches, they remain limited in terms of the size and complexity of applications they can be used for. 
In this paper, we argue that capable machine learning models can fulfill the role of a trusted third party, thus enabling secure computations for applications that were previously infeasible. In particular, we describe Trusted Capable Model Environments (TCMEs) as an alternative approach for scaling secure computation, where capable machine learning model(s) interact under input/output constraints, with explicit information flow control and explicit statelessness. This approach aims to achieve a balance between privacy and computational efficiency, enabling private inference where classical cryptographic solutions are currently infeasible. We describe a number of use cases that are enabled by TCME, and show that even some simple classic cryptographic problems can already be solved with TCME. Finally, we outline current limitations and discuss the path forward in implementing them.', 'score': 3, 'issue_id': 1702, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '858fc03ac78b66c1', 'authors': ['Ilia Shumailov', 'Daniel Ramage', 'Sarah Meiklejohn', 'Peter Kairouz', 'Florian Hartmann', 'Borja Balle', 'Eugene Bagdasarian'], 'affiliations': ['Google', 'Google DeepMind', 'Google Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.08970.jpg', 'data': {'categories': ['#data', '#ethics', '#architecture', '#security', '#inference'], 'emoji': '🔐', 'ru': {'title': 'Машинное обучение как доверенный посредник для безопасных вычислений', 'desc': 'Статья представляет новый подход к безопасным вычислениям с использованием машинного обучения - Trusted Capable Model Environments (TCME). TCME предлагается как альтернатива традиционным криптографическим методам для обеспечения конфиденциальности при взаимодействии с ненадежными сторонами. Авторы утверждают, что мощные модели машинного обучения могут выполнять роль доверенной третьей стороны, позволяя проводить безопасные вычисления для приложений, которые ранее были невозможны. В статье описываются возможные применения TCME и обсуждаются текущие ограничения и перспективы развития этого подхода.'}, 'en': {'title': 'Empowering Privacy with Trusted Machine Learning Models', 'desc': 'This paper introduces Trusted Capable Model Environments (TCMEs) as a novel solution for secure computations involving untrusted parties. It suggests that advanced machine learning models can act as trusted intermediaries, allowing for private data sharing while maintaining privacy. The authors highlight how TCMEs can efficiently manage input/output constraints and control information flow, making them suitable for applications where traditional cryptographic methods fall short. They also present various use cases and acknowledge the limitations of their approach, paving the way for future developments in secure machine learning applications.'}, 'zh': {'title': '利用机器学习实现安全计算的新方法', 'desc': '本文探讨了在与不可信方互动时如何平衡隐私和计算效率。我们提出了可信能力模型环境(TCME),作为一种新的安全计算方法,利用机器学习模型充当可信第三方。TCME在输入/输出约束下进行交互,并通过显式的信息流控制和无状态性来保护隐私。我们展示了TCME在解决一些经典密码学问题上的潜力,并讨论了未来的实施路径。'}}}, {'id': 'https://huggingface.co/papers/2501.04693', 'title': 'Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous Sensors via Language Grounding', 'url': 'https://huggingface.co/papers/2501.04693', 'abstract': 'Interacting with the world is a multi-sensory experience: achieving effective general-purpose interaction requires making use of all available modalities -- including vision, touch, and audio -- to fill in gaps from partial observation. 
For example, when vision is occluded reaching into a bag, a robot should rely on its senses of touch and sound. However, state-of-the-art generalist robot policies are typically trained on large datasets to predict robot actions solely from visual and proprioceptive observations. In this work, we propose FuSe, a novel approach that enables finetuning visuomotor generalist policies on heterogeneous sensor modalities for which large datasets are not readily available by leveraging natural language as a common cross-modal grounding. We combine a multimodal contrastive loss with a sensory-grounded language generation loss to encode high-level semantics. In the context of robot manipulation, we show that FuSe enables performing challenging tasks that require reasoning jointly over modalities such as vision, touch, and sound in a zero-shot setting, such as multimodal prompting, compositional cross-modal prompting, and descriptions of objects it interacts with. We show that the same recipe is applicable to widely different generalist policies, including both diffusion-based generalist policies and large vision-language-action (VLA) models. Extensive experiments in the real world show that FuSe is able to increase success rates by over 20% compared to all considered baselines.', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1612a7343aff595b', 'authors': ['Joshua Jones', 'Oier Mees', 'Carmelo Sferrazza', 'Kyle Stachowicz', 'Pieter Abbeel', 'Sergey Levine'], 'affiliations': ['Berkeley AI Research (BAIR), UC Berkeley, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04693.jpg', 'data': {'categories': ['#transfer_learning', '#multimodal', '#robotics', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Мультисенсорный ИИ: объединение зрения, осязания и звука для улучшения взаимодействия роботов с миром', 'desc': 'Статья представляет FuSe - новый подход к обучению роботов, использующий мультимодальные сенсорные данные. FuSe использует естественный язык как общую основу для объединения различных модальностей, таких как зрение, осязание и звук. Метод сочетает мультимодальную контрастивную функцию потерь с генерацией языка на основе сенсорных данных для кодирования высокоуровневой семантики. Эксперименты показывают, что FuSe позволяет роботам выполнять сложные задачи, требующие рассуждений на основе нескольких модальностей, повышая успешность на 20% по сравнению с базовыми методами.'}, 'en': {'title': 'FuSe: Bridging Sensory Gaps for Smarter Robot Interaction', 'desc': 'This paper introduces FuSe, a method that enhances robot interaction by integrating multiple sensory modalities like vision, touch, and sound. Traditional robot policies often rely solely on visual data, but FuSe allows for fine-tuning these policies using natural language to bridge gaps in sensory information. By employing a multimodal contrastive loss and a sensory-grounded language generation loss, FuSe effectively encodes high-level semantics for better decision-making.
The results demonstrate that FuSe significantly improves the success rates of robots in complex tasks, showcasing its versatility across different generalist policies.'}, 'zh': {'title': '多模态交互,提升机器人智能', 'desc': '本论文提出了一种名为FuSe的新方法,旨在通过多模态传感器数据来微调通用机器人策略。FuSe利用自然语言作为跨模态的共同基础,结合多模态对比损失和感知基础的语言生成损失,以编码高层语义。通过这种方法,机器人能够在视觉、触觉和听觉等多种感官信息的共同推理下,完成复杂的操作任务。实验结果表明,FuSe在实际应用中成功率提高了超过20%。'}}}, {'id': 'https://huggingface.co/papers/2412.19412', 'title': 'MINIMA: Modality Invariant Image Matching', 'url': 'https://huggingface.co/papers/2412.19412', 'abstract': 'Image matching for both cross-view and cross-modality plays a critical role in multimodal perception. In practice, the modality gap caused by different imaging systems/styles poses great challenges to the matching task. Existing works try to extract invariant features for specific modalities and train on limited datasets, showing poor generalization. In this paper, we present MINIMA, a unified image matching framework for multiple cross-modal cases. Without pursuing fancy modules, our MINIMA aims to enhance universal performance from the perspective of data scaling up. For such purpose, we propose a simple yet effective data engine that can freely produce a large dataset containing multiple modalities, rich scenarios, and accurate matching labels. Specifically, we scale up the modalities from cheap but rich RGB-only matching data, by means of generative models. Under this setting, the matching labels and rich diversity of the RGB dataset are well inherited by the generated multimodal data. Benefiting from this, we construct MD-syn, a new comprehensive dataset that fills the data gap for general multimodal image matching. With MD-syn, we can directly train any advanced matching pipeline on randomly selected modality pairs to obtain cross-modal ability. Extensive experiments on in-domain and zero-shot matching tasks, including 19 cross-modal cases, demonstrate that our MINIMA can significantly outperform the baselines and even surpass modality-specific methods. The dataset and code are available at https://github.com/LSXI7/MINIMA .', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'fa772dead5453f7b', 'authors': ['Xingyu Jiang', 'Jiangwei Ren', 'Zizhuo Li', 'Xin Zhou', 'Dingkang Liang', 'Xiang Bai'], 'affiliations': ['Huazhong University of Science and Technology', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2412.19412.jpg', 'data': {'categories': ['#dataset', '#data', '#multimodal', '#open_source', '#synthetic'], 'emoji': '🔀', 'ru': {'title': 'Универсальное сопоставление изображений через масштабирование данных', 'desc': 'Статья представляет MINIMA - универсальную систему сопоставления изображений для различных кросс-модальных случаев. Авторы предлагают эффективный механизм генерации большого набора данных с несколькими модальностями, разнообразными сценариями и точными метками сопоставления. Используя этот подход, они создают новый комплексный датасет MD-syn для обучения нейросетей кросс-модальному сопоставлению изображений. Эксперименты показывают, что MINIMA значительно превосходит базовые модели и даже специализированные методы для конкретных модальностей в 19 кросс-модальных задачах.'}, 'en': {'title': 'MINIMA: Bridging the Gap in Cross-Modal Image Matching', 'desc': 'This paper introduces MINIMA, a framework designed for image matching across different views and modalities, addressing the challenges posed by varying imaging systems. 
The authors highlight the limitations of existing methods that rely on invariant features and small datasets, which often lead to poor performance. MINIMA enhances image matching by scaling up data through a generative model that creates a large, diverse dataset with accurate matching labels. The new dataset, MD-syn, allows for effective training of matching algorithms, resulting in improved performance in both in-domain and zero-shot scenarios compared to traditional methods.'}, 'zh': {'title': 'MINIMA:跨模态图像匹配的新突破', 'desc': '本文提出了一种名为MINIMA的统一图像匹配框架,旨在解决跨视角和跨模态的图像匹配问题。现有方法在特定模态上提取不变特征,但在有限数据集上训练,导致泛化能力差。MINIMA通过一个简单有效的数据引擎,生成包含多种模态和丰富场景的大型数据集,从而提升通用性能。通过构建MD-syn数据集,MINIMA能够在随机选择的模态对上直接训练,显著提高跨模态匹配能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08313', 'title': 'MiniMax-01: Scaling Foundation Models with Lightning Attention', 'url': 'https://huggingface.co/papers/2501.08313', 'abstract': 'We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. 
We publicly release MiniMax-01 at https://github.com/MiniMax-AI.', 'score': 192, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'a57d7b1914e7383a', 'authors': ['MiniMax', 'Aonian Li', 'Bangwei Gong', 'Bo Yang', 'Boji Shan', 'Chang Liu', 'Cheng Zhu', 'Chunhao Zhang', 'Congchao Guo', 'Da Chen', 'Dong Li', 'Enwei Jiao', 'Gengxin Li', 'Guojun Zhang', 'Haohai Sun', 'Houze Dong', 'Jiadai Zhu', 'Jiaqi Zhuang', 'Jiayuan Song', 'Jin Zhu', 'Jingtao Han', 'Jingyang Li', 'Junbin Xie', 'Junhao Xu', 'Junjie Yan', 'Kaishun Zhang', 'Kecheng Xiao', 'Kexi Kang', 'Le Han', 'Leyang Wang', 'Lianfei Yu', 'Liheng Feng', 'Lin Zheng', 'Linbo Chai', 'Long Xing', 'Meizhi Ju', 'Mingyuan Chi', 'Mozhi Zhang', 'Peikai Huang', 'Pengcheng Niu', 'Pengfei Li', 'Pengyu Zhao', 'Qi Yang', 'Qidi Xu', 'Qiexiang Wang', 'Qin Wang', 'Qiuhui Li', 'Ruitao Leng', 'Shengmin Shi', 'Shuqi Yu', 'Sichen Li', 'Songquan Zhu', 'Tao Huang', 'Tianrun Liang', 'Weigao Sun', 'Weixuan Sun', 'Weiyu Cheng', 'Wenkai Li', 'Xiangjun Song', 'Xiao Su', 'Xiaodong Han', 'Xinjie Zhang', 'Xinzhu Hou', 'Xu Min', 'Xun Zou', 'Xuyang Shen', 'Yan Gong', 'Yingjie Zhu', 'Yipeng Zhou', 'Yiran Zhong', 'Yongyi Hu', 'Yuanxiang Fan', 'Yue Yu', 'Yufeng Yang', 'Yuhao Li', 'Yunan Huang', 'Yunji Li', 'Yunpeng Huang', 'Yunzhi Xu', 'Yuxin Mao', 'Zehan Li', 'Zekang Li', 'Zewei Tao', 'Zewen Ying', 'Zhaoyang Cong', 'Zhen Qin', 'Zhenhua Fan', 'Zhihang Yu', 'Zhuo Jiang', 'Zijia Wu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08313.jpg', 'data': {'categories': ['#open_source', '#architecture', '#optimization', '#benchmark', '#long_context', '#training'], 'emoji': '🚀', 'ru': {'title': 'MiniMax-01: Революция в обработке длинных контекстов', 'desc': 'Исследователи представили серию моделей MiniMax-01, включая MiniMax-Text-01 и MiniMax-VL-01, которые сравнимы с лучшими моделями, но обладают улучшенными возможностями обработки длинных контекстов. В основе лежит технология lightning attention и ее эффективное масштабирование, интегрированные с Mixture of Experts (MoE). Модель имеет 32 эксперта и 456 миллиардов параметров, из которых 45,9 миллиардов активируются для каждого токена. Контекстное окно MiniMax-Text-01 может достигать 1 миллиона токенов при обучении и экстраполироваться до 4 миллионов токенов при инференсе.'}, 'en': {'title': 'Unleashing Long Contexts with MiniMax-01 Models', 'desc': 'The MiniMax-01 series introduces advanced models, MiniMax-Text-01 and MiniMax-VL-01, designed to handle longer contexts effectively. These models utilize lightning attention and a Mixture of Experts (MoE) architecture, featuring 32 experts and a staggering 456 billion parameters, optimizing the activation of 45.9 billion parameters per token. By implementing efficient parallel strategies and computation-communication overlap techniques, the models can train and infer on extensive datasets, reaching context windows of up to 1 million tokens during training and 4 million during inference. 
Performance evaluations indicate that MiniMax-01 models rival leading models like GPT-4o and Claude-3.5-Sonnet while significantly extending context capabilities.'}, 'zh': {'title': 'MiniMax-01:超长上下文处理的新纪元', 'desc': '我们介绍了MiniMax-01系列,包括MiniMax-Text-01和MiniMax-VL-01,这些模型在处理更长的上下文时具有优越的能力。核心技术是闪电注意力和高效的扩展能力。为了最大化计算能力,我们将其与专家混合模型(MoE)结合,创建了一个拥有32个专家和4560亿参数的模型。我们的实验表明,这些模型在标准和内部基准测试中表现出色,能够与最先进的模型相媲美,同时提供20到32倍更长的上下文窗口。'}}}, {'id': 'https://huggingface.co/papers/2501.08332', 'title': 'MangaNinja: Line Art Colorization with Precise Reference Following', 'url': 'https://huggingface.co/papers/2501.08332', 'abstract': 'Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, cross-character colorization, multi-reference harmonization, beyond the reach of existing algorithms.', 'score': 31, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '20ea6b75639e2ced', 'authors': ['Zhiheng Liu', 'Ka Leong Cheng', 'Xi Chen', 'Jie Xiao', 'Hao Ouyang', 'Kai Zhu', 'Yu Liu', 'Yujun Shen', 'Qifeng Chen', 'Ping Luo'], 'affiliations': ['Ant Group', 'HKU', 'HKUST', 'Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08332.jpg', 'data': {'categories': ['#cv', '#diffusion', '#benchmark'], 'emoji': '🎨', 'ru': {'title': 'Прецизионное раскрашивание манги с помощью ИИ', 'desc': 'MangaNinjia - это модель для раскрашивания линейных рисунков манги, основанная на диффузионных моделях. Она использует модуль перемешивания патчей для обучения соответствиям между цветным изображением-образцом и целевым линейным рисунком. Модель также включает схему точечного контроля для точного подбора цветов. Эксперименты показывают превосходство MangaNinjia над существующими решениями в точности раскрашивания.'}, 'en': {'title': 'MangaNinjia: Mastering Line Art Colorization with Precision', 'desc': 'MangaNinjia is a model designed for coloring line art by using reference images. It employs a patch shuffling module to help the model learn how to match colors from the reference image to the target line art accurately. Additionally, it features a point-driven control scheme that allows for detailed color adjustments, ensuring that colors are applied precisely. Our experiments show that MangaNinjia outperforms existing methods in colorization tasks, especially in complex scenarios involving multiple references and different characters.'}, 'zh': {'title': 'MangaNinjia:精准上色的新方法', 'desc': 'MangaNinjia 是一种基于扩散模型的参考引导线条艺术上色技术。我们设计了两个模块来确保角色细节的准确转录,包括补丁洗牌模块和点驱动控制方案,以实现精细的颜色匹配。实验结果表明,我们的模型在精确上色方面优于现有解决方案。我们还展示了所提议的交互式点控制在处理复杂案例和多参考协调方面的潜力,超越了现有算法的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.06751', 'title': 'Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models', 'url': 'https://huggingface.co/papers/2501.06751', 'abstract': "Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. 
Typically, these prompts are extended to a fixed length by adding padding tokens before text encoding. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model's output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model's architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.", 'score': 27, 'issue_id': 1677, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '05733e8e82e23568', 'authors': ['Michael Toker', 'Ido Galil', 'Hadas Orgad', 'Rinon Gal', 'Yoad Tewel', 'Gal Chechik', 'Yonatan Belinkov'], 'affiliations': ['Bar-Ilan University', 'NVIDIA', 'Technion Israel Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.06751.jpg', 'data': {'categories': ['#cv', '#architecture', '#interpretability', '#diffusion', '#training'], 'emoji': '🧩', 'ru': {'title': 'Раскрытие тайн токенов заполнения в генерации изображений', 'desc': 'Исследователи провели первый глубокий анализ роли токенов заполнения в моделях преобразования текста в изображение (T2I). Они разработали две причинно-следственные техники для изучения того, как информация кодируется в представлении токенов в различных компонентах конвейера T2I. Результаты показали три различных сценария влияния токенов заполнения на процесс генерации изображений. Исследование выявило ключевые взаимосвязи между этими сценариями и архитектурой модели, а также процессом ее обучения.'}, 'en': {'title': 'Unpacking Padding: The Hidden Role in Text-to-Image Models', 'desc': "This paper explores the impact of padding tokens in text-to-image (T2I) diffusion models, which are used to generate images from text prompts. The authors analyze how these padding tokens influence the image generation process at different stages, including text encoding and the diffusion process. They identify three scenarios where padding tokens can either affect the output or be ignored, depending on the model's architecture and training methods. The findings provide valuable insights that could guide future improvements in T2I model design and training practices."}, 'zh': {'title': '填充标记在图像生成中的关键作用', 'desc': '本文研究了文本到图像(T2I)扩散模型中填充标记的作用。填充标记通常用于将提示扩展到固定长度,但其对图像生成过程的影响尚未被深入探讨。我们开发了两种因果分析技术,探讨填充标记在T2I模型不同组件中的信息编码方式。研究结果表明,填充标记在文本编码、扩散过程中的影响各不相同,并与模型架构和训练过程存在重要关系。'}}}, {'id': 'https://huggingface.co/papers/2501.08316', 'title': 'Diffusion Adversarial Post-Training for One-Step Video Generation', 'url': 'https://huggingface.co/papers/2501.08316', 'abstract': 'The diffusion models are widely used for image and video generation, but their iterative generation process is slow and expensive.
While existing distillation approaches have demonstrated the potential for one-step generation in the image domain, they still suffer from significant quality degradation. In this work, we propose Adversarial Post-Training (APT) against real data following diffusion pre-training for one-step video generation. To improve the training stability and quality, we introduce several improvements to the model architecture and training procedures, along with an approximated R1 regularization objective. Empirically, our experiments show that our adversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720, 24fps videos in real time using a single forward evaluation step. Additionally, our model is capable of generating 1024px images in a single step, achieving quality comparable to state-of-the-art methods.', 'score': 19, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '4122a780e8356ce7', 'authors': ['Shanchuan Lin', 'Xin Xia', 'Yuxi Ren', 'Ceyuan Yang', 'Xuefeng Xiao', 'Lu Jiang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.08316.jpg', 'data': {'categories': ['#architecture', '#optimization', '#video', '#diffusion', '#training'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: от итераций к мгновенному результату', 'desc': 'Эта статья представляет новый метод под названием Adversarial Post-Training (APT) для одношаговой генерации видео. Авторы предлагают улучшения архитектуры модели и процедур обучения, включая аппроксимированную регуляризацию R1. Их модель Seaweed-APT способна генерировать 2-секундные видео высокого разрешения в реальном времени за один проход. Кроме того, модель может создавать изображения размером 1024px за один шаг, достигая качества, сравнимого с современными методами.'}, 'en': {'title': 'Fast and High-Quality Video Generation with Seaweed-APT', 'desc': 'This paper addresses the slow and costly iterative process of generating images and videos using diffusion models. The authors introduce Adversarial Post-Training (APT) to enhance one-step video generation while maintaining high quality. They implement architectural and procedural improvements, including an approximated R1 regularization, to stabilize training. Their model, Seaweed-APT, successfully generates high-quality 2-second videos and 1024px images in real time with a single forward evaluation step.'}, 'zh': {'title': '对抗后训练:快速高质量视频生成的新方法', 'desc': '扩散模型广泛应用于图像和视频生成,但其迭代生成过程较慢且成本高昂。现有的蒸馏方法在图像领域展示了单步生成的潜力,但仍存在显著的质量下降。本文提出了一种针对真实数据的对抗后训练(APT)方法,以实现单步视频生成。我们的实验表明,经过对抗后训练的模型Seaweed-APT能够实时生成1280x720、24fps的2秒视频,并且在单步生成1024px图像时,其质量可与最先进的方法相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.08187', 'title': 'A Multi-Modal AI Copilot for Single-Cell Analysis with Instruction Following', 'url': 'https://huggingface.co/papers/2501.08187', 'abstract': 'Large language models excel at interpreting complex natural language instructions, enabling them to perform a wide range of tasks. In the life sciences, single-cell RNA sequencing (scRNA-seq) data serves as the "language of cellular biology", capturing intricate gene expression patterns at the single-cell level. However, interacting with this "language" through conventional tools is often inefficient and unintuitive, posing challenges for researchers. To address these limitations, we present InstructCell, a multi-modal AI copilot that leverages natural language as a medium for more direct and flexible single-cell analysis. 
We construct a comprehensive multi-modal instruction dataset that pairs text-based instructions with scRNA-seq profiles from diverse tissues and species. Building on this, we develop a multi-modal cell language architecture capable of simultaneously interpreting and processing both modalities. InstructCell empowers researchers to accomplish critical tasks-such as cell type annotation, conditional pseudo-cell generation, and drug sensitivity prediction-using straightforward natural language commands. Extensive evaluations demonstrate that InstructCell consistently meets or exceeds the performance of existing single-cell foundation models, while adapting to diverse experimental conditions. More importantly, InstructCell provides an accessible and intuitive tool for exploring complex single-cell data, lowering technical barriers and enabling deeper biological insights.', 'score': 18, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'de984ce7cc62fa5e', 'authors': ['Yin Fang', 'Xinle Deng', 'Kangwei Liu', 'Ningyu Zhang', 'Jingyang Qian', 'Penghui Yang', 'Xiaohui Fan', 'Huajun Chen'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University, Hangzhou 310027, China', 'College of Pharmaceutical Sciences, Zhejiang University, Hangzhou 310058, China', 'Future Health Laboratory, Innovation Center of Yangtze River Delta, Zhejiang University, Jiaxing 314100, China', 'Innovation Center in Zhejiang University, State Key Laboratory of Component-Based Chinese Medicine, Hangzhou 310058, China', 'School of Software Technology, Zhejiang University, Ningbo 315048, China', 'ZJU-Hangzhou Global Scientific and Technological Innovation Center, Hangzhou 311200, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08187.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#dataset', '#science', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'Естественный язык как ключ к расшифровке клеточной биологии', 'desc': 'InstructCell - это мультимодальный ИИ-помощник для анализа данных одноклеточного РНК-секвенирования (scRNA-seq). Он использует архитектуру, способную интерпретировать как естественный язык, так и профили экспрессии генов. InstructCell позволяет исследователям выполнять такие задачи, как аннотация типов клеток и предсказание чувствительности к лекарствам, с помощью простых текстовых команд. Модель демонстрирует высокую производительность и адаптивность к различным экспериментальным условиям.'}, 'en': {'title': 'InstructCell: Bridging Language and Biology for Seamless Single-Cell Analysis', 'desc': 'This paper introduces InstructCell, an AI tool designed to simplify the analysis of single-cell RNA sequencing (scRNA-seq) data using natural language instructions. By creating a dataset that links text commands with scRNA-seq profiles, InstructCell allows researchers to perform complex tasks like cell type annotation and drug sensitivity prediction more intuitively. The model employs a multi-modal architecture that processes both text and biological data simultaneously, enhancing its usability. 
Evaluations show that InstructCell outperforms existing models, making single-cell analysis more accessible and efficient for researchers in the life sciences.'}, 'zh': {'title': '用自然语言解锁单细胞数据的潜力', 'desc': '这篇论文介绍了InstructCell,一个多模态的人工智能助手,旨在通过自然语言简化单细胞RNA测序(scRNA-seq)数据的分析。传统工具在处理细胞生物学的复杂数据时效率低下,而InstructCell通过将文本指令与scRNA-seq数据结合,提供了更直接和灵活的分析方式。该系统能够执行细胞类型注释、条件伪细胞生成和药物敏感性预测等关键任务,且使用简单的自然语言命令即可完成。评估结果表明,InstructCell在性能上优于现有的单细胞基础模型,同时适应多种实验条件,降低了技术门槛,促进了生物学的深入理解。'}}}, {'id': 'https://huggingface.co/papers/2501.08225', 'title': 'FramePainter: Endowing Interactive Image Editing with Video Diffusion Priors', 'url': 'https://huggingface.co/papers/2501.08225', 'abstract': 'Interactive image editing allows users to modify images through visual interaction operations such as drawing, clicking, and dragging. Existing methods construct such supervision signals from videos, as they capture how objects change with various physical interactions. However, these models are usually built upon text-to-image diffusion models, and thus necessitate (i) massive training samples and (ii) an additional reference encoder to learn real-world dynamics and visual consistency. In this paper, we reformulate this task as an image-to-video generation problem, so that it inherits powerful video diffusion priors to reduce training costs and ensure temporal consistency. Specifically, we introduce FramePainter as an efficient instantiation of this formulation. Initialized with Stable Video Diffusion, it only uses a lightweight sparse control encoder to inject editing signals. Considering the limitations of temporal attention in handling large motion between two frames, we further propose matching attention to enlarge the receptive field while encouraging dense correspondence between edited and source image tokens. We highlight the effectiveness and efficiency of FramePainter across various editing signals: it dominantly outperforms previous state-of-the-art methods with far less training data, achieving highly seamless and coherent editing of images, e.g., automatically adjusting the reflection of the cup. Moreover, FramePainter also exhibits exceptional generalization in scenarios not present in real-world videos, e.g., transforming the clownfish into a shark-like shape. Our code will be available at https://github.com/YBYBZhang/FramePainter.', 'score': 12, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '811cfd0f18eb1e53', 'authors': ['Yabo Zhang', 'Xinpeng Zhou', 'Yihan Zeng', 'Hang Xu', 'Hui Li', 'Wangmeng Zuo'], 'affiliations': ['Harbin Institute of Technology', 'Huawei Noahs Ark Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08225.jpg', 'data': {'categories': ['#video', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'FramePainter: эффективное редактирование изображений через генерацию видео', 'desc': 'Статья представляет FramePainter - новый подход к интерактивному редактированию изображений, основанный на генерации видео. В отличие от существующих методов, использующих модели диффузии текст-изображение, FramePainter опирается на мощные видео-диффузионные модели для обеспечения временной согласованности и снижения затрат на обучение. Метод использует легковесный энкодер для внедрения сигналов редактирования и вводит механизм согласованного внимания для улучшения обработки крупных движений между кадрами. 
FramePainter превосходит современные методы, требуя значительно меньше обучающих данных и демонстрируя высокую обобщающую способность.'}, 'en': {'title': 'Revolutionizing Image Editing with Efficient Video Diffusion', 'desc': 'This paper presents FramePainter, a novel approach to interactive image editing that reformulates the task as image-to-video generation. By leveraging video diffusion models, FramePainter reduces the need for extensive training data while ensuring temporal consistency in edited images. It utilizes a lightweight sparse control encoder to effectively incorporate editing signals, and introduces matching attention to improve the handling of large motion between frames. The results demonstrate that FramePainter significantly outperforms existing methods, achieving seamless image edits and showcasing strong generalization capabilities.'}, 'zh': {'title': 'FramePainter:高效的图像编辑新方法', 'desc': '本文提出了一种交互式图像编辑的新方法,称为FramePainter。该方法将图像编辑任务重新定义为图像到视频的生成问题,从而利用强大的视频扩散先验,降低训练成本并确保时间一致性。FramePainter使用轻量级的稀疏控制编码器来注入编辑信号,并通过匹配注意力机制增强了对大运动的处理能力。实验结果表明,FramePainter在各种编辑信号下表现优异,能够实现无缝且连贯的图像编辑,且在未见过的场景中也展现出卓越的泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08326', 'title': 'Omni-RGPT: Unifying Image and Video Region-level Understanding via Token Marks', 'url': 'https://huggingface.co/papers/2501.08326', 'abstract': 'We present Omni-RGPT, a multimodal large language model designed to facilitate region-level comprehension for both images and videos. To achieve consistent region representation across spatio-temporal dimensions, we introduce Token Mark, a set of tokens highlighting the target regions within the visual feature space. These tokens are directly embedded into spatial regions using region prompts (e.g., boxes or masks) and simultaneously incorporated into the text prompt to specify the target, establishing a direct connection between visual and text tokens. To further support robust video understanding without requiring tracklets, we introduce an auxiliary task that guides Token Mark by leveraging the consistency of the tokens, enabling stable region interpretation across the video. Additionally, we introduce a large-scale region-level video instruction dataset (RegVID-300k). Omni-RGPT achieves state-of-the-art results on image and video-based commonsense reasoning benchmarks while showing strong performance in captioning and referring expression comprehension tasks.', 'score': 11, 'issue_id': 1678, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '463580cacfaa6789', 'authors': ['Miran Heo', 'Min-Hung Chen', 'De-An Huang', 'Sifei Liu', 'Subhashree Radhakrishnan', 'Seon Joo Kim', 'Yu-Chiang Frank Wang', 'Ryo Hachiuma'], 'affiliations': ['NVIDIA', 'Yonsei University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08326.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agi', '#cv', '#dataset', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Omni-RGPT: Новый уровень понимания изображений и видео искусственным интеллектом', 'desc': 'Omni-RGPT - это мультимодальная большая языковая модель, разработанная для понимания изображений и видео на уровне регионов. Модель использует технологию Token Mark для выделения целевых регионов в визуальном пространстве признаков. Для улучшения понимания видео без необходимости трекинга объектов введена вспомогательная задача, использующая согласованность токенов. 
Авторы также представили большой набор данных RegVID-300k для обучения на видео с инструкциями на уровне регионов.'}, 'en': {'title': 'Omni-RGPT: Bridging Visual and Textual Understanding with Token Mark', 'desc': 'Omni-RGPT is a multimodal large language model that enhances understanding of specific regions in images and videos. It uses a novel approach called Token Mark, which embeds tokens into visual features to highlight target areas, linking them with text prompts. This model also includes an auxiliary task that ensures consistent token representation across video frames, improving video comprehension. With the introduction of the RegVID-300k dataset, Omni-RGPT sets new benchmarks in commonsense reasoning, captioning, and referring expression tasks.'}, 'zh': {'title': 'Omni-RGPT:图像与视频的区域理解新突破', 'desc': '本文介绍了Omni-RGPT,这是一种多模态的大型语言模型,旨在促进图像和视频的区域级理解。为了在时空维度上实现一致的区域表示,我们引入了Token Mark,这是一组突出视觉特征空间中目标区域的标记。通过使用区域提示(如框或掩码),这些标记被直接嵌入到空间区域中,并同时与文本提示结合,以指定目标,从而建立视觉和文本标记之间的直接联系。此外,我们还引入了一个辅助任务,通过利用标记的一致性来指导Token Mark,从而支持稳健的视频理解。'}}}, {'id': 'https://huggingface.co/papers/2501.07730', 'title': 'Democratizing Text-to-Image Masked Generative Models with Compact Text-Aware One-Dimensional Tokens', 'url': 'https://huggingface.co/papers/2501.07730', 'abstract': 'Image tokenizers form the foundation of modern text-to-image generative models but are notoriously difficult to train. Furthermore, most existing text-to-image models rely on large-scale, high-quality private datasets, making them challenging to replicate. In this work, we introduce Text-Aware Transformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful image tokenizer that can utilize either discrete or continuous 1-dimensional tokens. TA-TiTok uniquely integrates textual information during the tokenizer decoding stage (i.e., de-tokenization), accelerating convergence and enhancing performance. TA-TiTok also benefits from a simplified, yet effective, one-stage training process, eliminating the need for the complex two-stage distillation used in previous 1-dimensional tokenizers. This design allows for seamless scalability to large datasets. Building on this, we introduce a family of text-to-image Masked Generative Models (MaskGen), trained exclusively on open data while achieving comparable performance to models trained on private data. We aim to release both the efficient, strong TA-TiTok tokenizers and the open-data, open-weight MaskGen models to promote broader access and democratize the field of text-to-image masked generative models.', 'score': 10, 'issue_id': 1673, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '80f40715084c602b', 'authors': ['Dongwon Kim', 'Ju He', 'Qihang Yu', 'Chenglin Yang', 'Xiaohui Shen', 'Suha Kwak', 'Liang-Chieh Chen'], 'affiliations': ['ByteDance Seed', 'POSTECH'], 'pdf_title_img': 'assets/pdf/title_img/2501.07730.jpg', 'data': {'categories': ['#dataset', '#data', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'Демократизация генерации изображений с помощью эффективной токенизации и открытых данных', 'desc': 'В этой статье представлен новый подход к токенизации изображений для генеративных моделей текст-в-изображение под названием TA-TiTok. Данный токенизатор использует одномерные токены и интегрирует текстовую информацию на этапе детокенизации, что ускоряет сходимость и улучшает производительность. На основе TA-TiTok авторы разработали семейство моделей MaskGen, обученных исключительно на открытых данных. 
Целью работы является демократизация области генеративных моделей текст-в-изображение путем публикации эффективных токенизаторов и моделей с открытыми весами.'}, 'en': {'title': 'Democratizing Text-to-Image Generation with TA-TiTok', 'desc': 'This paper presents TA-TiTok, a novel image tokenizer designed for text-to-image generative models, which simplifies the training process and improves performance. Unlike traditional models that require large private datasets, TA-TiTok can effectively utilize open data, making it more accessible for researchers. The tokenizer incorporates textual information during the decoding stage, which helps it learn faster and perform better. Additionally, the authors introduce MaskGen, a family of generative models that leverage TA-TiTok and are trained on publicly available datasets, aiming to democratize access to advanced text-to-image generation technology.'}, 'zh': {'title': '高效的文本到图像生成模型,推动开放数据的使用', 'desc': '本文介绍了一种新的图像标记器,称为TA-TiTok,它可以有效地处理文本到图像的生成任务。TA-TiTok在解码阶段整合了文本信息,从而加快了模型的收敛速度并提高了性能。与以往的标记器不同,TA-TiTok采用了一种简化的一阶段训练过程,避免了复杂的两阶段蒸馏过程。我们还提出了一系列基于开放数据训练的文本到图像生成模型MaskGen,旨在促进更广泛的访问和民主化。'}}}, {'id': 'https://huggingface.co/papers/2501.05131', 'title': '3DIS-FLUX: simple and efficient multi-instance generation with DiT rendering', 'url': 'https://huggingface.co/papers/2501.05131', 'abstract': "The growing demand for controllable outputs in text-to-image generation has driven significant advancements in multi-instance generation (MIG), enabling users to define both instance layouts and attributes. Currently, the state-of-the-art methods in MIG are primarily adapter-based. However, these methods necessitate retraining a new adapter each time a more advanced model is released, resulting in significant resource consumption. A methodology named Depth-Driven Decoupled Instance Synthesis (3DIS) has been introduced, which decouples MIG into two distinct phases: 1) depth-based scene construction and 2) detail rendering with widely pre-trained depth control models. The 3DIS method requires adapter training solely during the scene construction phase, while enabling various models to perform training-free detail rendering. Initially, 3DIS focused on rendering techniques utilizing U-Net architectures such as SD1.5, SD2, and SDXL, without exploring the potential of recent DiT-based models like FLUX. In this paper, we present 3DIS-FLUX, an extension of the 3DIS framework that integrates the FLUX model for enhanced rendering capabilities. Specifically, we employ the FLUX.1-Depth-dev model for depth map controlled image generation and introduce a detail renderer that manipulates the Attention Mask in FLUX's Joint Attention mechanism based on layout information. This approach allows for the precise rendering of fine-grained attributes of each instance. Our experimental results indicate that 3DIS-FLUX, leveraging the FLUX model, outperforms the original 3DIS method, which utilized SD2 and SDXL, and surpasses current state-of-the-art adapter-based methods in terms of both performance and image quality. 
Project Page: https://limuloo.github.io/3DIS/.", 'score': 9, 'issue_id': 1684, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ca5ad23cb146f3aa', 'authors': ['Dewei Zhou', 'Ji Xie', 'Zongxin Yang', 'Yi Yang'], 'affiliations': ['DBMI, HMS, Harvard University', 'RELER, CCAI, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05131.jpg', 'data': {'categories': ['#cv', '#games', '#architecture', '#multimodal', '#optimization'], 'emoji': '🎨', 'ru': {'title': '3DIS-FLUX: Новый уровень контролируемой генерации мульти-объектных изображений', 'desc': 'Статья представляет метод 3DIS-FLUX для управляемой генерации изображений с несколькими объектами. Этот подход разделяет процесс на создание сцены на основе глубины и детализированный рендеринг с использованием предобученных моделей контроля глубины. 3DIS-FLUX интегрирует модель FLUX для улучшенного рендеринга, манипулируя маской внимания в механизме совместного внимания FLUX. Эксперименты показывают, что 3DIS-FLUX превосходит предыдущие методы по производительности и качеству изображений.'}, 'en': {'title': 'Enhancing Text-to-Image Generation with 3DIS-FLUX', 'desc': 'This paper introduces a new method called 3DIS-FLUX for improving text-to-image generation by enhancing the multi-instance generation (MIG) process. The 3DIS framework separates the generation into two phases: constructing the scene based on depth and rendering details using pre-trained models. By integrating the FLUX model, the method allows for better control over the rendering of fine details while reducing the need for retraining adapters. Experimental results show that 3DIS-FLUX outperforms previous methods in both performance and image quality, making it a significant advancement in controllable image generation.'}, 'zh': {'title': '深度驱动解耦实例合成:提升图像生成的可控性与质量', 'desc': '随着对可控文本到图像生成输出的需求增加,多实例生成(MIG)技术得到了显著进展。现有的MIG方法主要基于适配器,但每次新模型发布时都需要重新训练适配器,消耗大量资源。本文提出了一种名为深度驱动解耦实例合成(3DIS)的方法,将MIG分为两个阶段:基于深度的场景构建和细节渲染。通过引入FLUX模型,3DIS-FLUX在细节渲染方面实现了更高的性能和图像质量。'}}}, {'id': 'https://huggingface.co/papers/2501.08328', 'title': 'PokerBench: Training Large Language Models to become Professional Poker Players', 'url': 'https://huggingface.co/papers/2501.08328', 'abstract': 'We introduce PokerBench - a benchmark for evaluating the poker-playing abilities of large language models (LLMs). As LLMs excel in traditional NLP tasks, their application to complex, strategic games like poker poses a new challenge. Poker, an incomplete information game, demands a multitude of skills such as mathematics, reasoning, planning, strategy, and a deep understanding of game theory and human psychology. This makes Poker the ideal next frontier for large language models. PokerBench consists of a comprehensive compilation of 11,000 most important scenarios, split between pre-flop and post-flop play, developed in collaboration with trained poker players. We evaluate prominent models including GPT-4, ChatGPT 3.5, and various Llama and Gemma series models, finding that all state-of-the-art LLMs underperform in playing optimal poker. However, after fine-tuning, these models show marked improvements. We validate PokerBench by having models with different scores compete with each other, demonstrating that higher scores on PokerBench lead to higher win rates in actual poker games. 
Through gameplay between our fine-tuned model and GPT-4, we also identify limitations of simple supervised fine-tuning for learning optimal playing strategy, suggesting the need for more advanced methodologies for effectively training language models to excel in games. PokerBench thus presents a unique benchmark for a quick and reliable evaluation of the poker-playing ability of LLMs as well as a comprehensive benchmark to study the progress of LLMs in complex game-playing scenarios. The dataset and code will be made available at: https://github.com/pokerllm/pokerbench.', 'score': 9, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '7b4dacedffdbfa15', 'authors': ['Richard Zhuang', 'Akshat Gupta', 'Richard Yang', 'Aniket Rahane', 'Zhengyu Li', 'Gopala Anumanchipalli'], 'affiliations': ['Georgia Institute of Technology', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.08328.jpg', 'data': {'categories': ['#training', '#reasoning', '#games', '#optimization', '#benchmark'], 'emoji': '🃏', 'ru': {'title': 'PokerBench: новый рубеж для оценки стратегических способностей языковых моделей', 'desc': 'PokerBench - это новый бенчмарк для оценки способностей больших языковых моделей (LLM) играть в покер. Он включает 11000 важнейших сценариев игры, разработанных совместно с профессиональными игроками. Авторы оценили производительность современных LLM, таких как GPT-4 и ChatGPT 3.5, обнаружив, что все модели показывают результаты ниже оптимальных. После дообучения модели демонстрируют значительное улучшение, но авторы отмечают ограничения простого обучения с учителем для освоения оптимальной стратегии игры.'}, 'en': {'title': 'PokerBench: Elevating LLMs to Master the Game of Poker', 'desc': 'PokerBench is a new benchmark designed to assess the poker-playing skills of large language models (LLMs). It focuses on the unique challenges of poker, which requires a blend of mathematical skills, strategic reasoning, and an understanding of human psychology. The benchmark includes 11,000 scenarios that cover various aspects of the game, and it has been tested on several leading models, revealing that they initially struggle with optimal poker play. However, after fine-tuning, these models show significant improvement, highlighting the need for advanced training techniques to enhance their performance in complex games.'}, 'zh': {'title': 'PokerBench:评估语言模型扑克能力的新基准', 'desc': '我们介绍了PokerBench,这是一个用于评估大型语言模型(LLMs)扑克游戏能力的基准。扑克是一种不完全信息游戏,需要数学、推理、规划、策略以及对博弈论和人类心理的深刻理解。PokerBench包含11,000个重要场景,分为翻牌前和翻牌后游戏,经过训练的扑克玩家共同开发。通过对不同模型的评估,我们发现尽管当前的LLMs在扑克游戏中表现不佳,但经过微调后,它们的表现有显著提升。'}}}, {'id': 'https://huggingface.co/papers/2501.08319', 'title': 'Enhancing Automated Interpretability with Output-Centric Feature Descriptions', 'url': 'https://huggingface.co/papers/2501.08319', 'abstract': 'Automated interpretability pipelines generate natural language descriptions for the concepts represented by features in large language models (LLMs), such as plants or the first word in a sentence. These descriptions are derived using inputs that activate the feature, which may be a dimension or a direction in the model\'s representation space. However, identifying activating inputs is costly, and the mechanistic role of a feature in model behavior is determined both by how inputs cause a feature to activate and by how feature activation affects outputs. 
Using steering evaluations, we reveal that current pipelines provide descriptions that fail to capture the causal effect of the feature on outputs. To fix this, we propose efficient, output-centric methods for automatically generating feature descriptions. These methods use the tokens weighted higher after feature stimulation or the highest weight tokens after applying the vocabulary "unembedding" head directly to the feature. Our output-centric descriptions better capture the causal effect of a feature on model outputs than input-centric descriptions, but combining the two leads to the best performance on both input and output evaluations. Lastly, we show that output-centric descriptions can be used to find inputs that activate features previously thought to be "dead".', 'score': 7, 'issue_id': 1677, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '22615e3bb16f93af', 'authors': ['Yoav Gur-Arieh', 'Roy Mayan', 'Chen Agassy', 'Atticus Geiger', 'Mor Geva'], 'affiliations': ['Blavatnik School of Computer Science and AI, Tel Aviv University', 'Pr(Ai)2R Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.08319.jpg', 'data': {'categories': ['#interpretability', '#inference', '#training', '#data'], 'emoji': '🔍', 'ru': {'title': 'Взгляд изнутри: новый метод интерпретации больших языковых моделей', 'desc': 'Статья описывает новый подход к автоматической интерпретации нейронных сетей, фокусируясь на выходных данных модели вместо входных. Авторы предлагают эффективные методы для генерации описаний признаков, основанные на токенах с наибольшим весом после стимуляции признака. Эксперименты показывают, что ориентированные на выход описания лучше отражают причинно-следственное влияние признака на результаты модели. Комбинация подходов, ориентированных на вход и выход, дает наилучшие результаты в оценке как входных, так и выходных данных.'}, 'en': {'title': 'Unlocking Feature Interpretability in Language Models', 'desc': 'This paper discusses how automated interpretability pipelines can create natural language descriptions for features in large language models (LLMs). It highlights the challenge of identifying inputs that activate these features, which is essential for understanding their role in model behavior. The authors propose new methods that focus on the output effects of features, leading to more accurate descriptions of their causal impact. By combining both input-centric and output-centric approaches, the proposed methods improve the overall interpretability of LLMs and can even identify previously overlooked features.'}, 'zh': {'title': '以输出为中心的特征描述生成方法', 'desc': '这篇论文讨论了自动化可解释性管道如何为大型语言模型中的特征生成自然语言描述。特征的描述是通过激活特征的输入生成的,但识别这些输入的过程成本高昂。研究表明,现有的描述方法未能有效捕捉特征对输出的因果影响。为此,作者提出了一种以输出为中心的方法,能够更好地生成特征描述,并结合输入和输出的评估来提高性能。'}}}, {'id': 'https://huggingface.co/papers/2501.08197', 'title': 'OpenCSG Chinese Corpus: A Series of High-quality Chinese Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08197', 'abstract': 'Large language models (LLMs) have demonstrated remarkable capabilities, but their success heavily relies on the quality of pretraining corpora. For Chinese LLMs, the scarcity of high-quality Chinese datasets presents a significant challenge, often limiting their performance. To address this issue, we propose the OpenCSG Chinese Corpus, a series of high-quality datasets specifically designed for LLM pretraining, post-training, and fine-tuning. 
This corpus includes Fineweb-edu-chinese, Fineweb-edu-chinese-v2, Cosmopedia-chinese, and Smoltalk-chinese, each with distinct characteristics: Fineweb-edu datasets focus on filtered, high-quality content derived from diverse Chinese web sources; Cosmopedia-chinese provides synthetic, textbook-style data for knowledge-intensive training; and Smoltalk-chinese emphasizes stylistic and diverse chat-format data. The OpenCSG Chinese Corpus is characterized by its high-quality text, diverse coverage across domains, and scalable, reproducible data curation processes. Additionally, we conducted extensive experimental analyses, including evaluations on smaller parameter models, which demonstrated significant performance improvements in tasks such as C-Eval, showcasing the effectiveness of the corpus for training Chinese LLMs.', 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '27267ae1a569051c', 'authors': ['Yijiong Yu', 'Ziyun Dai', 'Zekun Wang', 'Wei Wang', 'Ran Chen', 'Ji Pei'], 'affiliations': ['OpenCSG', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08197.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#synthetic', '#training', '#low_resource'], 'emoji': '🐉', 'ru': {'title': 'Прорыв в обучении китайских языковых моделей: OpenCSG Chinese Corpus', 'desc': 'Эта статья представляет OpenCSG Chinese Corpus - набор высококачественных китайских датасетов для предобучения, пост-обучения и тонкой настройки больших языковых моделей (LLM). Корпус включает в себя несколько датасетов, каждый с уникальными характеристиками: от отфильтрованного веб-контента до синтетических учебных данных и разговорных форматов. Авторы подчеркивают высокое качество текста, разнообразие тематик и масштабируемость процесса сбора данных. Эксперименты показали значительное улучшение производительности моделей на различных задачах, включая C-Eval.'}, 'en': {'title': 'Empowering Chinese LLMs with OpenCSG Corpus', 'desc': 'This paper introduces the OpenCSG Chinese Corpus, a collection of high-quality datasets aimed at improving the performance of Chinese large language models (LLMs). The corpus includes several datasets, each tailored for different training needs: Fineweb-edu datasets focus on high-quality web content, Cosmopedia-chinese offers synthetic textbook-style data, and Smoltalk-chinese provides diverse chat-format data. The authors highlight the importance of quality pretraining data for LLMs and demonstrate through experiments that using this corpus leads to significant performance gains in various evaluation tasks. Overall, the OpenCSG Chinese Corpus addresses the challenge of limited high-quality datasets for Chinese LLMs, promoting better training outcomes.'}, 'zh': {'title': '提升中文LLM性能的高质量语料库', 'desc': '大型语言模型(LLMs)在处理自然语言方面表现出色,但其成功依赖于高质量的预训练语料库。针对中文LLMs,优质中文数据集的稀缺性成为了一个重大挑战,限制了它们的性能。为了解决这个问题,我们提出了OpenCSG中文语料库,这是一系列专门为LLM预训练、后训练和微调设计的高质量数据集。该语料库包括Fineweb-edu-chinese、Fineweb-edu-chinese-v2、Cosmopedia-chinese和Smoltalk-chinese,涵盖了多样化的内容和风格,显著提升了中文LLMs的训练效果。'}}}, {'id': 'https://huggingface.co/papers/2501.08167', 'title': 'Potential and Perils of Large Language Models as Judges of Unstructured Textual Data', 'url': 'https://huggingface.co/papers/2501.08167', 'abstract': "Rapid advancements in large language models have unlocked remarkable capabilities when it comes to processing and summarizing unstructured text data. 
This has implications for the analysis of rich, open-ended datasets, such as survey responses, where LLMs hold the promise of efficiently distilling key themes and sentiments. However, as organizations increasingly turn to these powerful AI systems to make sense of textual feedback, a critical question arises, can we trust LLMs to accurately represent the perspectives contained within these text based datasets? While LLMs excel at generating human-like summaries, there is a risk that their outputs may inadvertently diverge from the true substance of the original responses. Discrepancies between the LLM-generated outputs and the actual themes present in the data could lead to flawed decision-making, with far-reaching consequences for organizations. This research investigates the effectiveness of LLMs as judge models to evaluate the thematic alignment of summaries generated by other LLMs. We utilized an Anthropic Claude model to generate thematic summaries from open-ended survey responses, with Amazon's Titan Express, Nova Pro, and Meta's Llama serving as LLM judges. The LLM-as-judge approach was compared to human evaluations using Cohen's kappa, Spearman's rho, and Krippendorff's alpha, validating a scalable alternative to traditional human centric evaluation methods. Our findings reveal that while LLMs as judges offer a scalable solution comparable to human raters, humans may still excel at detecting subtle, context-specific nuances. This research contributes to the growing body of knowledge on AI assisted text analysis. We discuss limitations and provide recommendations for future research, emphasizing the need for careful consideration when generalizing LLM judge models across various contexts and use cases.", 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '866161709624c632', 'authors': ['Rewina Bedemariam', 'Natalie Perez', 'Sreyoshi Bhaduri', 'Satya Kapoor', 'Alex Gil', 'Elizabeth Conjar', 'Ikkei Itoku', 'David Theil', 'Aman Chadha', 'Naumaan Nayyar'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08167.jpg', 'data': {'categories': ['#data', '#dataset', '#science', '#ethics', '#multimodal', '#benchmark', '#interpretability'], 'emoji': '🤖', 'ru': {'title': 'LLM как судьи: масштабируемая альтернатива человеческим оценкам в анализе текста', 'desc': 'Исследование посвящено использованию больших языковых моделей (LLM) для анализа неструктурированных текстовых данных, таких как ответы на опросы. Авторы изучают эффективность применения LLM в качестве судей для оценки тематического соответствия сгенерированных другими LLM резюме. Результаты показывают, что LLM-судьи предлагают масштабируемое решение, сопоставимое с оценками людей, хотя люди все еще могут превосходить их в обнаружении тонких, контекстно-зависимых нюансов. Исследование вносит вклад в растущий объем знаний об анализе текста с помощью искусственного интеллекта.'}, 'en': {'title': 'Trusting AI: Evaluating LLMs for Accurate Text Analysis', 'desc': 'This paper explores the use of large language models (LLMs) for summarizing and analyzing unstructured text data, particularly from open-ended survey responses. It raises concerns about the trustworthiness of LLM-generated summaries, as they may not accurately reflect the original sentiments and themes present in the data. 
The research introduces an LLM-as-judge framework, where one LLM generates summaries while others evaluate their thematic alignment, comparing this method to human evaluations. The findings suggest that while LLMs can provide a scalable alternative to human raters, they may struggle with detecting subtle nuances that humans can identify, highlighting the importance of careful application in different contexts.'}, 'zh': {'title': '信任大型语言模型的总结能力吗?', 'desc': '这篇论文探讨了大型语言模型(LLMs)在处理和总结非结构化文本数据方面的能力,尤其是在分析开放式调查反馈时的应用。研究表明,虽然LLMs能够生成类似人类的总结,但它们的输出可能与原始文本的真实主题存在偏差,这可能导致错误的决策。为了评估LLMs生成的总结与实际主题的一致性,研究使用了LLMs作为评判模型,并与人类评估进行了比较。结果显示,LLMs作为评判者提供了一种可扩展的解决方案,但人类在捕捉细微的上下文特征方面仍然表现更佳。'}}}, {'id': 'https://huggingface.co/papers/2501.07888', 'title': 'Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding', 'url': 'https://huggingface.co/papers/2501.07888', 'abstract': 'We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) designed for generating detailed and accurate video descriptions, while also exhibiting superior general video understanding capabilities. Tarsier2 achieves significant advancements through three key upgrades: (1) Scaling pre-training data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) Performing fine-grained temporal alignment during supervised fine-tuning; (3) Using model-based sampling to automatically construct preference data and applying DPO training for optimization. Extensive experiments show that Tarsier2-7B consistently outperforms leading proprietary models, including GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K benchmark, Tarsier2-7B improves F1 by 2.8\\% over GPT-4o and 5.8\\% over Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6\\% performance advantage over GPT-4o and +24.9\\% over Gemini-1.5-Pro. Tarsier2-7B also sets new state-of-the-art results across 15 public benchmarks, spanning tasks such as video question-answering, video grounding, hallucination test, and embodied question-answering, demonstrating its versatility as a robust generalist vision-language model.', 'score': 5, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '54780a4b6f93fb10', 'authors': ['Liping Yuan', 'Jiawei Wang', 'Haomiao Sun', 'Yuchen Zhang', 'Yuan Lin'], 'affiliations': ['ByteDance Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.07888.jpg', 'data': {'categories': ['#dataset', '#training', '#cv', '#hallucinations', '#optimization', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Tarsier2: Революция в понимании видео искусственным интеллектом', 'desc': 'Tarsier2 - это современная крупномасштабная модель для понимания видео и языка (LVLM), разработанная для создания детальных и точных описаний видео. Модель достигает значительных улучшений благодаря увеличению объема обучающих данных, точной временной синхронизации при тонкой настройке и применению обучения с предпочтениями (DPO). Tarsier2-7B превосходит ведущие проприетарные модели, такие как GPT-4o и Gemini 1.5 Pro, в задачах детального описания видео. 
Модель также устанавливает новые рекорды в 15 публичных бенчмарках, демонстрируя свою универсальность как надежная модель общего назначения для понимания видео и языка.'}, 'en': {'title': 'Tarsier2: Redefining Video Understanding with Advanced LVLM Technology', 'desc': "Tarsier2 is a cutting-edge large vision-language model (LVLM) that excels in generating precise and detailed descriptions of videos while showcasing advanced video comprehension skills. The model's improvements stem from three main enhancements: increasing the pre-training dataset from 11 million to 40 million video-text pairs, implementing fine-grained temporal alignment during fine-tuning, and utilizing model-based sampling for preference data construction with DPO training for optimization. Extensive testing reveals that Tarsier2-7B surpasses top proprietary models like GPT-4o and Gemini 1.5 Pro in video description tasks, achieving notable F1 score improvements on the DREAM-1K benchmark. Additionally, Tarsier2-7B sets new records across 15 public benchmarks, proving its effectiveness in various tasks such as video question-answering and video grounding."}, 'zh': {'title': 'Tarsier2:视频描述的新标杆', 'desc': 'Tarsier2是一种先进的大型视觉语言模型,专门用于生成详细且准确的视频描述,同时具备出色的视频理解能力。该模型通过三个关键升级实现了显著进步:首先,预训练数据从1100万对视频文本扩展到4000万对,增加了数据的数量和多样性;其次,在监督微调过程中进行精细的时间对齐;最后,采用基于模型的采样自动构建偏好数据,并应用DPO训练进行优化。实验结果表明,Tarsier2-7B在视频描述任务中持续超越领先的专有模型,展现出其作为强大通用视觉语言模型的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.08292', 'title': 'HALoGEN: Fantastic LLM Hallucinations and Where to Find Them', 'url': 'https://huggingface.co/papers/2501.08292', 'abstract': 'Despite their impressive ability to generate high-quality and fluent text, generative large language models (LLMs) also produce hallucinations: statements that are misaligned with established world knowledge or provided input context. However, measuring hallucination can be challenging, as having humans verify model generations on-the-fly is both expensive and time-consuming. In this work, we release HALoGEN, a comprehensive hallucination benchmark consisting of: (1) 10,923 prompts for generative models spanning nine domains including programming, scientific attribution, and summarization, and (2) automatic high-precision verifiers for each use case that decompose LLM generations into atomic units, and verify each unit against a high-quality knowledge source. We use this framework to evaluate ~150,000 generations from 14 language models, finding that even the best-performing models are riddled with hallucinations (sometimes up to 86% of generated atomic facts depending on the domain). We further define a novel error classification for LLM hallucinations based on whether they likely stem from incorrect recollection of training data (Type A errors), or incorrect knowledge in training data (Type B errors), or are fabrication (Type C errors). 
We hope our framework provides a foundation to enable the principled study of why generative models hallucinate, and advances the development of trustworthy large language models.', 'score': 5, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f6751d682ff824ed', 'authors': ['Abhilasha Ravichander', 'Shrusti Ghela', 'David Wadden', 'Yejin Choi'], 'affiliations': ['Google', 'NVIDIA', 'University of Washington'], 'pdf_title_img': 'assets/pdf/title_img/2501.08292.jpg', 'data': {'categories': ['#dataset', '#hallucinations', '#benchmark'], 'emoji': '🔍', 'ru': {'title': 'HALoGEN: Автоматическая проверка галлюцинаций в языковых моделях', 'desc': 'Эта статья представляет HALoGEN - комплексный инструмент для оценки галлюцинаций в больших языковых моделях (LLM). Авторы создали набор из 10,923 промптов в девяти различных областях и автоматические верификаторы высокой точности для проверки генераций LLM. Исследование выявило, что даже лучшие модели страдают от галлюцинаций, иногда до 86% сгенерированных фактов оказываются неверными. Авторы также предложили новую классификацию ошибок LLM, разделив их на три типа в зависимости от источника галлюцинаций.'}, 'en': {'title': 'HALoGEN: A Benchmark for Measuring Hallucinations in Language Models', 'desc': 'This paper introduces HALoGEN, a new benchmark designed to measure hallucinations in generative large language models (LLMs). Hallucinations refer to incorrect statements generated by these models that do not align with known facts or the given context. The benchmark includes over 10,000 prompts across various domains and employs automatic verifiers to assess the accuracy of model outputs. The study reveals that even top-performing models exhibit significant hallucinations, prompting a classification system for different types of errors to better understand their origins and improve model reliability.'}, 'zh': {'title': '揭示生成模型的幻觉问题', 'desc': '尽管生成性大型语言模型(LLMs)能够生成高质量和流畅的文本,但它们也会产生幻觉,即与已知世界知识或输入上下文不一致的陈述。测量幻觉的难度在于,实时验证模型生成的内容既昂贵又耗时。为此,我们推出了HALoGEN,这是一个全面的幻觉基准,包含10,923个跨越九个领域的提示和自动高精度验证器。我们的研究发现,即使是表现最好的模型,其生成的原子事实中也有高达86%可能存在幻觉,这为理解生成模型的幻觉提供了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.08284', 'title': 'AfriHate: A Multilingual Collection of Hate Speech and Abusive Language Datasets for African Languages', 'url': 'https://huggingface.co/papers/2501.08284', 'abstract': 'Hate speech and abusive language are global phenomena that need socio-cultural background knowledge to be understood, identified, and moderated. However, in many regions of the Global South, there have been several documented occurrences of (1) absence of moderation and (2) censorship due to the reliance on keyword spotting out of context. Further, high-profile individuals have frequently been at the center of the moderation process, while large and targeted hate speech campaigns against minorities have been overlooked. These limitations are mainly due to the lack of high-quality data in the local languages and the failure to include local communities in the collection, annotation, and moderation processes. To address this issue, we present AfriHate: a multilingual collection of hate speech and abusive language datasets in 15 African languages. Each instance in AfriHate is annotated by native speakers familiar with the local culture. We report the challenges related to the construction of the datasets and present various classification baseline results with and without using LLMs. 
The datasets, individual annotations, and hate speech and offensive language lexicons are available on https://github.com/AfriHate/AfriHate', 'score': 3, 'issue_id': 1676, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '8c76dd102740009c', 'authors': ['Shamsuddeen Hassan Muhammad', 'Idris Abdulmumin', 'Abinew Ali Ayele', 'David Ifeoluwa Adelani', 'Ibrahim Said Ahmad', 'Saminu Mohammad Aliyu', 'Nelson Odhiambo Onyango', 'Lilian D. A. Wanzare', 'Samuel Rutunda', 'Lukman Jibril Aliyu', 'Esubalew Alemneh', 'Oumaima Hourrane', 'Hagos Tesfahun Gebremichael', 'Elyas Abdi Ismail', 'Meriem Beloucif', 'Ebrahim Chekol Jibril', 'Andiswa Bukula', 'Rooweither Mabuya', 'Salomey Osei', 'Abigail Oppong', 'Tadesse Destaw Belay', 'Tadesse Kebede Guge', 'Tesfa Tegegne Asfaw', 'Chiamaka Ijeoma Chukwuneke', 'Paul Röttger', 'Seid Muhie Yimam', 'Nedjma Ousidhoum'], 'affiliations': ['Addis Ababa University', 'Al Akhawayn University', 'Bahir Dar University', 'Bayero University Kano', 'Bocconi University', 'Cardiff University', 'DSFSI, University of Pretoria', 'Digital Umuganda', 'Haramaya University', 'HausaNLP', 'Imperial College London', 'Independent Researcher', 'Instituto Politécnico Nacional', 'Istanbul Technical University', 'Lancaster University', 'Maseno University', 'Mila, McGill University & Canada CIFAR AI Chair', 'Northeastern University', 'SADiLaR', 'University of Deusto', 'University of Hamburg', 'Uppsala University', 'Wollo University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08284.jpg', 'data': {'categories': ['#dataset', '#ethics', '#multilingual', '#data', '#low_resource', '#open_source'], 'emoji': '🌍', 'ru': {'title': 'AfriHate: борьба с языком вражды в Африке с помощью локальных данных и экспертизы', 'desc': 'Статья представляет AfriHate - многоязычный набор данных по языку вражды и оскорбительной лексике на 15 африканских языках. Датасет создан для решения проблемы недостатка качественных данных на местных языках и отсутствия вовлечения локальных сообществ в процессы сбора, разметки и модерации контента. Каждый пример в AfriHate размечен носителями языка, знакомыми с местной культурой. Авторы описывают трудности, связанные с созданием датасетов, и представляют результаты базовых классификационных моделей, в том числе с использованием языковых моделей.'}, 'en': {'title': 'Empowering Local Voices Against Hate Speech with AfriHate', 'desc': 'This paper addresses the challenges of identifying and moderating hate speech in the Global South, particularly in African languages. It highlights the limitations of existing moderation techniques that rely on keyword spotting without cultural context, leading to ineffective censorship and oversight of targeted hate campaigns. To combat this, the authors introduce AfriHate, a multilingual dataset of hate speech and abusive language in 15 African languages, annotated by native speakers. 
The paper also discusses the difficulties faced during dataset construction and presents baseline classification results, demonstrating the potential of using large language models (LLMs) for this task.'}, 'zh': {'title': '构建多语言仇恨言论数据集,助力社会文化理解', 'desc': '本论文介绍了AfriHate,这是一个包含15种非洲语言的仇恨言论和辱骂语言数据集。该数据集由熟悉当地文化的母语者进行标注,以解决全球南方地区在仇恨言论管理中的数据缺乏问题。研究还探讨了数据集构建过程中的挑战,并展示了使用和不使用大型语言模型(LLMs)进行分类的基线结果。所有数据集、标注和相关词汇表均可在指定网站上获取。'}}}, {'id': 'https://huggingface.co/papers/2501.08120', 'title': 'In-situ graph reasoning and knowledge expansion using Graph-PReFLexOR', 'url': 'https://huggingface.co/papers/2501.08120', 'abstract': "The pursuit of automated scientific discovery has fueled progress from symbolic logic to modern AI, forging new frontiers in reasoning and pattern recognition. Transformers function as potential systems, where every possible relationship remains latent potentiality until tasks impose constraints, akin to measurement. Yet, refining their sampling requires more than probabilistic selection: solutions must conform to specific structures or rules, ensuring consistency and the invocation of general principles. We present Graph-PReFLexOR (Graph-based Preference-based Recursive Language Modeling for Exploratory Optimization of Reasoning), a framework that combines graph reasoning with symbolic abstraction to dynamically expand domain knowledge. Inspired by reinforcement learning, Graph-PReFLexOR defines reasoning as a structured mapping, where tasks yield knowledge graphs, abstract patterns, and ultimately, final answers. Inspired by category theory, it encodes concepts as nodes and their relationships as edges, supporting hierarchical inference and adaptive learning through isomorphic representations. Demonstrations include hypothesis generation, materials design, and creative reasoning, such as discovering relationships between mythological concepts like 'thin places' with materials science. We propose a 'knowledge garden growth' strategy that integrates insights across domains, promoting interdisciplinary connections. Results with a 3-billion-parameter Graph-PReFLexOR model show superior reasoning depth and adaptability, underscoring the potential for transparent, multidisciplinary AI-driven discovery. It lays the groundwork for general autonomous reasoning solutions.", 'score': 1, 'issue_id': 1683, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f8f5360d1fb8fb75', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics, MIT, Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.08120.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agents', '#graphs', '#rl', '#science', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Graph-PReFLexOR: Новый горизонт в автономном научном открытии', 'desc': 'Статья представляет Graph-PReFLexOR - фреймворк, объединяющий графовые рассуждения с символьной абстракцией для динамического расширения предметных знаний. Вдохновленный теорией категорий, он кодирует концепции как узлы, а их отношения как ребра, поддерживая иерархический вывод и адаптивное обучение. Демонстрации включают генерацию гипотез, дизайн материалов и творческие рассуждения, такие как обнаружение связей между мифологическими концепциями и материаловедением. 
Результаты с 3-миллиардной моделью Graph-PReFLexOR показывают превосходную глубину рассуждений и адаптивность, подчеркивая потенциал для прозрачных, междисциплинарных решений на основе ИИ.'}, 'en': {'title': 'Empowering AI with Graph-Based Reasoning for Scientific Discovery', 'desc': 'This paper introduces Graph-PReFLexOR, a novel framework that enhances automated scientific discovery by integrating graph reasoning with symbolic abstraction. It utilizes a structured mapping approach inspired by reinforcement learning, allowing for the generation of knowledge graphs and abstract patterns from various tasks. The framework supports hierarchical inference and adaptive learning, enabling it to explore interdisciplinary connections effectively. Demonstrations of its capabilities include hypothesis generation and creative reasoning, showcasing its potential for deep and adaptable reasoning in AI-driven discovery.'}, 'zh': {'title': '知识花园的成长:跨领域的智能推理', 'desc': '这篇论文介绍了一种名为Graph-PReFLexOR的框架,它结合了图推理和符号抽象,以动态扩展领域知识。该框架通过结构化映射定义推理,利用知识图谱和抽象模式来生成最终答案。它的灵感来自强化学习和范畴理论,将概念编码为节点,关系编码为边,支持层次推理和自适应学习。实验结果表明,Graph-PReFLexOR在推理深度和适应性方面表现优越,为自动化推理解决方案奠定了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.07556', 'title': 'MatchAnything: Universal Cross-Modality Image Matching with Large-Scale Pre-Training', 'url': 'https://huggingface.co/papers/2501.07556', 'abstract': 'Image matching, which aims to identify corresponding pixel locations between images, is crucial in a wide range of scientific disciplines, aiding in image registration, fusion, and analysis. In recent years, deep learning-based image matching algorithms have dramatically outperformed humans in rapidly and accurately finding large amounts of correspondences. However, when dealing with images captured under different imaging modalities that result in significant appearance changes, the performance of these algorithms often deteriorates due to the scarcity of annotated cross-modal training data. This limitation hinders applications in various fields that rely on multiple image modalities to obtain complementary information. To address this challenge, we propose a large-scale pre-training framework that utilizes synthetic cross-modal training signals, incorporating diverse data from various sources, to train models to recognize and match fundamental structures across images. This capability is transferable to real-world, unseen cross-modality image matching tasks. Our key finding is that the matching model trained with our framework achieves remarkable generalizability across more than eight unseen cross-modality registration tasks using the same network weight, substantially outperforming existing methods, whether designed for generalization or tailored for specific tasks. 
This advancement significantly enhances the applicability of image matching technologies across various scientific disciplines and paves the way for new applications in multi-modality human and artificial intelligence analysis and beyond.', 'score': 0, 'issue_id': 1688, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': 'ad0c408491c545d5', 'authors': ['Xingyi He', 'Hao Yu', 'Sida Peng', 'Dongli Tan', 'Zehong Shen', 'Hujun Bao', 'Xiaowei Zhou'], 'affiliations': ['Shandong University', 'State Key Lab of CAD&CG, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07556.jpg', 'data': {'categories': ['#synthetic', '#dataset', '#multimodal', '#transfer_learning', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Универсальное сопоставление изображений разных модальностей с помощью глубокого обучения', 'desc': 'Статья представляет новый подход к сопоставлению изображений разных модальностей с использованием глубокого обучения. Авторы предлагают фреймворк для предварительного обучения на синтетических кросс-модальных данных, что позволяет модели распознавать фундаментальные структуры в изображениях. Обученная модель демонстрирует впечатляющую обобщаемость на более чем восемь новых задач кросс-модальной регистрации, значительно превосходя существующие методы. Это достижение открывает новые возможности для применения технологий сопоставления изображений в различных научных дисциплинах.'}, 'en': {'title': 'Enhancing Image Matching Across Modalities with Synthetic Training', 'desc': "This paper presents a new framework for image matching that helps identify corresponding pixel locations between images taken in different ways. Traditional deep learning methods struggle with this task due to a lack of annotated training data for different image types. The proposed solution uses synthetic training signals from diverse sources to improve the model's ability to recognize and match structures across various images. As a result, the model shows excellent performance in unseen cross-modal tasks, making it highly useful for applications in many scientific fields."}, 'zh': {'title': '跨模态图像匹配的新突破', 'desc': '本文提出了一种大规模预训练框架,用于解决图像匹配中的跨模态问题。该框架利用合成的跨模态训练信号,结合来自不同来源的多样化数据,训练模型识别和匹配图像中的基本结构。研究发现,使用该框架训练的匹配模型在超过八个未见的跨模态配准任务中表现出显著的泛化能力,远超现有方法。此进展大大增强了图像匹配技术在各科学领域的适用性,并为多模态人类和人工智能分析的新应用铺平了道路。'}}}, {'id': 'https://huggingface.co/papers/2501.01895', 'title': 'EnerVerse: Envisioning Embodied Future Space for Robotics Manipulation', 'url': 'https://huggingface.co/papers/2501.01895', 'abstract': "We introduce EnerVerse, a comprehensive framework for embodied future space generation specifically designed for robotic manipulation tasks. EnerVerse seamlessly integrates convolutional and bidirectional attention mechanisms for inner-chunk space modeling, ensuring low-level consistency and continuity. Recognizing the inherent redundancy in video data, we propose a sparse memory context combined with a chunkwise unidirectional generative paradigm to enable the generation of infinitely long sequences. To further augment robotic capabilities, we introduce the Free Anchor View (FAV) space, which provides flexible perspectives to enhance observation and analysis. The FAV space mitigates motion modeling ambiguity, removes physical constraints in confined environments, and significantly improves the robot's generalization and adaptability across various tasks and settings. 
To address the prohibitive costs and labor intensity of acquiring multi-camera observations, we present a data engine pipeline that integrates a generative model with 4D Gaussian Splatting (4DGS). This pipeline leverages the generative model's robust generalization capabilities and the spatial constraints provided by 4DGS, enabling an iterative enhancement of data quality and diversity, thus creating a data flywheel effect that effectively narrows the sim-to-real gap. Finally, our experiments demonstrate that the embodied future space generation prior substantially enhances policy predictive capabilities, resulting in improved overall performance, particularly in long-range robotic manipulation tasks.", 'score': 41, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'bae2a6e63f87958d', 'authors': ['Siyuan Huang', 'Liliang Chen', 'Pengfei Zhou', 'Shengcong Chen', 'Zhengkai Jiang', 'Yue Hu', 'Peng Gao', 'Hongsheng Li', 'Maoqing Yao', 'Guanghui Ren'], 'affiliations': ['AgiBot', 'CUHK', 'FDU', 'HIT', 'HKUST', 'SJTU', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.01895.jpg', 'data': {'categories': ['#3d', '#data', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'EnerVerse: Революция в пространственном моделировании для роботов-манипуляторов', 'desc': 'EnerVerse - это комплексная система для генерации пространства будущего в задачах роботизированной манипуляции. Она использует сверточные механизмы и двунаправленное внимание для моделирования внутренних фрагментов пространства, обеспечивая согласованность на низком уровне. Система вводит пространство Free Anchor View для гибких перспектив наблюдения и анализа, улучшая обобщение и адаптивность робота. EnerVerse также включает конвейер данных, интегрирующий генеративную модель с 4D Gaussian Splatting для сужения разрыва между симуляцией и реальностью.'}, 'en': {'title': 'Empowering Robots with EnerVerse: A New Era in Space Generation and Manipulation', 'desc': 'EnerVerse is a new framework designed to help robots better understand and manipulate their environments. It uses advanced techniques like convolutional and bidirectional attention mechanisms to create a consistent model of space. By recognizing that video data often has unnecessary information, EnerVerse employs a sparse memory context to generate long sequences efficiently. Additionally, the Free Anchor View (FAV) space allows robots to observe from different angles, improving their ability to adapt and perform tasks in various settings.'}, 'zh': {'title': 'EnerVerse:提升机器人操作的未来空间生成框架', 'desc': '本文介绍了EnerVerse,这是一个专为机器人操作任务设计的未来空间生成框架。EnerVerse结合了卷积和双向注意机制,以确保内部空间建模的一致性和连续性。我们提出了一种稀疏记忆上下文和单向生成范式的结合,能够生成无限长的序列,从而提高机器人的能力。通过引入自由锚视图空间(FAV),我们增强了观察和分析的灵活性,显著改善了机器人在各种任务和环境中的泛化能力和适应性。'}}}, {'id': 'https://huggingface.co/papers/2501.01957', 'title': 'VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction', 'url': 'https://huggingface.co/papers/2501.01957', 'abstract': 'Recent Multimodal Large Language Models (MLLMs) have typically focused on integrating visual and textual modalities, with less emphasis placed on the role of speech in enhancing interaction. However, speech plays a crucial role in multimodal dialogue systems, and implementing high-performance in both vision and speech tasks remains a significant challenge due to the fundamental modality differences. 
In this paper, we propose a carefully designed multi-stage training methodology that progressively trains LLM to understand both visual and speech information, ultimately enabling fluent vision and speech interaction. Our approach not only preserves strong vision-language capacity, but also enables efficient speech-to-speech dialogue capabilities without separate ASR and TTS modules, significantly accelerating multimodal end-to-end response speed. By comparing our method against state-of-the-art counterparts across benchmarks for image, video, and speech tasks, we demonstrate that our model is equipped with both strong visual and speech capabilities, making near real-time vision and speech interaction.', 'score': 19, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'b6690c7efedf5a39', 'authors': ['Chaoyou Fu', 'Haojia Lin', 'Xiong Wang', 'Yi-Fan Zhang', 'Yunhang Shen', 'Xiaoyu Liu', 'Yangze Li', 'Zuwei Long', 'Heting Gao', 'Ke Li', 'Xiawu Zheng', 'Rongrong Ji', 'Xing Sun', 'Caifeng Shan', 'Ran He'], 'affiliations': ['CASIA', 'NJU', 'Tencent Youtu Lab', 'XMU'], 'pdf_title_img': 'assets/pdf/title_img/2501.01957.jpg', 'data': {'categories': ['#training', '#cv', '#multimodal', '#benchmark', '#audio'], 'emoji': '🗣️', 'ru': {'title': 'Революция в мультимодальном взаимодействии: речь и зрение в одной модели', 'desc': 'В статье представлена новая методология обучения мультимодальных языковых моделей, объединяющая визуальную и речевую модальности. Авторы предлагают поэтапный подход к обучению, который позволяет модели эффективно понимать как визуальную, так и речевую информацию. Модель демонстрирует высокую производительность в задачах обработки изображений, видео и речи, превосходя современные аналоги. Этот подход обеспечивает возможность ведения диалога с использованием речи и изображений в режиме, близком к реальному времени.'}, 'en': {'title': 'Enhancing Multimodal Interaction with Speech and Vision Integration', 'desc': 'This paper introduces a novel training methodology for Multimodal Large Language Models (MLLMs) that enhances their ability to process both visual and speech data. The proposed multi-stage training approach allows the model to progressively learn and integrate information from images, videos, and spoken language, facilitating seamless interaction. By eliminating the need for separate Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) modules, the model achieves faster response times in multimodal dialogues. Experimental results show that this method not only maintains strong vision-language performance but also excels in speech tasks, enabling near real-time interactions.'}, 'zh': {'title': '实现流畅的视觉与语音交互', 'desc': '最近的多模态大型语言模型(MLLMs)主要集中在视觉和文本的整合上,而对语音在增强交互中的作用关注较少。然而,语音在多模态对话系统中起着至关重要的作用,如何在视觉和语音任务中实现高性能仍然是一个重大挑战。本文提出了一种精心设计的多阶段训练方法,逐步训练大型语言模型理解视觉和语音信息,从而实现流畅的视觉和语音交互。我们的方法不仅保持了强大的视觉-语言能力,还实现了高效的语音对话能力,显著加快了多模态端到端的响应速度。'}}}, {'id': 'https://huggingface.co/papers/2501.01904', 'title': 'Virgo: A Preliminary Exploration on Reproducing o1-like MLLM', 'url': 'https://huggingface.co/papers/2501.01904', 'abstract': 'Recently, slow-thinking reasoning systems, built upon large language models (LLMs), have garnered widespread attention by scaling the thinking time during inference. There is also growing interest in adapting this capability to multimodal large language models (MLLMs). 
Given that MLLMs handle more complex data semantics across different modalities, it is intuitively more challenging to implement multimodal slow-thinking systems. To address this issue, in this paper, we explore a straightforward approach by fine-tuning a capable MLLM with a small amount of textual long-form thought data, resulting in a multimodal slow-thinking system, Virgo (Visual reasoning with long thought). We find that these long-form reasoning processes, expressed in natural language, can be effectively transferred to MLLMs. Moreover, it seems that such textual reasoning data can be even more effective than visual reasoning data in eliciting the slow-thinking capacities of MLLMs. While this work is preliminary, it demonstrates that slow-thinking capacities are fundamentally associated with the language model component, which can be transferred across modalities or domains. This finding can be leveraged to guide the development of more powerful slow-thinking reasoning systems. We release our resources at https://github.com/RUCAIBox/Virgo.', 'score': 12, 'issue_id': 1505, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '576423a20b419d0f', 'authors': ['Yifan Du', 'Zikang Liu', 'Yifan Li', 'Wayne Xin Zhao', 'Yuqi Huo', 'Bingning Wang', 'Weipeng Chen', 'Zheng Liu', 'Zhongyuan Wang', 'Ji-Rong Wen'], 'affiliations': ['BAAI', 'Baichuan AI', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01904.jpg', 'data': {'categories': ['#reasoning', '#multimodal', '#transfer_learning', '#training'], 'emoji': '🧠', 'ru': {'title': 'Обучение мультимодальных ИИ длительным рассуждениям через текст', 'desc': 'Статья описывает исследование в области мультимодальных больших языковых моделей (MLLM) и их способности к медленному мышлению. Авторы предлагают метод Virgo, который позволяет обучить MLLM длительным рассуждениям с помощью небольшого количества текстовых данных. Результаты показывают, что текстовые данные для обучения рассуждениям могут быть даже эффективнее визуальных. Это исследование демонстрирует, что способности к медленному мышлению в основном связаны с языковым компонентом модели и могут переноситься между модальностями.'}, 'en': {'title': 'Unlocking Slow-Thinking in Multimodal Models with Textual Reasoning', 'desc': 'This paper discusses the development of a multimodal slow-thinking reasoning system called Virgo, which is based on fine-tuning a multimodal large language model (MLLM) using long-form textual reasoning data. The authors found that incorporating long-form reasoning in natural language significantly enhances the slow-thinking capabilities of MLLMs, even more so than using visual reasoning data. This suggests that the slow-thinking abilities are closely linked to the language model aspect, allowing for effective transfer across different data modalities. 
The research indicates a promising direction for creating advanced reasoning systems that can handle complex data semantics.'}, 'zh': {'title': '多模态慢思维推理的探索', 'desc': '最近,基于大型语言模型(LLMs)的慢思维推理系统引起了广泛关注,尤其是在推理过程中延长思考时间的能力。本文探讨了如何将这种能力应用于多模态大型语言模型(MLLMs),尽管处理不同模态的复杂数据语义更具挑战性。我们通过微调一个强大的MLLM,使用少量的长文本思维数据,成功构建了一个多模态慢思维系统,命名为Virgo(视觉推理与长思维)。研究表明,长文本推理过程可以有效转移到MLLMs,并且这种文本推理数据在激发MLLMs的慢思维能力方面,似乎比视觉推理数据更有效。'}}}, {'id': 'https://huggingface.co/papers/2412.21059', 'title': 'VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation', 'url': 'https://huggingface.co/papers/2412.21059', 'abstract': 'We present a general strategy to aligning visual generation models -- both image and video generation -- with human preference. To start with, we build VisionReward -- a fine-grained and multi-dimensional reward model. We decompose human preferences in images and videos into multiple dimensions, each represented by a series of judgment questions, linearly weighted and summed to an interpretable and accurate score. To address the challenges of video quality assessment, we systematically analyze various dynamic features of videos, which helps VisionReward surpass VideoScore by 17.2% and achieve top performance for video preference prediction. Based on VisionReward, we develop a multi-objective preference learning algorithm that effectively addresses the issue of confounding factors within preference data. Our approach significantly outperforms existing image and video scoring methods on both machine metrics and human evaluation. All code and datasets are provided at https://github.com/THUDM/VisionReward.', 'score': 11, 'issue_id': 1510, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '1f3bb267ffa751d9', 'authors': ['Jiazheng Xu', 'Yu Huang', 'Jiale Cheng', 'Yuanming Yang', 'Jiajun Xu', 'Yuan Wang', 'Wenbo Duan', 'Shen Yang', 'Qunlin Jin', 'Shurun Li', 'Jiayan Teng', 'Zhuoyi Yang', 'Wendi Zheng', 'Xiao Liu', 'Ming Ding', 'Xiaohan Zhang', 'Xiaotao Gu', 'Shiyu Huang', 'Minlie Huang', 'Jie Tang', 'Yuxiao Dong'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2412.21059.jpg', 'data': {'categories': ['#rag', '#training', '#open_source', '#cv', '#video', '#optimization', '#alignment'], 'emoji': '🎥', 'ru': {'title': 'VisionReward: многомерная оценка визуального контента с учетом человеческих предпочтений', 'desc': 'Исследователи представили стратегию для согласования моделей генерации визуального контента с человеческими предпочтениями. Они разработали VisionReward - многомерную модель вознаграждения, которая декомпозирует предпочтения в изображениях и видео на несколько измерений. Для оценки качества видео были проанализированы различные динамические характеристики, что позволило VisionReward превзойти существующие методы на 17.2%. На основе VisionReward был разработан алгоритм многоцелевого обучения предпочтениям, эффективно решающий проблему конфаундинг-факторов в данных о предпочтениях.'}, 'en': {'title': 'Aligning Visual Generation with Human Preferences', 'desc': 'This paper introduces a method for aligning visual generation models, such as those for images and videos, with human preferences. The authors create a reward model called VisionReward, which breaks down human preferences into multiple dimensions assessed through specific judgment questions. 
They enhance video quality assessment by analyzing dynamic features, leading to a 17.2% improvement over previous methods. Additionally, a multi-objective preference learning algorithm is developed to manage confounding factors in preference data, resulting in superior performance compared to existing scoring methods.'}, 'zh': {'title': '视觉生成模型与人类偏好的完美对齐', 'desc': '本文提出了一种通用策略,用于将视觉生成模型(包括图像和视频生成)与人类偏好对齐。我们构建了VisionReward,这是一个细粒度和多维度的奖励模型,能够将人类对图像和视频的偏好分解为多个维度。通过分析视频的动态特征,VisionReward在视频偏好预测中超越了现有方法,提升了17.2%的性能。基于VisionReward,我们开发了一种多目标偏好学习算法,有效解决了偏好数据中的混淆因素问题。'}}}, {'id': 'https://huggingface.co/papers/2501.01821', 'title': 'SDPO: Segment-Level Direct Preference Optimization for Social Agents', 'url': 'https://huggingface.co/papers/2501.01821', 'abstract': "Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level methods. The turn-level method is overly fine-grained, focusing exclusively on individual turns, while session-level methods are too coarse-grained, often introducing training noise. To address these limitations, we propose Segment-Level Direct Preference Optimization (SDPO), which focuses on specific key segments within interactions to optimize multi-turn agent behavior while minimizing training noise. Evaluations on the SOTOPIA benchmark demonstrate that SDPO-tuned agents consistently outperform both existing DPO-based methods and proprietary LLMs like GPT-4o, underscoring SDPO's potential to advance the social intelligence of LLM-based agents. We release our code and data at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/SDPO.", 'score': 10, 'issue_id': 1514, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '499b008b0bce4f74', 'authors': ['Aobo Kong', 'Wentao Ma', 'Shiwan Zhao', 'Yongbin Li', 'Yuchuan Wu', 'Ke Wang', 'Xiaoqian Liu', 'Qicheng Li', 'Yong Qin', 'Fei Huang'], 'affiliations': ['TMCC, CS, Nankai University', 'Tongyi Lab', 'alibaba-inc.com'], 'pdf_title_img': 'assets/pdf/title_img/2501.01821.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#rlhf', '#agents', '#alignment', '#training'], 'emoji': '🤖', 'ru': {'title': 'SDPO: Новый шаг к созданию социально интеллектуальных ИИ-агентов', 'desc': 'В статье представлен новый метод оптимизации поведения языковых моделей (LLM) в сложных многоходовых социальных диалогах - Segment-Level Direct Preference Optimization (SDPO). SDPO фокусируется на ключевых сегментах взаимодействия, что позволяет эффективнее оптимизировать поведение агентов по сравнению с существующими методами. Эксперименты на бенчмарке SOTOPIA показали, что агенты, настроенные с помощью SDPO, превосходят как другие методы на основе DPO, так и проприетарные модели вроде GPT-4. Это демонстрирует потенциал SDPO для повышения социального интеллекта агентов на основе LLM.'}, 'en': {'title': 'Enhancing Social Intelligence in LLMs with SDPO', 'desc': "This paper introduces Segment-Level Direct Preference Optimization (SDPO), a new method for improving the performance of social agents powered by large language models (LLMs) in complex dialogues. 
Unlike existing methods that either focus too narrowly on individual turns or too broadly on entire sessions, SDPO targets specific key segments of conversations to better align agent behavior with human preferences. The approach reduces training noise and enhances the agent's ability to engage in multi-turn interactions effectively. Evaluations show that agents trained with SDPO outperform both traditional DPO methods and advanced LLMs like GPT-4o, highlighting its effectiveness in enhancing social intelligence."}, 'zh': {'title': '提升社交智能的新方法:分段级直接偏好优化', 'desc': '本论文提出了一种新的方法,称为分段级直接偏好优化(SDPO),旨在提高大型语言模型(LLM)在多轮社交对话中的表现。现有的直接偏好优化(DPO)方法在处理多轮交互时存在细粒度和粗粒度的局限性,导致训练噪声。SDPO通过关注交互中的关键段落,优化代理的多轮行为,从而减少训练噪声。实验结果表明,SDPO调优的代理在SOTOPIA基准测试中表现优于现有的DPO方法和其他大型语言模型,显示出其在提升社交智能方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01073', 'title': 'Graph Generative Pre-trained Transformer', 'url': 'https://huggingface.co/papers/2501.01073', 'abstract': "Graph generation is a critical task in numerous domains, including molecular design and social network analysis, due to its ability to model complex relationships and structured data. While most modern graph generative models utilize adjacency matrix representations, this work revisits an alternative approach that represents graphs as sequences of node set and edge set. We advocate for this approach due to its efficient encoding of graphs and propose a novel representation. Based on this representation, we introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns graph structures via next-token prediction. To further exploit G2PT's capabilities as a general-purpose foundation model, we explore fine-tuning strategies for two downstream applications: goal-oriented generation and graph property prediction. We conduct extensive experiments across multiple datasets. Results indicate that G2PT achieves superior generative performance on both generic graph and molecule datasets. Furthermore, G2PT exhibits strong adaptability and versatility in downstream tasks from molecular design to property prediction.", 'score': 9, 'issue_id': 1508, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '596abc88d57e0650', 'authors': ['Xiaohui Chen', 'Yinkai Wang', 'Jiaxing He', 'Yuanqi Du', 'Soha Hassoun', 'Xiaolin Xu', 'Li-Ping Liu'], 'affiliations': ['Cornell University', 'Northeastern University', 'Tufts University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01073.jpg', 'data': {'categories': ['#dataset', '#optimization', '#training', '#architecture', '#data', '#graphs'], 'emoji': '🕸️', 'ru': {'title': 'G2PT: Универсальный трансформер для эффективной генерации графов', 'desc': 'В статье представлена новая модель генерации графов - Graph Generative Pre-trained Transformer (G2PT). G2PT использует альтернативный подход к представлению графов в виде последовательностей множеств узлов и рёбер вместо матриц смежности. Модель обучается предсказывать следующий токен авторегрессивным способом. G2PT показывает превосходные результаты в генерации как общих графов, так и молекул, а также демонстрирует хорошую адаптивность к различным задачам.'}, 'en': {'title': 'Revolutionizing Graph Generation with G2PT', 'desc': 'This paper focuses on improving graph generation, which is important for tasks like designing molecules and analyzing social networks. 
Instead of using the common adjacency matrix, it proposes a new way to represent graphs as sequences of node and edge sets, making the encoding more efficient. The authors introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns to generate graph structures by predicting the next token in a sequence. Through various experiments, they demonstrate that G2PT outperforms existing models in generating graphs and is effective in applications like molecular design and predicting graph properties.'}, 'zh': {'title': '图生成的创新:G2PT模型', 'desc': '图生成在许多领域中非常重要,比如分子设计和社交网络分析,因为它能够建模复杂的关系和结构化数据。本文提出了一种新的图表示方法,将图表示为节点集和边集的序列,而不是传统的邻接矩阵。基于这种表示,我们引入了图生成预训练变换器(G2PT),这是一种通过下一个标记预测学习图结构的自回归模型。实验结果表明,G2PT在通用图和分子数据集上表现出色,并且在分子设计和属性预测等下游任务中具有很强的适应性和多功能性。'}}}, {'id': 'https://huggingface.co/papers/2501.00874', 'title': 'LUSIFER: Language Universal Space Integration for Enhanced Multilingual Embeddings with Large Language Models', 'url': 'https://huggingface.co/papers/2501.00874', 'abstract': "Recent advancements in large language models (LLMs) based embedding models have established new state-of-the-art benchmarks for text embedding tasks, particularly in dense vector-based retrieval. However, these models predominantly focus on English, leaving multilingual embedding capabilities largely unexplored. To address this limitation, we present LUSIFER, a novel zero-shot approach that adapts LLM-based embedding models for multilingual tasks without requiring multilingual supervision. LUSIFER's architecture combines a multilingual encoder, serving as a language-universal learner, with an LLM-based embedding model optimized for embedding-specific tasks. These components are seamlessly integrated through a minimal set of trainable parameters that act as a connector, effectively transferring the multilingual encoder's language understanding capabilities to the specialized embedding model. Additionally, to comprehensively evaluate multilingual embedding performance, we introduce a new benchmark encompassing 5 primary embedding tasks, 123 diverse datasets, and coverage across 14 languages. Extensive experimental results demonstrate that LUSIFER significantly enhances the multilingual performance across various embedding tasks, particularly for medium and low-resource languages, without requiring explicit multilingual training data.", 'score': 7, 'issue_id': 1507, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': '5bdfec436923a2a6', 'authors': ['Hieu Man', 'Nghia Trung Ngo', 'Viet Dac Lai', 'Ryan A. Rossi', 'Franck Dernoncourt', 'Thien Huu Nguyen'], 'affiliations': ['Adobe Research, USA', 'Dept. of Computer Science, University of Oregon, OR, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.00874.jpg', 'data': {'categories': ['#transfer_learning', '#architecture', '#benchmark', '#multilingual', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Универсальные многоязычные эмбеддинги без многоязычного обучения', 'desc': 'LUSIFER - это новый подход к созданию многоязычных эмбеддингов без использования многоязычных обучающих данных. Он объединяет многоязычный энкодер и LLM-модель для эмбеддингов через набор обучаемых параметров. Авторы также представили новый бенчмарк для оценки качества многоязычных эмбеддингов, охватывающий 5 основных задач, 123 датасета и 14 языков. 
Эксперименты показали, что LUSIFER значительно улучшает многоязычную производительность, особенно для языков с ограниченными ресурсами.'}, 'en': {'title': 'LUSIFER: Bridging Multilingual Gaps in Text Embedding', 'desc': "This paper introduces LUSIFER, a new method that enhances large language models (LLMs) for multilingual text embedding tasks. Unlike existing models that mainly focus on English, LUSIFER uses a zero-shot approach to adapt LLMs for multiple languages without needing multilingual training data. It combines a multilingual encoder with an LLM-based embedding model, allowing for effective language understanding and embedding performance. The authors also present a comprehensive benchmark to evaluate LUSIFER's performance across various languages and tasks, showing significant improvements, especially for less-resourced languages."}, 'zh': {'title': 'LUSIFER:无监督多语言嵌入的新突破', 'desc': '最近,大型语言模型(LLMs)在文本嵌入任务中取得了新的突破,尤其是在基于密集向量的检索方面。然而,这些模型主要集中在英语上,导致多语言嵌入能力尚未得到充分探索。为了解决这个问题,我们提出了LUSIFER,这是一种新颖的零样本方法,可以在不需要多语言监督的情况下,将LLM嵌入模型适应于多语言任务。LUSIFER的架构结合了一个多语言编码器和一个针对嵌入特定任务优化的LLM嵌入模型,通过一组最小的可训练参数实现无缝连接,有效地将多语言编码器的语言理解能力转移到专门的嵌入模型上。'}}}, {'id': 'https://huggingface.co/papers/2501.01540', 'title': 'BoxingGym: Benchmarking Progress in Automated Experimental Design and Model Discovery', 'url': 'https://huggingface.co/papers/2501.01540', 'abstract': "Understanding the world and explaining it with scientific theories is a central aspiration of artificial intelligence research. Proposing theories, designing experiments to test them, and then revising them based on data are fundamental to scientific discovery. Despite the significant promise of LLM-based scientific agents, no benchmarks systematically test LLM's ability to propose scientific models, collect experimental data, and revise them in light of new data. We introduce BoxingGym, a benchmark with 10 environments for systematically evaluating both experimental design (e.g. collecting data to test a scientific theory) and model discovery (e.g. proposing and revising scientific theories). To enable tractable and quantitative evaluation, we implement each environment as a generative probabilistic model with which a scientific agent can run interactive experiments. These probabilistic models are drawn from various real-world scientific domains ranging from psychology to ecology. To quantitatively evaluate a scientific agent's ability to collect informative experimental data, we compute the expected information gain (EIG), an information-theoretic quantity which measures how much an experiment reduces uncertainty about the parameters of a generative model. A good scientific theory is a concise and predictive explanation. Therefore, to quantitatively evaluate model discovery, we ask a scientific agent to explain their model and then assess whether this explanation enables another scientific agent to make reliable predictions about this environment. In addition to this explanation-based evaluation, we compute standard model evaluation metrics such as prediction errors. We find that current LLMs, such as GPT-4o, struggle with both experimental design and model discovery. We find that augmenting the LLM-based agent with an explicit statistical model does not reliably improve these results.", 'score': 4, 'issue_id': 1510, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '0f853b1681ef29b5', 'authors': ['Kanishk Gandhi', 'Michael Y. Li', 'Lyle Goodyear', 'Louise Li', 'Aditi Bhaskar', 'Mohammed Zaman', 'Noah D. Goodman'], 'affiliations': ['Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01540.jpg', 'data': {'categories': ['#benchmark', '#data', '#science', '#agents'], 'emoji': '🧪', 'ru': {'title': 'BoxingGym: новый вызов для ИИ в научном моделировании', 'desc': 'Статья представляет новый бенчмарк BoxingGym для оценки способности языковых моделей (LLM) к научному открытию. Бенчмарк включает 10 сред, моделирующих различные научные области, и позволяет тестировать планирование экспериментов и построение теорий. Для оценки качества экспериментов используется ожидаемый прирост информации (EIG), а для оценки теорий - их способность объяснять и предсказывать. Результаты показывают, что современные LLM, включая GPT-4, пока слабо справляются с этими задачами.'}, 'en': {'title': 'BoxingGym: Evaluating LLMs in Scientific Discovery', 'desc': 'This paper introduces BoxingGym, a benchmark designed to evaluate the capabilities of large language models (LLMs) in scientific discovery tasks. It focuses on two main aspects: experimental design, which involves collecting data to test scientific theories, and model discovery, which includes proposing and revising these theories. The benchmark consists of 10 environments modeled as generative probabilistic models from various scientific fields, allowing for interactive experimentation. The study finds that current LLMs, like GPT-4o, face challenges in both areas, and adding a statistical model does not consistently enhance their performance.'}, 'zh': {'title': '评估人工智能在科学研究中的能力', 'desc': '这篇论文探讨了人工智能在科学研究中的应用,特别是大型语言模型(LLM)在提出科学理论和设计实验方面的能力。作者提出了一个名为BoxingGym的基准测试,包含10个环境,用于系统评估实验设计和模型发现的能力。通过计算期望信息增益(EIG),论文量化了科学代理收集实验数据的有效性,并评估其提出的模型是否能进行可靠预测。研究发现,当前的LLM在实验设计和模型发现方面表现不佳,且简单地增加统计模型并未显著改善结果。'}}}, {'id': 'https://huggingface.co/papers/2501.00958', 'title': '2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining', 'url': 'https://huggingface.co/papers/2501.00958', 'abstract': 'Compared to image-text pair data, interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. However, such existing datasets are crawled from webpage, facing challenges like low knowledge density, loose image-text relations, and poor logical coherence between images. On the other hand, the internet hosts vast instructional videos (e.g., online geometry courses) that are widely used by humans to learn foundational subjects, yet these valuable resources remain underexplored in VLM training. In this paper, we introduce a high-quality multimodal textbook corpus with richer foundational knowledge for VLM pretraining. It collects over 2.5 years of instructional videos, totaling 22,000 class hours. We first use an LLM-proposed taxonomy to systematically gather instructional videos. Then we progressively extract and refine visual (keyframes), audio (ASR), and textual knowledge (OCR) from the videos, and organize as an image-text interleaved corpus based on temporal order. Compared to its counterparts, our video-centric textbook offers more coherent context, richer knowledge, and better image-text alignment. Experiments demonstrate its superb pretraining performance, particularly in knowledge- and reasoning-intensive tasks like ScienceQA and MathVista. 
Moreover, VLMs pre-trained on our textbook exhibit outstanding interleaved context awareness, leveraging visual and textual cues in their few-shot context for task solving~Our code are available at \\url{https://github.com/DAMO-NLP-SG/multimodal_textbook}.', 'score': 68, 'issue_id': 1475, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'b10f0cd62f6334fc', 'authors': ['Wenqi Zhang', 'Hang Zhang', 'Xin Li', 'Jiashuo Sun', 'Yongliang Shen', 'Weiming Lu', 'Deli Zhao', 'Yueting Zhuang', 'Lidong Bing'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University', 'DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.00958.jpg', 'data': {'categories': ['#science', '#dataset', '#reasoning', '#multimodal', '#cv', '#video'], 'emoji': '📚', 'ru': {'title': 'Мультимодальный учебник: новый стандарт для обучения VLM', 'desc': 'Эта статья представляет новый подход к обучению моделей компьютерного зрения и обработки естественного языка (VLM) с использованием мультимодального учебного корпуса. Авторы создали базу данных из 22 000 часов обучающих видео, систематически собранных с помощью таксономии, предложенной языковой моделью (LLM). Этот корпус отличается более высокой плотностью знаний, лучшей связью между изображениями и текстом, а также логической согласованностью по сравнению с существующими наборами данных. Эксперименты показывают превосходную производительность предобучения на этом корпусе, особенно в задачах, требующих глубоких знаний и рассуждений.'}, 'en': {'title': 'Harnessing Instructional Videos for Superior Vision-Language Model Training', 'desc': 'This paper presents a new approach to training Vision-Language Models (VLMs) using a multimodal textbook corpus derived from instructional videos. Unlike traditional datasets that often suffer from low knowledge density and weak image-text relationships, this corpus offers a richer and more coherent context for VLM pretraining. The authors systematically extract visual, audio, and textual information from over 22,000 hours of instructional content, enhancing the alignment between images and text. Experiments show that VLMs trained on this video-centric dataset perform significantly better on knowledge-intensive tasks, demonstrating improved reasoning and context awareness.'}, 'zh': {'title': '视频教材:提升视觉语言模型的知识与推理能力', 'desc': '本文提出了一种高质量的多模态教材语料库,旨在为视觉语言模型(VLM)提供更丰富的基础知识。该语料库收集了超过2.5年的教学视频,总计22,000小时,系统性地提取了视频中的视觉、音频和文本知识。与现有的数据集相比,这种视频中心的教材提供了更连贯的上下文、更丰富的知识和更好的图像-文本对齐。实验结果表明,基于该教材预训练的VLM在知识和推理密集型任务中表现优异,尤其在ScienceQA和MathVista等任务中。'}}}, {'id': 'https://huggingface.co/papers/2501.01427', 'title': 'VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control', 'url': 'https://huggingface.co/papers/2501.01427', 'abstract': 'Despite significant advancements in video generation, inserting a given object into videos remains a challenging task. The difficulty lies in preserving the appearance details of the reference object and accurately modeling coherent motions at the same time. In this paper, we propose VideoAnydoor, a zero-shot video object insertion framework with high-fidelity detail preservation and precise motion control. Starting from a text-to-video model, we utilize an ID extractor to inject the global identity and leverage a box sequence to control the overall motion. To preserve the detailed appearance and meanwhile support fine-grained motion control, we design a pixel warper. 
It takes the reference image with arbitrary key-points and the corresponding key-point trajectories as inputs. It warps the pixel details according to the trajectories and fuses the warped features with the diffusion U-Net, thus improving detail preservation and supporting users in manipulating the motion trajectories. In addition, we propose a training strategy involving both videos and static images with a reweight reconstruction loss to enhance insertion quality. VideoAnydoor demonstrates significant superiority over existing methods and naturally supports various downstream applications (e.g., talking head generation, video virtual try-on, multi-region editing) without task-specific fine-tuning.', 'score': 39, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '4c67f688775a3eca', 'authors': ['Yuanpeng Tu', 'Hao Luo', 'Xi Chen', 'Sihui Ji', 'Xiang Bai', 'Hengshuang Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group', 'HUST', 'Hupan Lab', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.01427.jpg', 'data': {'categories': ['#diffusion', '#games', '#video'], 'emoji': '🎬', 'ru': {'title': 'Точная вставка объектов в видео с сохранением деталей', 'desc': 'В этой статье представлен VideoAnydoor - фреймворк для вставки объектов в видео без предварительного обучения. Он использует экстрактор идентификаторов и последовательность ограничивающих рамок для контроля движения объекта. Ключевым компонентом является пиксельный варпер, который сохраняет детали внешнего вида и позволяет точно управлять движением. Предложенная стратегия обучения с использованием видео и статических изображений улучшает качество вставки объектов.'}, 'en': {'title': 'Seamless Object Insertion in Videos with VideoAnydoor', 'desc': 'This paper introduces VideoAnydoor, a novel framework for zero-shot video object insertion that excels in maintaining high-fidelity details and precise motion control. The approach begins with a text-to-video model and incorporates an ID extractor to ensure consistent object identity while using a box sequence for motion management. A key innovation is the pixel warper, which adjusts pixel details based on key-point trajectories, enhancing both detail preservation and user control over motion. The proposed training strategy, which combines videos and static images with a reweighted reconstruction loss, significantly improves the quality of object insertion, making VideoAnydoor versatile for various applications without needing specific fine-tuning.'}, 'zh': {'title': '高保真视频对象插入的新突破', 'desc': '尽管视频生成技术取得了显著进展,但将特定对象插入视频仍然是一项具有挑战性的任务。本文提出了VideoAnydoor,这是一个零-shot视频对象插入框架,能够高保真地保留细节并精确控制运动。我们设计了一种像素变形器,能够根据关键点轨迹扭曲像素细节,并与扩散U-Net融合,从而提高细节保留能力。VideoAnydoor在现有方法中表现出显著优势,并支持多种下游应用,无需特定任务的微调。'}}}, {'id': 'https://huggingface.co/papers/2501.01257', 'title': 'CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings', 'url': 'https://huggingface.co/papers/2501.01257', 'abstract': 'With the increasing code reasoning capabilities of existing large language models (LLMs) and breakthroughs in reasoning models like OpenAI o1 and o3, there is a growing need to develop more challenging and comprehensive benchmarks that effectively test their sophisticated competition-level coding abilities. Existing benchmarks, like LiveCodeBench and USACO, fall short due to the unavailability of private test cases, lack of support for special judges, and misaligned execution environments. 
To bridge this gap, we introduce CodeElo, a standardized competition-level code generation benchmark that effectively addresses all these challenges for the first time. CodeElo benchmark is mainly based on the official CodeForces platform and tries to align with the platform as much as possible. We compile the recent six months of contest problems on CodeForces with detailed information such as contest divisions, problem difficulty ratings, and problem algorithm tags. We introduce a unique judging method in which problems are submitted directly to the platform and develop a reliable Elo rating calculation system that aligns with the platform and is comparable with human participants but has lower variance. By testing on our CodeElo, we provide the Elo ratings of 30 existing popular open-source and 3 proprietary LLMs for the first time. The results show that o1-mini and QwQ-32B-Preview stand out significantly, achieving Elo ratings of 1578 and 1261, respectively, while other models struggle even with the easiest problems, placing in the lowest 20 percent among all human participants. Detailed analysis experiments are also conducted to provide insights into performance across algorithms and comparisons between using C++ and Python, which can suggest directions for future studies.', 'score': 36, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'e31430bb6ba5dfc8', 'authors': ['Shanghaoran Quan', 'Jiaxi Yang', 'Bowen Yu', 'Bo Zheng', 'Dayiheng Liu', 'An Yang', 'Xuancheng Ren', 'Bofei Gao', 'Yibo Miao', 'Yunlong Feng', 'Zekun Wang', 'Jian Yang', 'Zeyu Cui', 'Yang Fan', 'Yichang Zhang', 'Binyuan Hui', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.01257.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#reasoning', '#optimization', '#open_source'], 'emoji': '🏆', 'ru': {'title': 'CodeElo: новый стандарт оценки LLM в соревновательном программировании', 'desc': 'Статья представляет новый бенчмарк CodeElo для оценки способностей больших языковых моделей (LLM) в решении задач по программированию соревновательного уровня. CodeElo основан на платформе CodeForces и включает проблемы с детальной информацией о сложности и алгоритмических тегах. Авторы разработали систему расчета рейтинга Эло, сопоставимую с рейтингами человеческих участников. Результаты тестирования 33 LLM показали, что модели o1-mini и QwQ-32B-Preview значительно превосходят остальные, достигая рейтингов 1578 и 1261 соответственно.'}, 'en': {'title': 'CodeElo: Elevating Code Generation Benchmarks for LLMs', 'desc': 'This paper presents CodeElo, a new benchmark designed to evaluate the coding abilities of large language models (LLMs) in a competitive setting. Unlike existing benchmarks, CodeElo addresses limitations such as the lack of private test cases and misaligned execution environments by utilizing the CodeForces platform. The benchmark includes a unique judging method and an Elo rating system that allows for fair comparisons between LLMs and human participants. 
Results indicate that certain models, like o1-mini, perform significantly better than others, highlighting the varying capabilities of LLMs in code generation tasks.'}, 'zh': {'title': 'CodeElo:提升代码生成能力的标准化基准测试', 'desc': '随着大型语言模型(LLMs)在代码推理能力上的提升,开发更具挑战性和全面性的基准测试变得愈发重要。现有的基准测试如LiveCodeBench和USACO存在一些不足,例如缺乏私有测试用例和特殊评判支持。为了解决这些问题,我们提出了CodeElo,这是一个标准化的竞赛级代码生成基准,首次有效应对这些挑战。通过在CodeForces平台上编译最近六个月的竞赛问题,我们为30个流行的开源和3个专有LLMs提供了Elo评分,结果显示o1-mini和QwQ-32B-Preview表现突出。'}}}, {'id': 'https://huggingface.co/papers/2501.00599', 'title': 'VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM', 'url': 'https://huggingface.co/papers/2501.00599', 'abstract': 'Video Large Language Models (Video LLMs) have recently exhibited remarkable capabilities in general video understanding. However, they mainly focus on holistic comprehension and struggle with capturing fine-grained spatial and temporal details. Besides, the lack of high-quality object-level video instruction data and a comprehensive benchmark further hinders their advancements. To tackle these challenges, we introduce the VideoRefer Suite to empower Video LLM for finer-level spatial-temporal video understanding, i.e., enabling perception and reasoning on any objects throughout the video. Specially, we thoroughly develop VideoRefer Suite across three essential aspects: dataset, model, and benchmark. Firstly, we introduce a multi-agent data engine to meticulously curate a large-scale, high-quality object-level video instruction dataset, termed VideoRefer-700K. Next, we present the VideoRefer model, which equips a versatile spatial-temporal object encoder to capture precise regional and sequential representations. Finally, we meticulously create a VideoRefer-Bench to comprehensively assess the spatial-temporal understanding capability of a Video LLM, evaluating it across various aspects. Extensive experiments and analyses demonstrate that our VideoRefer model not only achieves promising performance on video referring benchmarks but also facilitates general video understanding capabilities.', 'score': 31, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'daee687ce36ef3db', 'authors': ['Yuqian Yuan', 'Hang Zhang', 'Wentong Li', 'Zesen Cheng', 'Boqiang Zhang', 'Long Li', 'Xin Li', 'Deli Zhao', 'Wenqiao Zhang', 'Yueting Zhuang', 'Jianke Zhu', 'Lidong Bing'], 'affiliations': ['DAMO Academy, Alibaba Group', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00599.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#dataset', '#optimization', '#video'], 'emoji': '🎥', 'ru': {'title': 'Точное пространственно-временное понимание видео с помощью VideoRefer Suite', 'desc': 'Статья представляет VideoRefer Suite - комплексный подход к улучшению пространственно-временного понимания видео большими языковыми моделями. Авторы разработали масштабный набор данных VideoRefer-700K с инструкциями на уровне объектов, созданный с помощью мультиагентного движка. Они также представили модель VideoRefer с универсальным пространственно-временным кодировщиком объектов. 
Для оценки возможностей видео-LLM был создан бенчмарк VideoRefer-Bench, охватывающий различные аспекты понимания видео.'}, 'en': {'title': 'Empowering Video LLMs for Fine-Grained Understanding', 'desc': 'This paper introduces the VideoRefer Suite, which enhances Video Large Language Models (Video LLMs) for better understanding of videos by focusing on fine-grained spatial and temporal details. It addresses the limitations of existing models that primarily focus on overall comprehension and lack high-quality object-level instruction data. The suite includes a new dataset called VideoRefer-700K, a specialized VideoRefer model with a spatial-temporal object encoder, and a benchmark for evaluating video understanding capabilities. Experimental results show that the VideoRefer model significantly improves performance on video referring tasks while also enhancing general video comprehension.'}, 'zh': {'title': '提升视频理解,细致捕捉空间与时间', 'desc': '视频大型语言模型(Video LLMs)在视频理解方面展现了出色的能力,但在捕捉细粒度的空间和时间细节上存在困难。为了应对这些挑战,我们提出了VideoRefer Suite,以增强视频LLM在空间-时间视频理解方面的能力。我们开发了一个多代理数据引擎,创建了一个高质量的对象级视频指令数据集VideoRefer-700K,并提出了VideoRefer模型,配备了多功能的空间-时间对象编码器。最后,我们创建了VideoRefer-Bench,以全面评估视频LLM的空间-时间理解能力,实验结果表明我们的模型在视频引用基准上表现优异。'}}}, {'id': 'https://huggingface.co/papers/2501.01423', 'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models', 'url': 'https://huggingface.co/papers/2501.01423', 'abstract': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. However, recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality, it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently, existing systems often settle for sub-optimal solutions, either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this, we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models, enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE, we build an enhanced DiT baseline with improved training strategies and architecture designs, termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. 
Models and codes are available at: https://github.com/hustvl/LightningDiT.', 'score': 30, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '173fa21b6e47d04c', 'authors': ['Jingfeng Yao', 'Xinggang Wang'], 'affiliations': ['Huazhong University of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.01423.jpg', 'data': {'categories': ['#training', '#optimization', '#cv', '#architecture', '#diffusion'], 'emoji': '⚡', 'ru': {'title': 'Революция в латентных диффузионных моделях: быстрее, лучше, эффективнее', 'desc': 'Статья представляет новый подход к улучшению латентных диффузионных моделей с архитектурой Трансформер для генерации изображений высокого качества. Авторы предлагают метод VA-VAE, который выравнивает латентное пространство с предобученными моделями компьютерного зрения. Это позволяет значительно расширить границы реконструкции-генерации и ускорить сходимость Диффузионных Трансформеров в высокоразмерных латентных пространствах. На основе VA-VAE авторы создали улучшенную модель LightningDiT, достигающую современного уровня производительности на задаче генерации изображений ImageNet 256x256.'}, 'en': {'title': 'Accelerating Image Generation with Aligned Latent Spaces', 'desc': 'This paper discusses the challenges faced by latent diffusion models, particularly when using Transformer architectures for image generation. It highlights an optimization issue where increasing the feature dimensions in visual tokenizers can lead to larger models and longer training times, often resulting in sub-optimal image quality. The authors propose a solution by aligning the latent space with pre-trained vision models, introducing a new framework called VA-VAE to enhance the training process. Their improved model, LightningDiT, achieves state-of-the-art performance in image generation while significantly speeding up the training process.'}, 'zh': {'title': '优化潜在扩散模型,提升图像生成效率', 'desc': '本论文探讨了潜在扩散模型与变换器架构在生成高质量图像时的优化困境。研究表明,虽然增加视觉标记器中的每个标记特征维度可以提高重建质量,但这也导致需要更大的扩散模型和更多的训练迭代。为了解决这一问题,作者提出将潜在空间与预训练的视觉基础模型对齐,从而提高训练效率。最终,提出的VA-VAE模型显著提升了潜在扩散模型的重建生成能力,并在ImageNet数据集上实现了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00103', 'title': 'LTX-Video: Realtime Video Latent Diffusion', 'url': 'https://huggingface.co/papers/2501.00103', 'abstract': "We introduce LTX-Video, a transformer-based latent diffusion model that adopts a holistic approach to video generation by seamlessly integrating the responsibilities of the Video-VAE and the denoising transformer. Unlike existing methods, which treat these components as independent, LTX-Video aims to optimize their interaction for improved efficiency and quality. At its core is a carefully designed Video-VAE that achieves a high compression ratio of 1:192, with spatiotemporal downscaling of 32 x 32 x 8 pixels per token, enabled by relocating the patchifying operation from the transformer's input to the VAE's input. Operating in this highly compressed latent space enables the transformer to efficiently perform full spatiotemporal self-attention, which is essential for generating high-resolution videos with temporal consistency. However, the high compression inherently limits the representation of fine details. To address this, our VAE decoder is tasked with both latent-to-pixel conversion and the final denoising step, producing the clean result directly in pixel space. 
This approach preserves the ability to generate fine details without incurring the runtime cost of a separate upsampling module. Our model supports diverse use cases, including text-to-video and image-to-video generation, with both capabilities trained simultaneously. It achieves faster-than-real-time generation, producing 5 seconds of 24 fps video at 768x512 resolution in just 2 seconds on an Nvidia H100 GPU, outperforming all existing models of similar scale. The source code and pre-trained models are publicly available, setting a new benchmark for accessible and scalable video generation.", 'score': 29, 'issue_id': 1484, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'a2358f7cf156ff08', 'authors': ['Yoav HaCohen', 'Nisan Chiprut', 'Benny Brazowski', 'Daniel Shalem', 'Dudu Moshe', 'Eitan Richardson', 'Eran Levin', 'Guy Shiran', 'Nir Zabari', 'Ori Gordon', 'Poriya Panet', 'Sapir Weissbuch', 'Victor Kulikov', 'Yaki Bitterman', 'Zeev Melumian', 'Ofir Bibi'], 'affiliations': ['Lightricks'], 'pdf_title_img': 'assets/pdf/title_img/2501.00103.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#video', '#diffusion'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: быстрее реального времени', 'desc': 'LTX-Video - это трансформерная модель латентной диффузии для генерации видео. Она объединяет функции Video-VAE и шумоподавляющего трансформера, оптимизируя их взаимодействие. Модель использует сильно сжатое латентное пространство, позволяя трансформеру эффективно выполнять полное пространственно-временное самовнимание. LTX-Video поддерживает генерацию видео из текста и изображений, превосходя существующие модели по скорости и качеству.'}, 'en': {'title': 'Revolutionizing Video Generation with LTX-Video', 'desc': "LTX-Video is a novel transformer-based latent diffusion model designed for efficient video generation by integrating the roles of Video-VAE and denoising transformers. It achieves a high compression ratio of 1:192, allowing the model to operate in a compressed latent space while maintaining spatiotemporal self-attention for generating high-resolution videos. The model's VAE decoder performs both latent-to-pixel conversion and denoising, enabling the generation of fine details without the need for a separate upsampling module. With capabilities for text-to-video and image-to-video generation, LTX-Video produces videos faster than real-time, setting a new standard in the field."}, 'zh': {'title': 'LTX-Video:高效视频生成的新标准', 'desc': 'LTX-Video是一种基于变换器的潜在扩散模型,旨在通过整合视频生成中的Video-VAE和去噪变换器的功能来提高效率和质量。该模型的核心是一个高压缩比的Video-VAE,能够在压缩的潜在空间中高效执行时空自注意力,从而生成高分辨率且具有时间一致性的视频。为了克服高压缩带来的细节损失,VAE解码器同时负责潜在到像素的转换和最终的去噪步骤,直接在像素空间中生成清晰的结果。LTX-Video支持多种应用场景,包括文本到视频和图像到视频的生成,并且在Nvidia H100 GPU上以超实时速度生成视频,设立了视频生成的新基准。'}}}, {'id': 'https://huggingface.co/papers/2501.01264', 'title': 'ProgCo: Program Helps Self-Correction of Large Language Models', 'url': 'https://huggingface.co/papers/2501.01264', 'abstract': 'Self-Correction aims to enable large language models (LLMs) to self-verify and self-refine their initial responses without external feedback. However, LLMs often fail to effectively self-verify and generate correct feedback, further misleading refinement and leading to the failure of self-correction, especially in complex reasoning tasks. In this paper, we propose Program-driven Self-Correction (ProgCo). 
First, program-driven verification (ProgVe) achieves complex verification logic and extensive validation through self-generated, self-executing verification pseudo-programs. Then, program-driven refinement (ProgRe) receives feedback from ProgVe, conducts dual reflection and refinement on both responses and verification programs to mitigate misleading of incorrect feedback in complex reasoning tasks. Experiments on three instruction-following and mathematical benchmarks indicate that ProgCo achieves effective self-correction, and can be further enhance performance when combined with real program tools.', 'score': 22, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'bda3f96e83319526', 'authors': ['Xiaoshuai Song', 'Yanan Wu', 'Weixun Wang', 'Jiaheng Liu', 'Wenbo Su', 'Bo Zheng'], 'affiliations': ['Taobao & Tmall Group of Alibaba'], 'pdf_title_img': 'assets/pdf/title_img/2501.01264.jpg', 'data': {'categories': ['#training', '#math', '#reasoning', '#interpretability', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'ProgCo: Самокоррекция языковых моделей через программно-управляемую верификацию и уточнение', 'desc': 'Эта статья представляет новый подход к самокоррекции больших языковых моделей (LLM) под названием Program-driven Self-Correction (ProgCo). Метод включает в себя программно-управляемую верификацию (ProgVe), которая использует самогенерируемые и самовыполняющиеся псевдопрограммы для сложной логики проверки. Затем программно-управляемое уточнение (ProgRe) проводит двойную рефлексию и улучшение как ответов, так и программ верификации. Эксперименты показали, что ProgCo эффективен в самокоррекции и может дополнительно улучшить производительность при комбинировании с реальными программными инструментами.'}, 'en': {'title': 'Empowering LLMs with Program-Driven Self-Correction', 'desc': 'This paper introduces Program-driven Self-Correction (ProgCo) to improve the self-verification and self-refinement capabilities of large language models (LLMs). It addresses the common issue where LLMs struggle to provide accurate feedback, which can lead to incorrect refinements, particularly in complex reasoning tasks. ProgCo utilizes program-driven verification (ProgVe) to create self-executing verification pseudo-programs that enhance the verification process. Additionally, program-driven refinement (ProgRe) allows the model to reflect on and refine both its responses and the verification programs, leading to more reliable self-correction outcomes.'}, 'zh': {'title': '基于程序的自我纠正:提升语言模型的自我验证能力', 'desc': '自我纠正旨在使大型语言模型(LLMs)能够在没有外部反馈的情况下自我验证和自我完善其初始响应。然而,LLMs往往无法有效自我验证并生成正确的反馈,这会进一步误导其完善过程,尤其是在复杂推理任务中。本文提出了基于程序的自我纠正(ProgCo),通过自生成、自执行的验证伪程序实现复杂的验证逻辑和广泛的验证。实验结果表明,ProgCo在三个指令遵循和数学基准测试中实现了有效的自我纠正,并且与真实程序工具结合时可以进一步提升性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00316', 'title': 'MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models', 'url': 'https://huggingface.co/papers/2501.00316', 'abstract': "Recent advancements in foundation models have enhanced AI systems' capabilities in autonomous tool usage and reasoning. However, their ability in location or map-based reasoning - which improves daily life by optimizing navigation, facilitating resource discovery, and streamlining logistics - has not been systematically studied. To bridge this gap, we introduce MapEval, a benchmark designed to assess diverse and complex map-based user queries with geo-spatial reasoning. 
MapEval features three task types (textual, API-based, and visual) that require collecting world information via map tools, processing heterogeneous geo-spatial contexts (e.g., named entities, travel distances, user reviews or ratings, images), and compositional reasoning, which all state-of-the-art foundation models find challenging. Comprising 700 unique multiple-choice questions about locations across 180 cities and 54 countries, MapEval evaluates foundation models' ability to handle spatial relationships, map infographics, travel planning, and navigation challenges. Using MapEval, we conducted a comprehensive evaluation of 28 prominent foundation models. While no single model excelled across all tasks, Claude-3.5-Sonnet, GPT-4o, and Gemini-1.5-Pro achieved competitive performance overall. However, substantial performance gaps emerged, particularly in MapEval, where agents with Claude-3.5-Sonnet outperformed GPT-4o and Gemini-1.5-Pro by 16% and 21%, respectively, and the gaps became even more amplified when compared to open-source LLMs. Our detailed analyses provide insights into the strengths and weaknesses of current models, though all models still fall short of human performance by more than 20% on average, struggling with complex map images and rigorous geo-spatial reasoning. This gap highlights MapEval's critical role in advancing general-purpose foundation models with stronger geo-spatial understanding.", 'score': 20, 'issue_id': 1477, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'a4e45c6bd9d30ff4', 'authors': ['Mahir Labib Dihan', 'Md Tanvir Hassan', 'Md Tanvir Parvez', 'Md Hasebul Hasan', 'Md Almash Alam', 'Muhammad Aamir Cheema', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Bangladesh Computer Council (BCC)', 'Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Monash University', 'Qatar Computing Research Institute (QCRI)', 'Statistics, Islamic University Bangladesh'], 'pdf_title_img': 'assets/pdf/title_img/2501.00316.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#multimodal', '#survey'], 'emoji': '🗺️', 'ru': {'title': 'MapEval: Новый рубеж в геопространственном ИИ', 'desc': 'Статья представляет MapEval - новый бенчмарк для оценки способностей моделей искусственного интеллекта в области пространственных рассуждений и работы с картами. MapEval включает 700 вопросов с множественным выбором, охватывающих 180 городов и 54 страны, и оценивает навыки моделей в понимании пространственных отношений, инфографики карт, планирования путешествий и навигации. Авторы провели оценку 28 ведущих фундаментальных моделей, выявив значительные различия в производительности, при этом все модели все еще отстают от человеческого уровня более чем на 20%. Результаты исследования подчеркивают важность MapEval для развития моделей с более сильным геопространственным пониманием.'}, 'en': {'title': "Enhancing AI's Geo-Spatial Reasoning with MapEval", 'desc': 'This paper introduces MapEval, a benchmark designed to evaluate the performance of foundation models in map-based reasoning tasks. It focuses on assessing how well these models can handle complex geo-spatial queries, which are essential for navigation and resource discovery. The benchmark includes various task types that require models to process diverse information, such as travel distances and user reviews, and perform compositional reasoning. 
The evaluation reveals that while some models perform competitively, they still lag behind human capabilities, indicating a need for further advancements in geo-spatial understanding within AI systems.'}, 'zh': {'title': '提升地图推理能力的基准评估', 'desc': '最近基础模型的进展提升了人工智能系统在自主工具使用和推理方面的能力。然而,它们在基于位置或地图的推理能力上尚未得到系统研究,这对于优化导航、资源发现和物流管理至关重要。为了解决这个问题,我们引入了MapEval,一个旨在评估复杂地图用户查询的基准,涉及地理空间推理。MapEval包含700个关于180个城市和54个国家的独特多项选择题,评估基础模型在处理空间关系、地图信息、旅行规划和导航挑战方面的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.01149', 'title': 'A3: Android Agent Arena for Mobile GUI Agents', 'url': 'https://huggingface.co/papers/2501.01149', 'abstract': 'AI agents have become increasingly prevalent in recent years, driven by significant advancements in the field of large language models (LLMs). Mobile GUI agents, a subset of AI agents, are designed to autonomously perform tasks on mobile devices. While numerous studies have introduced agents, datasets, and benchmarks to advance mobile GUI agent research, many existing datasets focus on static frame evaluations and fail to provide a comprehensive platform for assessing performance on real-world, in-the-wild tasks. To address this gap, we present Android Agent Arena (A3), a novel evaluation platform. Unlike existing in-the-wild systems, A3 offers: (1) meaningful and practical tasks, such as real-time online information retrieval and operational instructions; (2) a larger, more flexible action space, enabling compatibility with agents trained on any dataset; and (3) automated business-level LLM-based evaluation process. A3 includes 21 widely used general third-party apps and 201 tasks representative of common user scenarios, providing a robust foundation for evaluating mobile GUI agents in real-world situations and a new autonomous evaluation process for less human labor and coding expertise. The project is available at https://yuxiangchai.github.io/Android-Agent-Arena/.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '050f155aa526c100', 'authors': ['Yuxiang Chai', 'Hanhao Li', 'Jiayu Zhang', 'Liang Liu', 'Guozhi Wang', 'Shuai Ren', 'Siyuan Huang', 'Hongsheng Li'], 'affiliations': ['EE department @ CUHK', 'MMLab @ CUHK'], 'pdf_title_img': 'assets/pdf/title_img/2501.01149.jpg', 'data': {'categories': ['#benchmark', '#dataset', '#agents'], 'emoji': '🤖', 'ru': {'title': 'A3: Арена для тестирования мобильных AI-агентов в реальном мире', 'desc': 'Статья представляет новую платформу для оценки мобильных GUI-агентов под названием Android Agent Arena (A3). A3 предлагает реалистичные задачи, широкое пространство действий и автоматизированную оценку на основе больших языковых моделей. Платформа включает 21 популярное стороннее приложение и 201 задачу, отражающую типичные пользовательские сценарии. A3 позволяет оценивать производительность агентов в реальных условиях, что отличает её от существующих статических наборов данных.'}, 'en': {'title': 'Revolutionizing Mobile GUI Agent Evaluation with A3', 'desc': 'This paper introduces the Android Agent Arena (A3), a new evaluation platform for mobile GUI agents that addresses limitations in existing datasets. A3 focuses on real-world tasks, providing a larger action space that accommodates agents trained on various datasets. It features 21 popular third-party apps and 201 tasks that reflect common user scenarios, enhancing the assessment of agent performance. 
Additionally, A3 incorporates an automated evaluation process using large language models, reducing the need for extensive human involvement and coding skills.'}, 'zh': {'title': 'Android Agent Arena:移动GUI代理的新评估平台', 'desc': '近年来,人工智能代理的应用越来越广泛,尤其是在大型语言模型(LLMs)领域的进步推动下。移动图形用户界面(GUI)代理是人工智能代理的一种,旨在自主执行移动设备上的任务。现有的研究虽然提出了许多代理、数据集和基准,但大多数数据集仅关注静态框架评估,无法全面评估真实世界中的任务表现。为了解决这一问题,我们提出了Android Agent Arena(A3),这是一个新颖的评估平台,提供了实际的任务和更灵活的操作空间,支持基于LLM的自动化评估过程。'}}}, {'id': 'https://huggingface.co/papers/2501.00192', 'title': 'MLLM-as-a-Judge for Image Safety without Human Labeling', 'url': 'https://huggingface.co/papers/2501.00192', 'abstract': 'Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which, however, brings a series of drawbacks. First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose an MLLM-based method that includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experimental results demonstrate that our method is highly effective for zero-shot image safety judgment tasks.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '2a62bcbb87c1b7a5', 'authors': ['Zhenting Wang', 'Shuming Hu', 'Shiyu Zhao', 'Xiaowen Lin', 'Felix Juefei-Xu', 'Zhuowei Li', 'Ligong Han', 'Harihar Subramanyam', 'Li Chen', 'Jianfa Chen', 'Nan Jiang', 'Lingjuan Lyu', 'Shiqing Ma', 'Dimitris N. Metaxas', 'Ankit Jain'], 'affiliations': ['GenAI @ Meta', 'Rutgers University', 'UMass Amherst', 'Westlake University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00192.jpg', 'data': {'categories': ['#reasoning', '#training', '#ethics', '#cv', '#multimodal'], 'emoji': '🛡️', 'ru': {'title': 'Интеллектуальная защита: Zero-shot оценка безопасности изображений с помощью MLLM', 'desc': 'Статья представляет метод определения безопасности изображений с использованием мультимодальных больших языковых моделей (MLLM) в режиме zero-shot. 
Авторы предлагают подход, включающий объективизацию правил безопасности, оценку релевантности между правилами и изображениями, и быстрое принятие решений на основе дебиасированных вероятностей токенов. Метод также включает каскадные цепочки рассуждений для более глубокого анализа при необходимости. Эксперименты показывают высокую эффективность предложенного метода для задач оценки безопасности изображений без предварительного обучения.'}, 'en': {'title': 'Zero-Shot Image Safety Detection with MLLMs', 'desc': 'This paper addresses the challenge of identifying unsafe images in the context of AI-generated content using Multimodal Large Language Models (MLLMs). The authors propose a novel approach that allows for zero-shot detection of harmful images by utilizing predefined safety rules without the need for extensive human labeling. They highlight the limitations of traditional methods, such as the subjectivity of safety rules and the biases present in models. The proposed method enhances safety judgment by objectifying rules, assessing their relevance to images, and employing a reasoning process that simplifies complex safety guidelines.'}, 'zh': {'title': '利用MLLMs实现零样本图像安全判断', 'desc': '随着在线平台视觉媒体的兴起,图像内容安全成为一个重要挑战。许多图像生成模型能够产生有害内容,因此识别不安全图像变得至关重要。我们提出了一种基于预训练多模态大语言模型(MLLMs)的方法,通过查询这些模型来检测不安全图像,而无需依赖人工标注。实验结果表明,我们的方法在零样本图像安全判断任务中非常有效。'}}}, {'id': 'https://huggingface.co/papers/2501.01426', 'title': 'Unifying Specialized Visual Encoders for Video Language Models', 'url': 'https://huggingface.co/papers/2501.01426', 'abstract': 'The recent advent of Large Language Models (LLMs) has ushered sophisticated reasoning capabilities into the realm of video through Video Large Language Models (VideoLLMs). However, VideoLLMs currently rely on a single vision encoder for all of their visual processing, which limits the amount and type of visual information that can be conveyed to the LLM. Our method, MERV, Multi-Encoder Representation of Videos, instead leverages multiple frozen visual encoders to create a unified representation of a video, providing the VideoLLM with a comprehensive set of specialized visual knowledge. Spatio-temporally aligning the features from each encoder allows us to tackle a wider range of open-ended and multiple-choice video understanding questions and outperform prior state-of-the-art works. MERV is up to 3.7% better in accuracy than Video-LLaVA across the standard suite video understanding benchmarks, while also having a better Video-ChatGPT score. We also improve upon SeViLA, the previous best on zero-shot Perception Test accuracy, by 2.2%. MERV introduces minimal extra parameters and trains faster than equivalent single-encoder methods while parallelizing the visual processing. Finally, we provide qualitative evidence that MERV successfully captures domain knowledge from each of its encoders. 
Our results offer promising directions in utilizing multiple vision encoders for comprehensive video understanding.', 'score': 19, 'issue_id': 1488, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'c868a7ebcbafa704', 'authors': ['Jihoon Chung', 'Tyler Zhu', 'Max Gonzalez Saez-Diez', 'Juan Carlos Niebles', 'Honglu Zhou', 'Olga Russakovsky'], 'affiliations': ['Princeton University', 'Salesforce Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.01426.jpg', 'data': {'categories': ['#architecture', '#reasoning', '#video', '#benchmark', '#multimodal', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'MERV: Многоэнкодерное представление видео для улучшенного машинного понимания', 'desc': 'Статья представляет MERV - новый метод для улучшения понимания видео с помощью больших языковых моделей. MERV использует несколько замороженных визуальных энкодеров для создания единого представления видео, что позволяет охватить больший объем визуальной информации. Этот подход превосходит предыдущие методы в точности на стандартных тестах понимания видео. MERV вводит минимальное количество дополнительных параметров и обучается быстрее, чем эквивалентные методы с одним энкодером.'}, 'en': {'title': 'Unlocking Video Understanding with Multi-Encoder Magic!', 'desc': 'This paper introduces MERV, a method that enhances Video Large Language Models (VideoLLMs) by using multiple visual encoders instead of just one. By combining the outputs of these encoders, MERV creates a richer representation of videos, which helps the model understand complex video content better. The approach allows for improved performance on various video understanding tasks, achieving higher accuracy than previous models. Additionally, MERV is efficient, requiring fewer parameters and training time while effectively leveraging the strengths of each encoder.'}, 'zh': {'title': '多编码器提升视频理解能力', 'desc': '本文介绍了一种名为MERV(多编码器视频表示)的方法,旨在提升视频理解的能力。MERV通过使用多个冻结的视觉编码器,创建视频的统一表示,从而为视频大型语言模型(VideoLLM)提供更全面的视觉知识。通过时空对齐每个编码器的特征,MERV能够更好地处理开放式和多选的视频理解问题,且在准确性上超越了之前的最佳模型。该方法不仅提高了性能,还在参数和训练速度上优于单编码器方法,展示了多视觉编码器在视频理解中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01054', 'title': 'Dynamic Scaling of Unit Tests for Code Reward Modeling', 'url': 'https://huggingface.co/papers/2501.01054', 'abstract': 'Current large language models (LLMs) often struggle to produce accurate responses on the first attempt for complex reasoning tasks like code generation. Prior research tackles this challenge by generating multiple candidate solutions and validating them with LLM-generated unit tests. The execution results of unit tests serve as reward signals to identify correct solutions. As LLMs always confidently make mistakes, these unit tests are not reliable, thereby diminishing the quality of reward signals. Motivated by the observation that scaling the number of solutions improves LLM performance, we explore the impact of scaling unit tests to enhance reward signal quality. Our pioneer experiment reveals a positive correlation between the number of unit tests and reward signal quality, with greater benefits observed in more challenging problems. Based on these insights, we propose CodeRM-8B, a lightweight yet effective unit test generator that enables efficient and high-quality unit test scaling. Additionally, we implement a dynamic scaling mechanism that adapts the number of unit tests based on problem difficulty, further improving efficiency. 
Experimental results show that our approach significantly improves performance across various models on three benchmarks (e.g., with gains of 18.43% for Llama3-8B and 3.42% for GPT-4o-mini on HumanEval Plus).', 'score': 15, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '33b9590f2acb0e48', 'authors': ['Zeyao Ma', 'Xiaokang Zhang', 'Jing Zhang', 'Jifan Yu', 'Sijia Luo', 'Jie Tang'], 'affiliations': ['Key Laboratory of Data Engineering and Knowledge Engineering, Beijing, China', 'School of Information, Renmin University of China', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01054.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#training', '#small_models', '#rlhf', '#optimization'], 'emoji': '🧪', 'ru': {'title': 'Масштабирование юнит-тестов для повышения точности LLM в сложных задачах', 'desc': 'Эта статья посвящена улучшению точности больших языковых моделей (LLM) в задачах сложного мышления, таких как генерация кода. Авторы предлагают метод масштабирования юнит-тестов для повышения качества сигналов вознаграждения при оценке решений. Они разработали легковесный генератор юнит-тестов CodeRM-8B и механизм динамического масштабирования, адаптирующийся к сложности задачи. Эксперименты показали значительное улучшение производительности различных моделей на нескольких тестовых наборах.'}, 'en': {'title': 'Enhancing LLM Performance through Scaled Unit Testing', 'desc': 'This paper addresses the limitations of large language models (LLMs) in generating accurate responses for complex tasks like code generation. It highlights the issue of unreliable reward signals from LLM-generated unit tests, which can lead to incorrect solutions. The authors propose a novel approach, CodeRM-8B, which generates a larger number of unit tests to improve the quality of these reward signals. Their experiments demonstrate that scaling unit tests enhances LLM performance, particularly for more challenging problems, leading to significant improvements across various models.'}, 'zh': {'title': '提升单元测试质量,增强模型性能', 'desc': '当前的大型语言模型(LLMs)在复杂推理任务(如代码生成)中,往往难以在第一次尝试时产生准确的响应。以往的研究通过生成多个候选解决方案并使用LLM生成的单元测试进行验证来应对这一挑战。单元测试的执行结果作为奖励信号,用于识别正确的解决方案。然而,由于LLMs常常自信地犯错,这些单元测试的可靠性不足,从而降低了奖励信号的质量。我们提出了CodeRM-8B,一个轻量级且有效的单元测试生成器,能够高效地扩展单元测试,并根据问题的难度动态调整单元测试的数量,从而进一步提高效率。'}}}, {'id': 'https://huggingface.co/papers/2501.01320', 'title': 'SeedVR: Seeding Infinity in Diffusion Transformer Towards Generic Video Restoration', 'url': 'https://huggingface.co/papers/2501.01320', 'abstract': "Video restoration poses non-trivial challenges in maintaining fidelity while recovering temporally consistent details from unknown degradations in the wild. Despite recent advances in diffusion-based restoration, these methods often face limitations in generation capability and sampling efficiency. In this work, we present SeedVR, a diffusion transformer designed to handle real-world video restoration with arbitrary length and resolution. The core design of SeedVR lies in the shifted window attention that facilitates effective restoration on long video sequences. SeedVR further supports variable-sized windows near the boundary of both spatial and temporal dimensions, overcoming the resolution constraints of traditional window attention. 
Equipped with contemporary practices, including causal video autoencoder, mixed image and video training, and progressive training, SeedVR achieves highly-competitive performance on both synthetic and real-world benchmarks, as well as AI-generated videos. Extensive experiments demonstrate SeedVR's superiority over existing methods for generic video restoration.", 'score': 8, 'issue_id': 1479, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'fa277e5baed864a4', 'authors': ['Jianyi Wang', 'Zhijie Lin', 'Meng Wei', 'Yang Zhao', 'Ceyuan Yang', 'Chen Change Loy', 'Lu Jiang'], 'affiliations': ['ByteDance', 'Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01320.jpg', 'data': {'categories': ['#architecture', '#benchmark', '#long_context', '#video', '#training', '#diffusion', '#synthetic'], 'emoji': '🎥', 'ru': {'title': 'SeedVR: Восстановление видео нового поколения с помощью диффузионных трансформеров', 'desc': 'SeedVR - это диффузионный трансформер для восстановления видео в реальных условиях. Он использует сдвинутое оконное внимание для эффективной обработки длинных видеопоследовательностей. SeedVR поддерживает окна переменного размера на границах пространственных и временных измерений, преодолевая ограничения традиционного оконного внимания. Благодаря современным практикам, таким как каузальный видеоавтоэнкодер и прогрессивное обучение, SeedVR достигает высоких результатов на синтетических и реальных тестовых наборах.'}, 'en': {'title': 'SeedVR: Revolutionizing Video Restoration with Diffusion Transformers', 'desc': 'This paper introduces SeedVR, a novel diffusion transformer aimed at improving video restoration by effectively managing long sequences and varying resolutions. It utilizes shifted window attention to enhance the restoration process, allowing for better handling of temporal consistency and fidelity in videos. SeedVR incorporates advanced techniques such as causal video autoencoders and mixed training strategies to boost its performance on both synthetic and real-world datasets. The results show that SeedVR outperforms existing video restoration methods, making it a significant advancement in the field.'}, 'zh': {'title': 'SeedVR:高效的视频修复新方法', 'desc': '视频修复面临着在恢复未知退化的同时保持细节一致性的挑战。尽管基于扩散的修复方法有所进展,但它们在生成能力和采样效率上仍存在局限性。本文提出了SeedVR,这是一种专为处理任意长度和分辨率的真实视频修复而设计的扩散变换器。SeedVR通过移动窗口注意力机制,有效地处理长视频序列,并在空间和时间维度的边界附近支持可变大小的窗口,克服了传统窗口注意力的分辨率限制。'}}}, {'id': 'https://huggingface.co/papers/2412.21015', 'title': 'MapQaTor: A System for Efficient Annotation of Map Query Datasets', 'url': 'https://huggingface.co/papers/2412.21015', 'abstract': 'Mapping and navigation services like Google Maps, Apple Maps, Openstreet Maps, are essential for accessing various location-based data, yet they often struggle to handle natural language geospatial queries. Recent advancements in Large Language Models (LLMs) show promise in question answering (QA), but creating reliable geospatial QA datasets from map services remains challenging. We introduce MapQaTor, a web application that streamlines the creation of reproducible, traceable map-based QA datasets. With its plug-and-play architecture, MapQaTor enables seamless integration with any maps API, allowing users to gather and visualize data from diverse sources with minimal setup. By caching API responses, the platform ensures consistent ground truth, enhancing the reliability of the data even as real-world information evolves. 
MapQaTor centralizes data retrieval, annotation, and visualization within a single platform, offering a unique opportunity to evaluate the current state of LLM-based geospatial reasoning while advancing their capabilities for improved geospatial understanding. Evaluation metrics show that, MapQaTor speeds up the annotation process by at least 30 times compared to manual methods, underscoring its potential for developing geospatial resources, such as complex map reasoning datasets. The website is live at: https://mapqator.github.io/ and a demo video is available at: https://youtu.be/7_aV9Wmhs6Q.', 'score': 8, 'issue_id': 1477, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '0d1081756b5bc4f7', 'authors': ['Mahir Labib Dihan', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Qatar Computing Research Institute (QCRI)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21015.jpg', 'data': {'categories': ['#dataset', '#science', '#reasoning', '#data', '#benchmark'], 'emoji': '🗺️', 'ru': {'title': 'MapQaTor: Революция в создании геопространственных данных для ИИ', 'desc': 'MapQaTor - это веб-приложение, которое упрощает создание воспроизводимых наборов данных для вопросно-ответных систем на основе карт. Оно интегрируется с любым картографическим API и позволяет собирать и визуализировать данные из различных источников. MapQaTor кэширует ответы API, обеспечивая согласованность данных, и централизует процессы сбора, аннотации и визуализации. Приложение ускоряет процесс аннотации в 30 раз по сравнению с ручными методами, что делает его полезным инструментом для развития геопространственных ресурсов и оценки возможностей больших языковых моделей в области геопространственных рассуждений.'}, 'en': {'title': 'Streamlining Geospatial QA with MapQaTor', 'desc': 'This paper presents MapQaTor, a web application designed to facilitate the creation of geospatial question answering (QA) datasets using map services. It leverages recent advancements in Large Language Models (LLMs) to improve the handling of natural language queries related to locations. The platform features a plug-and-play architecture that integrates with various maps APIs, allowing users to efficiently gather, annotate, and visualize geospatial data. By caching API responses, MapQaTor ensures consistent and reliable data, significantly speeding up the annotation process and enhancing the evaluation of LLM-based geospatial reasoning capabilities.'}, 'zh': {'title': 'MapQaTor:提升地图问答数据集创建效率的利器', 'desc': '本文介绍了MapQaTor,一个用于创建地图问答数据集的网络应用程序。它利用大型语言模型的优势,简化了从地图服务生成可重复和可追溯的数据集的过程。MapQaTor支持与任何地图API的无缝集成,并通过缓存API响应来确保数据的一致性。该平台显著提高了数据标注的效率,展示了在地理空间推理方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01407', 'title': 'Nested Attention: Semantic-aware Attention Values for Concept Personalization', 'url': 'https://huggingface.co/papers/2501.01407', 'abstract': "Personalizing text-to-image models to generate images of specific subjects across diverse scenes and styles is a rapidly advancing field. Current approaches often face challenges in maintaining a balance between identity preservation and alignment with the input text prompt. Some methods rely on a single textual token to represent a subject, which limits expressiveness, while others employ richer representations but disrupt the model's prior, diminishing prompt alignment. 
In this work, we introduce Nested Attention, a novel mechanism that injects a rich and expressive image representation into the model's existing cross-attention layers. Our key idea is to generate query-dependent subject values, derived from nested attention layers that learn to select relevant subject features for each region in the generated image. We integrate these nested layers into an encoder-based personalization method, and show that they enable high identity preservation while adhering to input text prompts. Our approach is general and can be trained on various domains. Additionally, its prior preservation allows us to combine multiple personalized subjects from different domains in a single image.", 'score': 7, 'issue_id': 1487, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '537e7bcc16fb17f5', 'authors': ['Or Patashnik', 'Rinon Gal', 'Daniil Ostashev', 'Sergey Tulyakov', 'Kfir Aberman', 'Daniel Cohen-Or'], 'affiliations': ['Snap Research', 'Tel Aviv University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01407.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#cv'], 'emoji': '🎨', 'ru': {'title': 'Nested Attention: новый подход к персонализации генерации изображений', 'desc': "Статья представляет новый метод под названием 'Nested Attention' для персонализации моделей text-to-image. Этот механизм внедряет богатое и выразительное представление изображения в существующие слои кросс-внимания модели. Ключевая идея заключается в генерации зависимых от запроса значений субъекта, полученных из вложенных слоев внимания. Метод позволяет достичь высокого сохранения идентичности при соблюдении входных текстовых подсказок."}, 'en': {'title': 'Nested Attention: Balancing Identity and Text Alignment in Image Generation', 'desc': 'This paper presents a new method called Nested Attention for personalizing text-to-image models. The method addresses the challenge of balancing identity preservation of subjects with the alignment to text prompts. By using query-dependent subject values from nested attention layers, the model can effectively select relevant features for each part of the generated image. This approach not only maintains high identity fidelity but also allows for the integration of multiple personalized subjects from different domains into a single image.'}, 'zh': {'title': '嵌套注意力:个性化图像生成的新方法', 'desc': '本文介绍了一种新的机制,称为嵌套注意力,用于个性化文本到图像模型。该方法通过在模型的交叉注意力层中注入丰富的图像表示,解决了身份保留与文本提示对齐之间的平衡问题。嵌套注意力层能够为生成图像的每个区域选择相关的主题特征,从而实现高效的个性化。我们的研究表明,这种方法可以在多个领域进行训练,并允许在单个图像中结合来自不同领域的多个个性化主题。'}}}, {'id': 'https://huggingface.co/papers/2501.00658', 'title': 'Understanding and Mitigating Bottlenecks of State Space Models through the Lens of Recency and Over-smoothing', 'url': 'https://huggingface.co/papers/2501.00658', 'abstract': "Structured State Space Models (SSMs) have emerged as alternatives to transformers. While SSMs are often regarded as effective in capturing long-sequence dependencies, we rigorously demonstrate that they are inherently limited by strong recency bias. Our empirical studies also reveal that this bias impairs the models' ability to recall distant information and introduces robustness issues. Our scaling experiments then discovered that deeper structures in SSMs can facilitate the learning of long contexts. However, subsequent theoretical analysis reveals that as SSMs increase in depth, they exhibit another inevitable tendency toward over-smoothing, e.g., token representations becoming increasingly indistinguishable. 
This fundamental dilemma between recency and over-smoothing hinders the scalability of existing SSMs. Inspired by our theoretical findings, we propose to polarize two channels of the state transition matrices in SSMs, setting them to zero and one, respectively, simultaneously addressing recency bias and over-smoothing. Experiments demonstrate that our polarization technique consistently enhances the associative recall accuracy of long-range tokens and unlocks SSMs to benefit further from deeper architectures. All source codes are released at https://github.com/VITA-Group/SSM-Bottleneck.", 'score': 6, 'issue_id': 1476, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '253304ea64defbe0', 'authors': ['Peihao Wang', 'Ruisi Cai', 'Yuehao Wang', 'Jiajun Zhu', 'Pragya Srivastava', 'Zhangyang Wang', 'Pan Li'], 'affiliations': ['Georgia Tech', 'Google DeepMind', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00658.jpg', 'data': {'categories': ['#training', '#open_source', '#long_context', '#optimization', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Преодоление ограничений SSM: баланс между недавностью и сглаживанием', 'desc': 'Структурированные модели пространства состояний (SSM) рассматриваются как альтернатива трансформерам в обработке длинных последовательностей. Исследование показало, что SSM имеют существенное ограничение в виде сильного смещения к недавним данным, что затрудняет запоминание отдаленной информации. Увеличение глубины SSM улучшает обработку длинных контекстов, но приводит к проблеме чрезмерного сглаживания. Авторы предлагают метод поляризации каналов матриц перехода состояний для решения этих проблем, что улучшает точность ассоциативного извлечения дальних токенов.'}, 'en': {'title': 'Balancing Recency and Over-Smoothing in SSMs', 'desc': "This paper discusses Structured State Space Models (SSMs) as alternatives to transformers, highlighting their limitations due to strong recency bias. This bias affects the models' ability to remember distant information and creates robustness issues. The authors propose a solution by polarizing the state transition matrices, which helps mitigate both recency bias and over-smoothing that occurs with deeper architectures. Their experiments show that this new approach improves the accuracy of recalling long-range tokens, allowing SSMs to effectively utilize deeper structures."}, 'zh': {'title': '解决近期偏见与过平滑的双重挑战', 'desc': '结构状态空间模型(SSMs)作为变换器的替代方案,虽然在捕捉长序列依赖性方面表现出色,但存在强烈的近期偏见限制。我们的实证研究表明,这种偏见影响了模型对远程信息的回忆能力,并引入了鲁棒性问题。通过扩展实验,我们发现SSMs的深层结构可以促进长上下文的学习,但理论分析显示,随着深度增加,模型会出现过平滑的趋势,使得标记表示变得难以区分。我们提出的极化技术通过将状态转移矩阵的两个通道设置为零和一,解决了近期偏见和过平滑的问题,显著提高了长距离标记的关联回忆准确性。'}}}, {'id': 'https://huggingface.co/papers/2501.01245', 'title': 'SeFAR: Semi-supervised Fine-grained Action Recognition with Temporal Perturbation and Learning Stabilization', 'url': 'https://huggingface.co/papers/2501.01245', 'abstract': 'Human action understanding is crucial for the advancement of multimodal systems. While recent developments, driven by powerful large language models (LLMs), aim to be general enough to cover a wide range of categories, they often overlook the need for more specific capabilities. In this work, we address the more challenging task of Fine-grained Action Recognition (FAR), which focuses on detailed semantic labels within shorter temporal duration (e.g., "salto backward tucked with 1 turn"). 
Given the high costs of annotating fine-grained labels and the substantial data needed for fine-tuning LLMs, we propose to adopt semi-supervised learning (SSL). Our framework, SeFAR, incorporates several innovative designs to tackle these challenges. Specifically, to capture sufficient visual details, we construct Dual-level temporal elements as more effective representations, based on which we design a new strong augmentation strategy for the Teacher-Student learning paradigm through involving moderate temporal perturbation. Furthermore, to handle the high uncertainty within the teacher model\'s predictions for FAR, we propose the Adaptive Regulation to stabilize the learning process. Experiments show that SeFAR achieves state-of-the-art performance on two FAR datasets, FineGym and FineDiving, across various data scopes. It also outperforms other semi-supervised methods on two classical coarse-grained datasets, UCF101 and HMDB51. Further analysis and ablation studies validate the effectiveness of our designs. Additionally, we show that the features extracted by our SeFAR could largely promote the ability of multimodal foundation models to understand fine-grained and domain-specific semantics.', 'score': 5, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '30d94590a5c78569', 'authors': ['Yongle Huang', 'Haodong Chen', 'Zhenbang Xu', 'Zihan Jia', 'Haozhou Sun', 'Dian Shao'], 'affiliations': ['School of Automation, Northwestern Polytechnical University, Xian, China', 'School of Computer Science, Northwestern Polytechnical University, Xian, China', 'School of Software, Northwestern Polytechnical University, Xian, China', 'Unmanned System Research Institute, Northwestern Polytechnical University, Xian, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01245.jpg', 'data': {'categories': ['#dataset', '#transfer_learning', '#multimodal', '#optimization', '#training'], 'emoji': '🤸', 'ru': {'title': 'SeFAR: Прорыв в распознавании детализированных действий с помощью полу-контролируемого обучения', 'desc': 'Статья представляет новый подход к задаче распознавания детализированных действий (Fine-grained Action Recognition, FAR) с использованием полу-контролируемого обучения. Авторы предлагают фреймворк SeFAR, который включает в себя двухуровневые временные элементы для более эффективного представления действий и новую стратегию аугментации данных. SeFAR также использует адаптивную регуляцию для стабилизации процесса обучения при работе с неопределенностью в предсказаниях модели-учителя. Эксперименты показывают, что SeFAR достигает лучших результатов на нескольких наборах данных FAR и классических наборах данных для распознавания действий.'}, 'en': {'title': 'SeFAR: Elevating Fine-grained Action Recognition with Semi-supervised Learning', 'desc': "This paper focuses on improving Fine-grained Action Recognition (FAR), which identifies specific actions in short time frames. The authors introduce a semi-supervised learning framework called SeFAR, which uses innovative techniques to enhance the learning process despite the challenges of limited labeled data. They develop Dual-level temporal elements for better visual representation and implement a strong augmentation strategy within a Teacher-Student learning setup. 
The results demonstrate that SeFAR achieves top performance on FAR datasets and enhances multimodal models' understanding of detailed actions."}, 'zh': {'title': '细粒度动作识别的新突破', 'desc': '人类动作理解对多模态系统的发展至关重要。本文提出了一种新的框架SeFAR,专注于细粒度动作识别(FAR),旨在处理短时间内的详细语义标签。我们采用半监督学习(SSL)来减少对大量标注数据的需求,并通过构建双层时间元素和新的强增强策略来提高模型的表现。实验结果表明,SeFAR在多个数据集上达到了最先进的性能,证明了我们设计的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.00910', 'title': 'Population Aware Diffusion for Time Series Generation', 'url': 'https://huggingface.co/papers/2501.00910', 'abstract': 'Diffusion models have shown promising ability in generating high-quality time series (TS) data. Despite the initial success, existing works mostly focus on the authenticity of data at the individual level, but pay less attention to preserving the population-level properties on the entire dataset. Such population-level properties include value distributions for each dimension and distributions of certain functional dependencies (e.g., cross-correlation, CC) between different dimensions. For instance, when generating house energy consumption TS data, the value distributions of the outside temperature and the kitchen temperature should be preserved, as well as the distribution of CC between them. Preserving such TS population-level properties is critical in maintaining the statistical insights of the datasets, mitigating model bias, and augmenting downstream tasks like TS prediction. Yet, it is often overlooked by existing models. Hence, data generated by existing models often bear distribution shifts from the original data. We propose Population-aware Diffusion for Time Series (PaD-TS), a new TS generation model that better preserves the population-level properties. The key novelties of PaD-TS include 1) a new training method explicitly incorporating TS population-level property preservation, and 2) a new dual-channel encoder model architecture that better captures the TS data structure. Empirical results in major benchmark datasets show that PaD-TS can improve the average CC distribution shift score between real and synthetic data by 5.9x while maintaining a performance comparable to state-of-the-art models on individual-level authenticity.', 'score': 4, 'issue_id': 1486, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'cd3f9282d55e15f2', 'authors': ['Yang Li', 'Han Meng', 'Zhenyu Bi', 'Ingolv T. Urnes', 'Haipeng Chen'], 'affiliations': ['Generated Health', 'Virginia Tech', 'William & Mary'], 'pdf_title_img': 'assets/pdf/title_img/2501.00910.jpg', 'data': {'categories': ['#synthetic', '#benchmark', '#dataset', '#data', '#training', '#architecture', '#diffusion'], 'emoji': '📊', 'ru': {'title': 'Генерация временных рядов с сохранением свойств популяции', 'desc': 'Статья представляет новую модель генерации временных рядов под названием PaD-TS (Population-aware Diffusion for Time Series). Модель нацелена на сохранение свойств на уровне популяции, таких как распределения значений и функциональные зависимости между измерениями. PaD-TS использует новый метод обучения, явно включающий сохранение свойств временных рядов на уровне популяции, и новую архитектуру модели с двухканальным энкодером. 
Эмпирические результаты показывают значительное улучшение в сохранении распределения кросс-корреляций при сравнимой аутентичности на индивидуальном уровне.'}, 'en': {'title': 'Preserving Population Insights in Time Series Generation', 'desc': 'This paper introduces a new model called Population-aware Diffusion for Time Series (PaD-TS) that focuses on generating time series data while preserving important population-level properties. Unlike previous models that mainly ensure individual data authenticity, PaD-TS emphasizes maintaining the overall statistical characteristics of the dataset, such as value distributions and cross-correlations between different dimensions. The model employs a novel training method and a dual-channel encoder architecture to effectively capture the structure of time series data. Experimental results demonstrate that PaD-TS significantly reduces distribution shifts in generated data while achieving comparable performance in individual-level authenticity to existing state-of-the-art models.'}, 'zh': {'title': '保留人口级特性,提升时间序列生成质量', 'desc': '扩散模型在生成高质量时间序列数据方面表现出色。然而,现有研究主要关注个体数据的真实性,而忽视了整个数据集的人口级特性。我们提出了一种新的时间序列生成模型PaD-TS,旨在更好地保留这些人口级特性,包括值分布和不同维度之间的交叉相关性。实验结果表明,PaD-TS在保持个体级真实性的同时,显著改善了真实数据与合成数据之间的分布差异。'}}}, {'id': 'https://huggingface.co/papers/2501.00712', 'title': 'Rethinking Addressing in Language Models via Contexualized Equivariant Positional Encoding', 'url': 'https://huggingface.co/papers/2501.00712', 'abstract': 'Transformers rely on both content-based and position-based addressing mechanisms to make predictions, but existing positional encoding techniques often diminish the effectiveness of position-based addressing. Many current methods enforce rigid patterns in attention maps, limiting the ability to model long-range dependencies and adapt to diverse tasks. Additionally, most positional encodings are learned as general biases, lacking the specialization required for different instances within a dataset. To address this, we propose conTextualized equivariAnt Position Embedding (TAPE), a novel framework that enhances positional embeddings by incorporating sequence content across layers. TAPE introduces dynamic, context-aware positional encodings, overcoming the constraints of traditional fixed patterns. By enforcing permutation and orthogonal equivariance, TAPE ensures the stability of positional encodings during updates, improving robustness and adaptability. Our method can be easily integrated into pre-trained transformers, offering parameter-efficient fine-tuning with minimal overhead. Extensive experiments shows that TAPE achieves superior performance in language modeling, arithmetic reasoning, and long-context retrieval tasks compared to existing positional embedding techniques.', 'score': 4, 'issue_id': 1485, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'e5119d0e83ce2af2', 'authors': ['Jiajun Zhu', 'Peihao Wang', 'Ruisi Cai', 'Jason D. Lee', 'Pan Li', 'Zhangyang Wang'], 'affiliations': ['Georgia Tech', 'Princeton University', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00712.jpg', 'data': {'categories': ['#long_context', '#optimization', '#training', '#architecture', '#reasoning'], 'emoji': '🔀', 'ru': {'title': 'Динамические позиционные эмбеддинги для улучшения работы трансформеров', 'desc': 'Авторы предлагают новый метод позиционного кодирования для трансформеров под названием TAPE. 
Этот подход учитывает контекст последовательности и создает динамические позиционные эмбеддинги, адаптированные к конкретным задачам. TAPE обеспечивает стабильность кодирования благодаря свойствам перестановочной и ортогональной эквивариантности. Метод легко интегрируется в предобученные модели и показывает превосходные результаты в задачах языкового моделирования, арифметических рассуждений и поиска в длинных контекстах.'}, 'en': {'title': 'Enhancing Transformers with Context-Aware Positional Embeddings', 'desc': "This paper introduces a new method called conTextualized equivariAnt Position Embedding (TAPE) to improve how transformers use positional information. Traditional positional encodings often restrict the model's ability to understand long-range relationships in data. TAPE enhances these encodings by making them dynamic and context-aware, allowing them to adapt to different sequences and tasks. The method shows better performance in various applications, such as language modeling and reasoning, while being easy to integrate into existing transformer models."}, 'zh': {'title': '提升变换器模型的位置信息处理能力', 'desc': '本文提出了一种新的位置编码方法,称为TAPE(conTextualized equivariAnt Position Embedding),旨在提高变换器模型的预测能力。传统的位置编码方法往往限制了模型对长距离依赖关系的建模能力,而TAPE通过引入动态的、上下文感知的位置编码来克服这一问题。该方法确保了位置编码在更新过程中的稳定性,从而提高了模型的鲁棒性和适应性。实验结果表明,TAPE在语言建模、算术推理和长上下文检索任务中表现优于现有的位置编码技术。'}}}, {'id': 'https://huggingface.co/papers/2412.19723', 'title': 'OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis', 'url': 'https://huggingface.co/papers/2412.19723', 'abstract': "Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-defined tasks, which are either resource-intensive or unable to guarantee data quality. Moreover, these methods suffer from limited data diversity and significant gaps between synthetic data and real-world environments. To address these challenges, we propose OS-Genesis, a novel GUI data synthesis pipeline that reverses the conventional trajectory collection process. Instead of relying on pre-defined tasks, OS-Genesis enables agents first to perceive environments and perform step-wise interactions, then retrospectively derive high-quality tasks to enable trajectory-level exploration. A trajectory reward model is then employed to ensure the quality of the generated trajectories. We demonstrate that training GUI agents with OS-Genesis significantly improves their performance on highly challenging online benchmarks. In-depth analysis further validates OS-Genesis's efficiency and its superior data quality and diversity compared to existing synthesis methods. 
Our codes, data, and checkpoints are available at https://qiushisun.github.io/OS-Genesis-Home/{OS-Genesis Homepage}.", 'score': 50, 'issue_id': 1455, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'b331198d09aa8650', 'authors': ['Qiushi Sun', 'Kanzhi Cheng', 'Zichen Ding', 'Chuanyang Jin', 'Yian Wang', 'Fangzhi Xu', 'Zhenyu Wu', 'Chengyou Jia', 'Liheng Chen', 'Zhoumianze Liu', 'Ben Kao', 'Guohao Li', 'Junxian He', 'Yu Qiao', 'Zhiyong Wu'], 'affiliations': ['Hong Kong University of Science and Technology', 'Johns Hopkins University', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The University of Hong Kong', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2412.19723.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#dataset', '#optimization', '#training', '#data', '#agents'], 'emoji': '🖥️', 'ru': {'title': 'Революция в обучении ИИ-агентов: от заданий к исследованию', 'desc': 'Статья представляет OS-Genesis - новый метод синтеза данных для обучения ИИ-агентов взаимодействию с графическим интерфейсом. Вместо предопределенных задач, агенты сначала исследуют среду и выполняют пошаговые действия, а затем ретроспективно формируют качественные траектории. Используется модель вознаграждения для обеспечения качества сгенерированных траекторий. Результаты показывают значительное улучшение производительности агентов на сложных онлайн-бенчмарках по сравнению с существующими методами.'}, 'en': {'title': 'Revolutionizing GUI Agent Training with OS-Genesis', 'desc': 'This paper introduces OS-Genesis, a new method for generating high-quality trajectory data for training GUI agents using Vision-Language Models (VLMs). Unlike traditional methods that rely on human supervision or predefined tasks, OS-Genesis allows agents to first interact with their environment and then derive tasks retrospectively. This approach enhances data diversity and quality by enabling agents to explore and learn from real-world interactions. The results show that GUI agents trained with OS-Genesis perform significantly better on challenging benchmarks, demonstrating the effectiveness of this novel data synthesis pipeline.'}, 'zh': {'title': 'OS-Genesis:提升GUI代理性能的新方法', 'desc': '本论文提出了一种名为OS-Genesis的新型图形用户界面(GUI)数据合成管道,旨在解决高质量轨迹数据收集的瓶颈。传统方法依赖于人类监督或合成数据生成,往往资源消耗大且数据质量难以保证。OS-Genesis通过让代理先感知环境并进行逐步交互,随后回溯生成高质量任务,从而实现轨迹级探索。实验结果表明,使用OS-Genesis训练的GUI代理在复杂的在线基准测试中表现显著提升,且其数据质量和多样性优于现有合成方法。'}}}, {'id': 'https://huggingface.co/papers/2412.19638', 'title': 'Xmodel-2 Technical Report', 'url': 'https://huggingface.co/papers/2412.19638', 'abstract': 'Xmodel-2 is a 1.2-billion-parameter large language model designed specifically for reasoning tasks. Its architecture enables different model scales to share a unified set of hyperparameters, allowing for extensive experimentation on smaller models and seamless transfer of optimal configurations to larger models. To maximize training efficiency and stability, Xmodel-2 employs the WSD learning rate scheduler from MiniCPM. Pretrained on 1.5 trillion tokens from diverse sources, Xmodel-2 achieves state-of-the-art performance in complex reasoning and agent-based tasks, while maintaining low training costs. These results highlight the potential of efficient model design and training strategies in advancing reasoning capabilities. 
Model checkpoints and code are publicly available on GitHub at https://github.com/XiaoduoAILab/Xmodel-2', 'score': 11, 'issue_id': 1453, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': '4707dc8ac5a87e66', 'authors': ['Wang Qun', 'Liu Yang', 'Lin Qingquan', 'Qu Zhijiu', 'Jiang Ling'], 'affiliations': ['AI Lab, Xiaodu Technology'], 'pdf_title_img': 'assets/pdf/title_img/2412.19638.jpg', 'data': {'categories': ['#optimization', '#training', '#small_models', '#reasoning', '#open_source', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное рассуждение с Xmodel-2: мощь в компактности', 'desc': 'Xmodel-2 - это языковая модель с 1,2 миллиардами параметров, специализирующаяся на задачах рассуждения. Её архитектура позволяет разным масштабам модели использовать единый набор гиперпараметров, что облегчает эксперименты и перенос оптимальных конфигураций. Модель использует планировщик скорости обучения WSD из MiniCPM для повышения эффективности и стабильности. Предобученная на 1,5 триллионах токенов, Xmodel-2 достигает передовых результатов в сложных задачах рассуждения, сохраняя низкие затраты на обучение.'}, 'en': {'title': 'Unlocking Reasoning Power with Efficient Model Design', 'desc': 'Xmodel-2 is a large language model with 1.2 billion parameters, specifically built for reasoning tasks. It features a flexible architecture that allows different model sizes to use the same hyperparameters, facilitating experimentation and optimization across scales. The model utilizes the WSD learning rate scheduler to enhance training efficiency and stability. With pretraining on 1.5 trillion tokens, Xmodel-2 demonstrates superior performance in complex reasoning tasks while keeping training costs low, showcasing the benefits of efficient model design.'}, 'zh': {'title': '高效推理能力的模型设计与训练策略', 'desc': 'Xmodel-2 是一个拥有 12 亿参数的大型语言模型,专门设计用于推理任务。它的架构允许不同规模的模型共享统一的超参数,从而可以在较小的模型上进行广泛实验,并将最佳配置无缝转移到更大的模型上。为了最大化训练效率和稳定性,Xmodel-2 采用了 MiniCPM 的 WSD 学习率调度器。经过在 1.5 万亿个来自多样化来源的标记上进行预训练,Xmodel-2 在复杂推理和基于代理的任务中达到了最先进的性能,同时保持了较低的训练成本。'}}}, {'id': 'https://huggingface.co/papers/2412.20735', 'title': 'HUNYUANPROVER: A Scalable Data Synthesis Framework and Guided Tree Search for Automated Theorem Proving', 'url': 'https://huggingface.co/papers/2412.20735', 'abstract': 'We introduce HunyuanProver, a language model finetuned from the Hunyuan 7B for interactive automatic theorem proving with LEAN4. To alleviate the data sparsity issue, we design a scalable framework to iteratively synthesize data at low cost. In addition, guided tree search algorithms are designed to enable effective "system 2 thinking" of the prover. HunyuanProver achieves state-of-the-art (SOTA) performance on major benchmarks. Specifically, it achieves a pass rate of 68.4% on the miniF2F-test, compared to the current SOTA of 65.9%. It proves 4 IMO statements (imo_1960_p2, imo_1962_p2, imo_1964_p2 and imo_1983_p6) in miniF2F-test. 
To benefit the community, we will open-source a dataset of 30k synthesized instances, where each instance contains the original question in natural language, the converted statement by autoformalization, and the proof by HunyuanProver.', 'score': 3, 'issue_id': 1464, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '18d70581e862bf86', 'authors': ['Yang Li', 'Dong Du', 'Linfeng Song', 'Chen Li', 'Weikang Wang', 'Tao Yang', 'Haitao Mi'], 'affiliations': ['Tencent', 'Tencent Hunyuan Teams'], 'pdf_title_img': 'assets/pdf/title_img/2412.20735.jpg', 'data': {'categories': ['#dataset', '#synthetic', '#data', '#benchmark', '#reasoning', '#open_source', '#training', '#math'], 'emoji': '🧠', 'ru': {'title': 'Прорыв в автоматическом доказательстве теорем с помощью ИИ', 'desc': "HunyuanProver - это языковая модель, настроенная для автоматического доказательства теорем с использованием LEAN4. Модель использует масштабируемую структуру для итеративного синтеза данных и алгоритмы направленного поиска по дереву для эффективного 'системного мышления'. HunyuanProver достигает лучших результатов на основных бенчмарках, включая 68.4% прохождения на miniF2F-test. Авторы планируют открыть доступ к набору данных из 30 тысяч синтезированных примеров для пользы сообщества."}, 'en': {'title': 'HunyuanProver: Advancing Theorem Proving with AI', 'desc': 'HunyuanProver is a language model specifically fine-tuned for interactive automatic theorem proving using LEAN4. To address the challenge of data sparsity, the authors developed a scalable framework that allows for the iterative synthesis of data at a low cost. They also implemented guided tree search algorithms to enhance the reasoning capabilities of the prover, enabling it to perform complex logical deductions. HunyuanProver has achieved state-of-the-art performance on key benchmarks, including a notable pass rate of 68.4% on the miniF2F-test, surpassing previous results and proving several significant mathematical statements.'}, 'zh': {'title': 'HunyuanProver:自动定理证明的新突破', 'desc': '本文介绍了HunyuanProver,这是一个基于Hunyuan 7B微调的语言模型,旨在与LEAN4进行交互式自动定理证明。为了缓解数据稀疏问题,我们设计了一个可扩展的框架,以低成本迭代合成数据。此外,我们还设计了引导树搜索算法,以实现证明者的有效“系统2思维”。HunyuanProver在主要基准测试中达到了最先进的性能,特别是在miniF2F-test中取得了68.4%的通过率,超越了当前的65.9%最先进结果。'}}}, {'id': 'https://huggingface.co/papers/2501.05441', 'title': 'The GAN is dead; long live the GAN! A Modern GAN Baseline', 'url': 'https://huggingface.co/papers/2501.05441', 'abstract': 'There is a widely-spread claim that GANs are difficult to train, and GAN architectures in the literature are littered with empirical tricks. We provide evidence against this claim and build a modern GAN baseline in a more principled manner. First, we derive a well-behaved regularized relativistic GAN loss that addresses issues of mode dropping and non-convergence that were previously tackled via a bag of ad-hoc tricks. We analyze our loss mathematically and prove that it admits local convergence guarantees, unlike most existing relativistic losses. Second, our new loss allows us to discard all ad-hoc tricks and replace outdated backbones used in common GANs with modern architectures. Using StyleGAN2 as an example, we present a roadmap of simplification and modernization that results in a new minimalist baseline -- R3GAN. 
Despite being simple, our approach surpasses StyleGAN2 on FFHQ, ImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against state-of-the-art GANs and diffusion models.', 'score': 51, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'eb1cd90c4d5cb0ef', 'authors': ['Yiwen Huang', 'Aaron Gokaslan', 'Volodymyr Kuleshov', 'James Tompkin'], 'affiliations': ['Brown University', 'Cornell University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05441.jpg', 'data': {'categories': ['#training', '#architecture', '#diffusion', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Упрощение и модернизация GAN: новый взгляд на обучение генеративных моделей', 'desc': 'Исследователи опровергают распространенное мнение о сложности обучения генеративно-состязательных сетей (GAN). Они разработали новый регуляризованный релятивистский GAN-лосс, который решает проблемы потери мод и отсутствия сходимости. Авторы математически доказывают, что их лосс обеспечивает локальную сходимость, в отличие от большинства существующих релятивистских лоссов. На основе этого подхода они создали минималистичную базовую модель R3GAN, которая превосходит StyleGAN2 и другие современные GAN на нескольких наборах данных.'}, 'en': {'title': 'Simplifying GAN Training with R3GAN: A New Era of Efficiency', 'desc': 'This paper challenges the common belief that Generative Adversarial Networks (GANs) are inherently difficult to train. It introduces a new GAN loss function called the regularized relativistic GAN loss, which effectively addresses issues like mode dropping and non-convergence without relying on numerous empirical tricks. The authors provide mathematical analysis showing that their loss function guarantees local convergence, which is a significant improvement over existing methods. By applying this new loss to modern architectures like StyleGAN2, they create a simplified and efficient GAN model named R3GAN, which outperforms previous models on several benchmark datasets.'}, 'zh': {'title': '简化GAN训练,超越传统架构', 'desc': '这篇论文探讨了生成对抗网络(GAN)训练的难点,并提出了一种新的方法来简化这一过程。作者提出了一种正则化的相对GAN损失函数,解决了模式丢失和非收敛的问题。通过数学分析,证明了这种损失函数具有局部收敛的保证,优于现有的相对损失函数。最终,作者展示了一个新的简约基线R3GAN,其在多个数据集上的表现超过了StyleGAN2,并与最先进的GAN和扩散模型相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.05032', 'title': 'Enhancing Human-Like Responses in Large Language Models', 'url': 'https://huggingface.co/papers/2501.05032', 'abstract': 'This paper explores the advancements in making large language models (LLMs) more human-like. We focus on techniques that enhance natural language understanding, conversational coherence, and emotional intelligence in AI systems. The study evaluates various approaches, including fine-tuning with diverse datasets, incorporating psychological principles, and designing models that better mimic human reasoning patterns. Our findings demonstrate that these enhancements not only improve user interactions but also open new possibilities for AI applications across different domains. 
Future work will address the ethical implications and potential biases introduced by these human-like attributes.', 'score': 28, 'issue_id': 1609, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '64e14687fd1e5dab', 'authors': ['Ethem Yağız Çalık', 'Talha Rüzgar Akkuş'], 'affiliations': ['Hugging Face'], 'pdf_title_img': 'assets/pdf/title_img/2501.05032.jpg', 'data': {'categories': ['#training', '#alignment', '#rlhf', '#ethics', '#multimodal'], 'emoji': '🤖', 'ru': {'title': 'Путь к человекоподобному ИИ: улучшение больших языковых моделей', 'desc': 'Статья исследует методы повышения человекоподобности больших языковых моделей (LLM). Авторы рассматривают техники улучшения понимания естественного языка, связности диалогов и эмоционального интеллекта в системах искусственного интеллекта. Исследование оценивает различные подходы, включая дообучение на разнообразных датасетах, внедрение психологических принципов и разработку моделей, лучше имитирующих человеческие паттерны мышления. Результаты показывают, что эти улучшения не только совершенствуют взаимодействие с пользователем, но и открывают новые возможности для применения ИИ в различных областях.'}, 'en': {'title': 'Enhancing AI: Making Language Models More Human-Like', 'desc': 'This paper investigates how to make large language models (LLMs) behave more like humans. It emphasizes improving natural language understanding, making conversations more coherent, and increasing emotional intelligence in AI. The research assesses methods such as fine-tuning models with varied datasets and applying psychological principles to enhance human-like reasoning. The results show that these improvements lead to better user experiences and expand the potential uses of AI, while also highlighting the need to consider ethical issues and biases that may arise.'}, 'zh': {'title': '让人工智能更像人类的未来', 'desc': '本文探讨了使大型语言模型(LLMs)更具人性化的进展。我们重点关注增强自然语言理解、对话连贯性和情感智能的技术。研究评估了多种方法,包括使用多样化数据集进行微调、融入心理学原理,以及设计更好模拟人类推理模式的模型。我们的发现表明,这些增强不仅改善了用户互动,还为不同领域的人工智能应用开辟了新可能。'}}}, {'id': 'https://huggingface.co/papers/2501.05453', 'title': 'An Empirical Study of Autoregressive Pre-training from Videos', 'url': 'https://huggingface.co/papers/2501.05453', 'abstract': 'We empirically study autoregressive pre-training from videos. To perform our study, we construct a series of autoregressive video models, called Toto. We treat videos as sequences of visual tokens and train transformer models to autoregressively predict future tokens. Our models are pre-trained on a diverse dataset of videos and images comprising over 1 trillion visual tokens. We explore different architectural, training, and inference design choices. We evaluate the learned visual representations on a range of downstream tasks including image recognition, video classification, object tracking, and robotics. Our results demonstrate that, despite minimal inductive biases, autoregressive pre-training leads to competitive performance across all benchmarks. Finally, we find that scaling our video models results in similar scaling curves to those seen in language models, albeit with a different rate. 
More details at https://brjathu.github.io/toto/', 'score': 28, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '3846ea8507d046be', 'authors': ['Jathushan Rajasegaran', 'Ilija Radosavovic', 'Rahul Ravishankar', 'Yossi Gandelsman', 'Christoph Feichtenhofer', 'Jitendra Malik'], 'affiliations': ['Meta FAIR', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.05453.jpg', 'data': {'categories': ['#training', '#dataset', '#benchmark', '#architecture', '#robotics', '#video', '#cv'], 'emoji': '🎬', 'ru': {'title': 'Авторегрессионное предобучение видео: путь к универсальному компьютерному зрению', 'desc': 'В статье исследуется авторегрессионное предобучение на видеоданных с использованием модели Toto. Авторы рассматривают видео как последовательности визуальных токенов и обучают трансформеры предсказывать будущие токены. Модели предобучаются на разнообразном наборе данных из более чем триллиона визуальных токенов. Результаты показывают, что такой подход дает конкурентоспособную производительность на различных задачах компьютерного зрения.'}, 'en': {'title': 'Unlocking Video Understanding with Autoregressive Models', 'desc': 'This paper investigates the use of autoregressive pre-training for video data through a series of models named Toto. The authors treat videos as sequences of visual tokens and employ transformer architectures to predict future tokens in these sequences. They pre-train their models on a massive dataset containing over 1 trillion visual tokens, exploring various design choices in architecture and training. The results show that these autoregressive models achieve strong performance on tasks like image recognition and video classification, indicating that scaling video models can yield similar benefits as seen in language models.'}, 'zh': {'title': '自回归预训练:视频模型的新突破', 'desc': '本文研究了视频的自回归预训练。我们构建了一系列名为Toto的自回归视频模型,将视频视为视觉标记的序列,并训练变换器模型以自回归方式预测未来的标记。我们的模型在一个包含超过1万亿视觉标记的多样化视频和图像数据集上进行预训练,并在多个下游任务上评估学习到的视觉表示。结果表明,尽管诱导偏差较小,自回归预训练在所有基准测试中表现出竞争力的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04003', 'title': 'Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives', 'url': 'https://huggingface.co/papers/2501.04003', 'abstract': "Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. To address this gap, we introduce DriveBench, a benchmark dataset designed to evaluate VLM reliability across 17 settings (clean, corrupted, and text-only inputs), encompassing 19,200 frames, 20,498 question-answer pairs, three question types, four mainstream driving tasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often generate plausible responses derived from general knowledge or textual cues rather than true visual grounding, especially under degraded or missing visual inputs. This behavior, concealed by dataset imbalances and insufficient evaluation metrics, poses significant risks in safety-critical scenarios like autonomous driving. We further observe that VLMs struggle with multi-modal reasoning and display heightened sensitivity to input corruptions, leading to inconsistencies in performance. 
To address these challenges, we propose refined evaluation metrics that prioritize robust visual grounding and multi-modal understanding. Additionally, we highlight the potential of leveraging VLMs' awareness of corruptions to enhance their reliability, offering a roadmap for developing more trustworthy and interpretable decision-making systems in real-world autonomous driving contexts. The benchmark toolkit is publicly accessible.", 'score': 20, 'issue_id': 1599, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '720b493a608f478a', 'authors': ['Shaoyuan Xie', 'Lingdong Kong', 'Yuhao Dong', 'Chonghao Sima', 'Wenwei Zhang', 'Qi Alfred Chen', 'Ziwei Liu', 'Liang Pan'], 'affiliations': ['National University of Singapore', 'S-Lab, Nanyang Technological University', 'Shanghai AI Laboratory', 'The University of Hong Kong', 'University of California, Irvine'], 'pdf_title_img': 'assets/pdf/title_img/2501.04003.jpg', 'data': {'categories': ['#security', '#interpretability', '#dataset', '#multimodal', '#reasoning', '#benchmark', '#cv'], 'emoji': '🚗', 'ru': {'title': 'Проверка надёжности VLM для безопасного автономного вождения', 'desc': 'Статья представляет DriveBench - набор данных для оценки надёжности мультимодальных языковых моделей (VLM) в контексте автономного вождения. Исследование выявило, что VLM часто генерируют правдоподобные ответы на основе общих знаний, а не визуальной информации, что опасно в критически важных сценариях. Авторы предлагают усовершенствованные метрики оценки, ориентированные на надёжную визуальную привязку и мультимодальное понимание. Также отмечается потенциал использования осведомленности VLM о искажениях для повышения их надёжности.'}, 'en': {'title': 'Enhancing Trust in Vision-Language Models for Safer Autonomous Driving', 'desc': 'This paper discusses the limitations of Vision-Language Models (VLMs) in the context of autonomous driving, particularly their ability to provide reliable and interpretable driving decisions. The authors introduce DriveBench, a comprehensive benchmark dataset that tests VLM performance across various conditions, including clean and corrupted inputs. Their research shows that VLMs often rely on general knowledge rather than true visual understanding, especially when visual data is compromised. To improve VLM reliability, the paper suggests new evaluation metrics focused on visual grounding and multi-modal reasoning, aiming to enhance the safety of autonomous driving systems.'}, 'zh': {'title': '提升自动驾驶决策的可靠性与可解释性', 'desc': '本文介绍了DriveBench,一个用于评估视觉语言模型(VLMs)在自动驾驶中可靠性的基准数据集。该数据集包含19200帧图像和20498个问答对,涵盖了多种驾驶任务和输入类型。研究发现,VLMs在处理受损或缺失的视觉输入时,往往依赖于一般知识而非真实的视觉信息,导致安全隐患。为了解决这些问题,本文提出了改进的评估指标,强调视觉基础和多模态理解的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.05122', 'title': 'Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.05122', 'abstract': 'Most Large Vision-Language Models (LVLMs) to date are trained predominantly on English data, which makes them struggle to understand non-English input and fail to generate output in the desired target language. Existing efforts mitigate these issues by adding multilingual training data, but do so in a largely ad-hoc manner, lacking insight into how different training mixes tip the scale for different groups of languages. In this work, we present a comprehensive investigation into the training strategies for massively multilingual LVLMs. 
First, we conduct a series of multi-stage experiments spanning 13 downstream vision-language tasks and 43 languages, systematically examining: (1) the number of training languages that can be included without degrading English performance and (2) optimal language distributions of pre-training as well as (3) instruction-tuning data. Further, we (4) investigate how to improve multilingual text-in-image understanding, and introduce a new benchmark for the task. Surprisingly, our analysis reveals that one can (i) include as many as 100 training languages simultaneously (ii) with as little as 25-50% of non-English data, to greatly improve multilingual performance while retaining strong English performance. We further find that (iii) including non-English OCR data in pre-training and instruction-tuning is paramount for improving multilingual text-in-image understanding. Finally, we put all our findings together and train Centurio, a 100-language LVLM, offering state-of-the-art performance in an evaluation covering 14 tasks and 56 languages.', 'score': 13, 'issue_id': 1604, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '92d74f3bbeb4a400', 'authors': ['Gregor Geigle', 'Florian Schneider', 'Carolin Holtermann', 'Chris Biemann', 'Radu Timofte', 'Anne Lauscher', 'Goran Glavaš'], 'affiliations': ['Data Science Group, University of Hamburg', 'Language Technology Group', 'WüNLP, Computer Vision Lab, CAIDAS, University of Würzburg'], 'pdf_title_img': 'assets/pdf/title_img/2501.05122.jpg', 'data': {'categories': ['#machine_translation', '#multilingual', '#benchmark', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Centurio: Прорыв в многоязычном визуально-языковом ИИ', 'desc': 'В статье описывается исследование стратегий обучения многоязычных крупномасштабных визуально-языковых моделей (LVLMs). Авторы проводят эксперименты на 13 задачах и 43 языках, изучая оптимальное распределение языков в данных для предобучения и инструктивной настройки. Они обнаруживают, что можно включить до 100 языков обучения, используя всего 25-50% неанглийских данных, значительно улучшив многоязычную производительность при сохранении высокой эффективности на английском. На основе полученных результатов авторы обучают Centurio - 100-язычную LVLM, демонстрирующую передовые результаты на 14 задачах и 56 языках.'}, 'en': {'title': 'Unlocking Multilingual Mastery in Vision-Language Models', 'desc': 'This paper investigates how to effectively train Large Vision-Language Models (LVLMs) on multiple languages, particularly focusing on improving their performance in non-English languages. The authors conduct experiments across various tasks and languages to determine the best strategies for including multilingual data without harming English performance. They discover that including up to 100 languages and using a smaller proportion of non-English data can enhance multilingual capabilities while maintaining strong English results.
Additionally, they emphasize the importance of incorporating non-English OCR data to boost understanding of text within images, culminating in the development of Centurio, a 100-language LVLM with state-of-the-art performance.'}, 'zh': {'title': '提升多语言理解,Centurio引领新潮流', 'desc': '本文研究了大规模多语言视觉-语言模型(LVLM)的训练策略,特别关注如何提高模型对非英语输入的理解和输出能力。我们通过多阶段实验,分析了包含多种语言的训练数据对英语性能的影响,并探索了最佳的语言分布策略。研究发现,最多可以同时包含100种语言的训练数据,并且只需25-50%的非英语数据即可显著提升多语言性能。最后,我们结合所有发现,训练了Centurio,一个支持100种语言的LVLM,在14个任务和56种语言的评估中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.03489', 'title': 'Entropy-Guided Attention for Private LLMs', 'url': 'https://huggingface.co/papers/2501.03489', 'abstract': "The pervasiveness of proprietary language models has raised critical privacy concerns, necessitating advancements in private inference (PI), where computations are performed directly on encrypted data without revealing users' sensitive information. While PI offers a promising solution, its practical deployment is hindered by substantial communication and latency overheads, primarily stemming from nonlinear operations. To address this, we introduce an information-theoretic framework to characterize the role of nonlinearities in decoder-only language models, laying a principled foundation for optimizing transformer-architectures tailored to the demands of PI. By leveraging Shannon's entropy as a quantitative measure, we uncover the previously unexplored dual significance of nonlinearities: beyond ensuring training stability, they are crucial for maintaining attention head diversity. Specifically, we find that their removal triggers two critical failure modes: entropy collapse in deeper layers that destabilizes training, and entropic overload in earlier layers that leads to under-utilization of Multi-Head Attention's (MHA) representational capacity. We propose an entropy-guided attention mechanism paired with a novel entropy regularization technique to mitigate entropic overload. Additionally, we explore PI-friendly alternatives to layer normalization for preventing entropy collapse and stabilizing the training of LLMs with reduced-nonlinearities. Our study bridges the gap between information theory and architectural design, establishing entropy dynamics as a principled guide for developing efficient PI architectures. The code and implementation are available at https://github.com/Nandan91/entropy-guided-attention-llm.", 'score': 11, 'issue_id': 1597, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '18abcfb3fe1b209b', 'authors': ['Nandan Kumar Jha', 'Brandon Reagen'], 'affiliations': ['New York University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03489.jpg', 'data': {'categories': ['#security', '#inference', '#optimization', '#architecture', '#training', '#open_source'], 'emoji': '🔐', 'ru': {'title': 'Энтропия как ключ к конфиденциальным языковым моделям', 'desc': 'Статья рассматривает проблему конфиденциальности при использовании языковых моделей и предлагает решение через частное вычисление (PI). Авторы представляют информационно-теоретическую основу для оптимизации архитектур трансформеров под задачи PI, используя энтропию Шеннона как количественную меру. Исследование выявляет двойную роль нелинейностей в моделях: обеспечение стабильности обучения и поддержание разнообразия в механизме внимания.
Предложен энтропийно-управляемый механизм внимания и новая техника регуляризации энтропии для улучшения эффективности PI-архитектур.'}, 'en': {'title': 'Optimizing Language Models for Privacy with Entropy Dynamics', 'desc': 'This paper addresses privacy concerns related to proprietary language models by focusing on private inference (PI), which allows computations on encrypted data. The authors introduce an information-theoretic framework to analyze the impact of nonlinearities in decoder-only language models, which are essential for optimizing transformer architectures for PI. They identify two critical issues caused by the removal of nonlinearities: entropy collapse in deeper layers and entropic overload in earlier layers, both of which affect training stability and attention mechanisms. To resolve these issues, the paper proposes an entropy-guided attention mechanism and explores alternatives to layer normalization, aiming to enhance the efficiency of PI architectures while maintaining model performance.'}, 'zh': {'title': '优化私密推理架构的熵动态', 'desc': '本论文探讨了在加密数据上进行私密推理(PI)时,非线性操作对解码器语言模型的影响。我们提出了一种信息论框架,帮助优化适合PI需求的变换器架构。研究发现,非线性不仅确保了训练的稳定性,还对注意力头的多样性至关重要。为了解决熵崩溃和熵过载问题,我们提出了一种基于熵的注意力机制和新的熵正则化技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05040', 'title': 'SWE-Fixer: Training Open-Source LLMs for Effective and Efficient GitHub Issue Resolution', 'url': 'https://huggingface.co/papers/2501.05040', 'abstract': 'Large Language Models (LLMs) have demonstrated remarkable proficiency across a variety of complex tasks. One significant application of LLMs is in tackling software engineering challenges, particularly in resolving real-world tasks on GitHub by fixing code based on the issues reported by the users. However, many current approaches rely on proprietary LLMs, which limits reproducibility, accessibility, and transparency. The critical components of LLMs for addressing software engineering issues and how their capabilities can be effectively enhanced remain unclear. To address these challenges, we introduce SWE-Fixer, a novel open-source LLM designed to effectively and efficiently resolve GitHub issues. SWE-Fixer comprises two essential modules: a code file retrieval module and a code editing module. The retrieval module employs BM25 along with a lightweight LLM model to achieve coarse-to-fine file retrieval. Subsequently, the code editing module utilizes the other LLM model to generate patches for the identified files. Then, to mitigate the lack of publicly available datasets, we compile an extensive dataset that includes 110K GitHub issues along with their corresponding patches, and train the two modules of SWE-Fixer separately. We assess our approach on the SWE-Bench Lite and Verified benchmarks, achieving state-of-the-art performance among open-source models with scores of 23.3% and 30.2%, respectively. These outcomes highlight the efficacy of our approach. 
We will make our model, dataset, and code publicly available at https://github.com/InternLM/SWE-Fixer.', 'score': 8, 'issue_id': 1608, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '54d8f8a0fe5436c6', 'authors': ['Chengxing Xie', 'Bowen Li', 'Chang Gao', 'He Du', 'Wai Lam', 'Difan Zou', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong', 'The University of Hong Kong', 'Xidian University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05040.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#architecture', '#benchmark', '#training', '#science'], 'emoji': '🛠️', 'ru': {'title': 'Открытая языковая модель для эффективного решения проблем на GitHub', 'desc': 'SWE-Fixer - это новая модель с открытым исходным кодом для решения проблем на GitHub. Она состоит из модуля поиска файлов кода и модуля редактирования кода, использующих легковесные языковые модели. Авторы создали обширный датасет из 110 тысяч GitHub-issues с патчами для обучения модели. SWE-Fixer достигла лучших результатов среди моделей с открытым кодом на бенчмарках SWE-Bench Lite и Verified.'}, 'en': {'title': 'SWE-Fixer: Open-Source Solutions for GitHub Issues', 'desc': 'This paper presents SWE-Fixer, an open-source Large Language Model (LLM) specifically designed to address software engineering challenges on GitHub. It features two main components: a code file retrieval module that uses BM25 and a lightweight LLM for efficient file identification, and a code editing module that generates code patches using another LLM. The authors also created a comprehensive dataset of 110,000 GitHub issues and their corresponding patches to train the model effectively. SWE-Fixer achieves state-of-the-art performance on benchmark tests, demonstrating its potential to enhance accessibility and transparency in software engineering solutions.'}, 'zh': {'title': '开源LLM助力软件工程问题解决', 'desc': '大型语言模型(LLMs)在处理复杂任务方面表现出色,尤其是在软件工程领域。本文介绍了一种新颖的开源LLM,名为SWE-Fixer,旨在有效解决GitHub上的问题。SWE-Fixer包含两个主要模块:代码文件检索模块和代码编辑模块,前者使用BM25和轻量级LLM进行文件检索,后者生成代码补丁。通过构建包含11万个GitHub问题及其补丁的数据集,SWE-Fixer在开源模型中实现了领先的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04377', 'title': 'On Computational Limits and Provably Efficient Criteria of Visual Autoregressive Models: A Fine-Grained Complexity Analysis', 'url': 'https://huggingface.co/papers/2501.04377', 'abstract': 'Recently, Visual Autoregressive (VAR) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine "next-scale prediction" paradigm. However, the state-of-the-art algorithm of VAR models in [Tian, Jiang, Yuan, Peng and Wang, NeurIPS 2024] takes O(n^4) time, which is computationally inefficient. In this work, we analyze the computational limits and efficiency criteria of VAR Models through a fine-grained complexity lens. Our key contribution is identifying the conditions under which VAR computations can achieve sub-quadratic time complexity. Specifically, we establish a critical threshold for the norm of input matrices used in VAR attention mechanisms. Above this threshold, assuming the Strong Exponential Time Hypothesis (SETH) from fine-grained complexity theory, a sub-quartic time algorithm for VAR models is impossible. To substantiate our theoretical findings, we present efficient constructions leveraging low-rank approximations that align with the derived criteria. 
This work initiates the study of the computational efficiency of the VAR model from a theoretical perspective. Our technique will shed light on advancing scalable and efficient image generation in VAR frameworks.', 'score': 8, 'issue_id': 1597, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'be8a0f20db676680', 'authors': ['Yekun Ke', 'Xiaoyu Li', 'Yingyu Liang', 'Zhizhou Sha', 'Zhenmei Shi', 'Zhao Song'], 'affiliations': ['The Simons Institute for the Theory of Computing at UC Berkeley', 'The University of Hong Kong', 'Tsinghua University', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.04377.jpg', 'data': {'categories': ['#math', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Преодоление вычислительных барьеров в VAR моделях', 'desc': 'Статья исследует вычислительные ограничения и критерии эффективности Визуальных Авторегрессионных (VAR) моделей с точки зрения тонкой теории сложности. Авторы определяют условия, при которых вычисления VAR могут достичь субквадратичной временной сложности. Они устанавливают критический порог для нормы входных матриц, используемых в механизмах внимания VAR, выше которого невозможен субкварцевый алгоритм времени для моделей VAR. Представлены эффективные конструкции, использующие аппроксимации низкого ранга, которые соответствуют выведенным критериям.'}, 'en': {'title': 'Unlocking Efficiency in Image Generation with VAR Models', 'desc': 'This paper explores the computational efficiency of Visual Autoregressive (VAR) Models, which are used for generating images. The authors identify that the current state-of-the-art VAR algorithm is computationally expensive, operating in O(n^4) time complexity. They establish conditions under which VAR computations can be optimized to achieve sub-quadratic time complexity, particularly focusing on the input matrix norms in the attention mechanisms. By applying low-rank approximations, the authors provide practical constructions that meet their theoretical criteria, paving the way for more efficient image generation techniques in VAR frameworks.'}, 'zh': {'title': '提升VAR模型的计算效率', 'desc': '最近,视觉自回归(VAR)模型在图像生成领域取得了突破性进展,采用粗到细的“下一个尺度预测”范式。然而,VAR模型的最新算法在计算上效率低下,时间复杂度为O(n^4)。本文通过细粒度复杂性分析,探讨了VAR模型的计算限制和效率标准。我们确定了VAR计算可以实现亚二次时间复杂度的条件,并提出了利用低秩近似的高效构造,以支持我们的理论发现。'}}}, {'id': 'https://huggingface.co/papers/2501.04828', 'title': 'Building Foundations for Natural Language Processing of Historical Turkish: Resources and Models', 'url': 'https://huggingface.co/papers/2501.04828', 'abstract': 'This paper introduces foundational resources and models for natural language processing (NLP) of historical Turkish, a domain that has remained underexplored in computational linguistics. We present the first named entity recognition (NER) dataset, HisTR and the first Universal Dependencies treebank, OTA-BOUN for a historical form of the Turkish language along with transformer-based models trained using these datasets for named entity recognition, dependency parsing, and part-of-speech tagging tasks. Additionally, we introduce Ottoman Text Corpus (OTC), a clean corpus of transliterated historical Turkish texts that spans a wide range of historical periods. Our experimental results show significant improvements in the computational analysis of historical Turkish, achieving promising results in tasks that require understanding of historical linguistic structures. 
They also highlight existing challenges, such as domain adaptation and language variations across time periods. All of the presented resources and models are made available at https://huggingface.co/bucolin to serve as a benchmark for future progress in historical Turkish NLP.', 'score': 6, 'issue_id': 1603, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '40fe69c40d907fc4', 'authors': ['Şaziye Betül Özateş', 'Tarık Emre Tıraş', 'Ece Elif Adak', 'Berat Doğan', 'Fatih Burak Karagöz', 'Efe Eren Genç', 'Esma F. Bilgin Taşdemir'], 'affiliations': ['Bogaziçi University', 'Medeniyet University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04828.jpg', 'data': {'categories': ['#dataset', '#data', '#low_resource', '#science', '#multilingual', '#benchmark'], 'emoji': '🏛️', 'ru': {'title': 'Прорыв в NLP для исторического турецкого языка', 'desc': 'Статья представляет первые ресурсы и модели для обработки естественного языка (NLP) исторического турецкого языка. Авторы создали первый датасет для распознавания именованных сущностей (NER) HisTR и первый Universal Dependencies тривбанк OTA-BOUN для исторической формы турецкого языка. Также были разработаны трансформерные модели для задач NER, синтаксического анализа и морфологической разметки. Дополнительно представлен Османский текстовый корпус (OTC) - очищенный корпус транслитерированных исторических турецких текстов разных периодов.'}, 'en': {'title': 'Unlocking Historical Turkish: New Resources for NLP', 'desc': 'This paper provides essential resources and models for processing historical Turkish language using natural language processing (NLP) techniques. It introduces the first named entity recognition (NER) dataset, HisTR, and the first Universal Dependencies treebank, OTA-BOUN, specifically for historical Turkish. The authors also present the Ottoman Text Corpus (OTC), a comprehensive collection of transliterated texts from various historical periods. The results demonstrate advancements in analyzing historical Turkish, while also addressing challenges like domain adaptation and linguistic variations over time.'}, 'zh': {'title': '推动历史土耳其语NLP的进步', 'desc': '本文介绍了历史土耳其语自然语言处理(NLP)的基础资源和模型,这是一个在计算语言学中尚未深入研究的领域。我们首次发布了命名实体识别(NER)数据集HisTR和历史土耳其语的Universal Dependencies树库OTA-BOUN,并基于这些数据集训练了用于命名实体识别、依存句法分析和词性标注任务的变换器模型。此外,我们还推出了奥斯曼文本语料库(OTC),这是一个涵盖多个历史时期的清晰转写历史土耳其语文本的语料库。实验结果显示,在历史土耳其语的计算分析中取得了显著进展,但也突显了领域适应和语言随时间变化等挑战。'}}}];
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
@@ -1184,7 +1184,7 @@
function updateTimeDiffs() {
const timeDiff = document.getElementById('timeDiff');
- timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 12:37',lang=currentLang);
+ timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-18 18:26',lang=currentLang);
}
function updateSortingOptions() {
const sortingLabels = {
@@ -1238,14 +1238,14 @@
}
function hideNextLink(format) {
if (format === 'monthly') {
- if (isCurrentMonth('2025-01-18 12:37')) {
+ if (isCurrentMonth('2025-01-18 18:26')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';
}
}
} else {
- if (isToday('2025-01-18 12:37')) {
+ if (isToday('2025-01-18 18:26')) {
const element = document.getElementById('nav-next');
if (element) {
element.style.display = 'none';