From 8c398dc30deaa87b073fbda5f68fce05f280c4f1 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 29 Jan 2025 23:09:45 +0000 Subject: [PATCH] Auto. Make Doomgrad HF Review on 29 January --- d/2025-01-29.html | 8 +- d/2025-01-29.json | 8 +- hf_papers.json | 8 +- index.html | 8 +- log.txt | 6 +- logs/2025-01-29_last_log.txt | 178 +++++++++++++++++------------------ m/2025-01.html | 8 +- 7 files changed, 112 insertions(+), 112 deletions(-) diff --git a/d/2025-01-29.html b/d/2025-01-29.html index 1c305089d..d36e47aa2 100644 --- a/d/2025-01-29.html +++ b/d/2025-01-29.html @@ -881,7 +881,7 @@ } } - const articlesData = [{'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 27, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. 
It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 12, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. 
The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. Результаты показывают, что увеличение входного словаря consistently улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. 
Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. 
Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 7, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. 
This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. 
The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. 
It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. 
Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}]; + const articlesData = [{'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 28, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. 
Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 13, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. 
Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. 
Результаты показывают, что увеличение входного словаря consistently улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. 
Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 7, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. 
Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. 
В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. 
Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. 
We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. 
Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}]; const articlesContainer = document.getElementById('articles-container'); const sortDropdown = document.getElementById('sort-dropdown'); const categoryFiltersContainer = document.getElementById('category-filters'); @@ -1184,7 +1184,7 @@ function updateTimeDiffs() { const timeDiff = document.getElementById('timeDiff'); - timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 21:09',lang=currentLang); + timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 22:09',lang=currentLang); } function updateSortingOptions() { const sortingLabels = { @@ -1238,14 +1238,14 @@ } function hideNextLink(format) { if (format === 'monthly') { - if (isCurrentMonth('2025-01-29 21:09')) { + if (isCurrentMonth('2025-01-29 22:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none'; } } } else { - if (isToday('2025-01-29 21:09')) { + if (isToday('2025-01-29 22:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none'; diff --git a/d/2025-01-29.json b/d/2025-01-29.json index 3bcb6c045..7a53ba981 100644 --- a/d/2025-01-29.json +++ b/d/2025-01-29.json @@ -4,9 +4,9 @@ "en": "January 29", "zh": "1月29日" }, - "time_utc": "2025-01-29 21:09", + "time_utc": "2025-01-29 22:09", "weekday": 2, - "issue_id": 1935, + "issue_id": 1936, "home_page_url": "https://huggingface.co/papers", "papers": [ { @@ -14,7 +14,7 @@ "title": "SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training", "url": "https://huggingface.co/papers/2501.17161", "abstract": "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. 
These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", - "score": 27, + "score": 28, "issue_id": 1920, "pub_date": "2025-01-28", "pub_date_card": { @@ -70,7 +70,7 @@ "title": "Optimizing Large Language Model Training Using FP4 Quantization", "url": "https://huggingface.co/papers/2501.17116", "abstract": "The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.", - "score": 12, + "score": 13, "issue_id": 1920, "pub_date": "2025-01-28", "pub_date_card": { diff --git a/hf_papers.json b/hf_papers.json index 7a53ba981..86b937e9f 100644 --- a/hf_papers.json +++ b/hf_papers.json @@ -4,9 +4,9 @@ "en": "January 29", "zh": "1月29日" }, - "time_utc": "2025-01-29 22:09", + "time_utc": "2025-01-29 23:09", "weekday": 2, - "issue_id": 1936, + "issue_id": 1937, "home_page_url": "https://huggingface.co/papers", "papers": [ { @@ -218,7 +218,7 @@ "title": "Open Problems in Mechanistic Interpretability", "url": "https://huggingface.co/papers/2501.16496", "abstract": "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", - "score": 7, + "score": 8, "issue_id": 1920, "pub_date": "2025-01-27", "pub_date_card": { @@ -297,7 +297,7 @@ "title": "Low-Rank Adapters Meet Neural Architecture Search for LLM Compression", "url": "https://huggingface.co/papers/2501.16372", "abstract": "The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. 
Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.", - "score": 4, + "score": 5, "issue_id": 1918, "pub_date": "2025-01-23", "pub_date_card": { diff --git a/index.html b/index.html index d36e47aa2..b5910dbb7 100644 --- a/index.html +++ b/index.html @@ -881,7 +881,7 @@ } } - const articlesData = [{'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 28, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. 
Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 13, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. 
Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. 
Результаты показывают, что увеличение входного словаря стабильно улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. 
Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 7, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. 
Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. 
В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. 
Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. 
We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}]; + const articlesData = [{'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. 
Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrate the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 28, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. 
Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 13, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. 
Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. Результаты показывают, что увеличение входного словаря consistently улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. 
Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. 
This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 8, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. 
Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 5, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. 
Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. 
Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. 
Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}]; const articlesContainer = document.getElementById('articles-container'); const sortDropdown = document.getElementById('sort-dropdown'); const categoryFiltersContainer = document.getElementById('category-filters'); @@ -1184,7 +1184,7 @@ function updateTimeDiffs() { const timeDiff = document.getElementById('timeDiff'); - timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 22:09',lang=currentLang); + timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 23:09',lang=currentLang); } function updateSortingOptions() { const sortingLabels = { @@ -1238,14 +1238,14 @@ } function hideNextLink(format) { if (format === 'monthly') { - if (isCurrentMonth('2025-01-29 22:09')) { + if (isCurrentMonth('2025-01-29 23:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none'; } } } else { - if (isToday('2025-01-29 22:09')) { + if (isToday('2025-01-29 23:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none'; diff --git a/log.txt b/log.txt index 4c2df74bf..d2ef000d1 100644 --- a/log.txt +++ b/log.txt @@ -1,3 +1,3 @@ -[29.01.2025 22:09] Read previous papers. -[29.01.2025 22:09] Generating top page (month). -[29.01.2025 22:09] Writing top page (month). +[29.01.2025 23:09] Read previous papers. +[29.01.2025 23:09] Generating top page (month). +[29.01.2025 23:09] Writing top page (month). diff --git a/logs/2025-01-29_last_log.txt b/logs/2025-01-29_last_log.txt index 583f0fd9e..8948f1edc 100644 --- a/logs/2025-01-29_last_log.txt +++ b/logs/2025-01-29_last_log.txt @@ -1,90 +1,90 @@ -[29.01.2025 21:09] Read previous papers. -[29.01.2025 21:09] Generating top page (month). -[29.01.2025 21:09] Writing top page (month). [29.01.2025 22:09] Read previous papers. -[29.01.2025 22:09] Get feed. -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17161 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17116 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16975 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16764 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16496 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16372 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.15747 -[29.01.2025 22:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17117 -[29.01.2025 22:09] Obtaining deleted papers (sometimes HF Daily Papers move some articles from today to past days). -[29.01.2025 22:09] No deleted papers detected. -[29.01.2025 22:09] Downloading and parsing papers (pdf, html). Total: 8. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.17161. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.17161.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.17161.json), skip HTML parsing. 
-[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.17116. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.17116.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.17116.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.16975. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.16975.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.16975.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.16764. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.16764.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.16764.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.16496. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.16496.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.16496.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.16372. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.16372.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.16372.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.15747. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.15747.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.15747.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Downloading and parsing paper https://huggingface.co/papers/2501.17117. -[29.01.2025 22:09] Extra JSON file exists (./assets/json/2501.17117.json), skip PDF parsing. -[29.01.2025 22:09] Paper image links file exists (./assets/img_data/2501.17117.json), skip HTML parsing. -[29.01.2025 22:09] Success. -[29.01.2025 22:09] Enriching papers with extra data. -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 0. Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, fo... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 1. The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a ... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 2. Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. 
In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling perf... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 3. Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-im... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 4. Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 5. The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. Thi... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 6. Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark d... -[29.01.2025 22:09] ******************************************************************************** -[29.01.2025 22:09] Abstract 7. Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant p... -[29.01.2025 22:09] Read previous papers. -[29.01.2025 22:09] Generating reviews via LLM API. -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#reasoning", "#training", "#optimization", "#rl", "#multimodal", "#games"], "emoji": "🧠", "ru": {"title": "RL превосходит SFT в обобщении для мультимодальных задач", "desc": "Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкр -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#optimization", "#training", "#inference"], "emoji": "🔢", "ru": {"title": "FP4: Революция в эффективности обучения языковых моделей", "desc": "Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). 
Авт -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#optimization", "#training", "#architecture"], "emoji": "🔤", "ru": {"title": "Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей", "desc": "Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transfo -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#diffusion", "#optimization", "#training", "#dataset", "#3d"], "emoji": "🎨", "ru": {"title": "DiffSplat: Генерация 3D контента на новом уровне", "desc": "DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#interpretability", "#survey"], "emoji": "🧠", "ru": {"title": "Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта", "desc": "Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#inference", "#optimization", "#open_source", "#training", "#low_resource", "#architecture"], "emoji": "🧠", "ru": {"title": "Эффективная настройка крупных языковых моделей для ограниченных ресурсов", "desc": "Эта статья рассматривает проблему больших вычислительных ресурсов, необход -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#reasoning", "#low_resource", "#multilingual", "#benchmark"], "emoji": "🇮🇳", "ru": {"title": "Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков", "desc": "IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает -[29.01.2025 22:09] Using data from previous issue: {"categories": ["#dataset", "#multilingual", "#alignment", "#ethics"], "emoji": "🇫🇷", "ru": {"title": "Французский датасет для морального выравнивания языковых моделей", "desc": "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человечески -[29.01.2025 22:09] Loading Chinese text from previous data. -[29.01.2025 22:09] Renaming data file. -[29.01.2025 22:09] Renaming previous data. hf_papers.json to ./d/2025-01-29.json -[29.01.2025 22:09] Saving new data file. -[29.01.2025 22:09] Generating page. -[29.01.2025 22:09] Renaming previous page. -[29.01.2025 22:09] Renaming previous data. index.html to ./d/2025-01-29.html -[29.01.2025 22:09] [Experimental] Generating Chinese page for reading. -[29.01.2025 22:09] Chinese vocab [{'word': '监督', 'pinyin': 'jiàn dū', 'trans': 'supervised'}, {'word': '微调', 'pinyin': 'wēi tiáo', 'trans': 'fine-tuning'}, {'word': '强化学习', 'pinyin': 'qiáng huà xué xí', 'trans': 'reinforcement learning'}, {'word': '基础模型', 'pinyin': 'jī chǔ mó xíng', 'trans': 'foundational model'}, {'word': '作用', 'pinyin': 'zuò yòng', 'trans': 'effect'}, {'word': '泛化', 'pinyin': 'fàn huà', 'trans': 'generalization'}, {'word': '倾向于', 'pinyin': 'qīng xiàng yú', 'trans': 'tend to'}, {'word': '未见过', 'pinyin': 'wèi jiàn guò', 'trans': 'unseen'}, {'word': '变体', 'pinyin': 'biàn tǐ', 'trans': 'variant'}, {'word': '视觉识别', 'pinyin': 'shì jué shí bié', 'trans': 'visual recognition'}, {'word': '不可或缺', 'pinyin': 'bù kě huò quē', 'trans': 'indispensable'}] -[29.01.2025 22:09] Renaming previous Chinese page. -[29.01.2025 22:09] Renaming previous data. zh.html to ./d/2025-01-28_zh_reading_task.html -[29.01.2025 22:09] Writing Chinese reading task. 
-[29.01.2025 22:09] Writing result. -[29.01.2025 22:09] Renaming log file. -[29.01.2025 22:09] Renaming previous data. log.txt to ./logs/2025-01-29_last_log.txt +[29.01.2025 22:09] Generating top page (month). +[29.01.2025 22:09] Writing top page (month). +[29.01.2025 23:09] Read previous papers. +[29.01.2025 23:09] Get feed. +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17161 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17116 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16975 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16764 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16496 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.16372 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.15747 +[29.01.2025 23:09] Get page data from previous paper. URL: https://huggingface.co/papers/2501.17117 +[29.01.2025 23:09] Obtaining deleted papers (sometimes HF Daily Papers move some articles from today to past days). +[29.01.2025 23:09] No deleted papers detected. +[29.01.2025 23:09] Downloading and parsing papers (pdf, html). Total: 8. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.17161. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.17161.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.17161.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.17116. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.17116.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.17116.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.16975. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.16975.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.16975.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.16764. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.16764.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.16764.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.16496. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.16496.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.16496.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.16372. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.16372.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.16372.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.15747. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.15747.json), skip PDF parsing. 
+[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.15747.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Downloading and parsing paper https://huggingface.co/papers/2501.17117. +[29.01.2025 23:09] Extra JSON file exists (./assets/json/2501.17117.json), skip PDF parsing. +[29.01.2025 23:09] Paper image links file exists (./assets/img_data/2501.17117.json), skip HTML parsing. +[29.01.2025 23:09] Success. +[29.01.2025 23:09] Enriching papers with extra data. +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 0. Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, fo... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 1. The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a ... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 2. Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling perf... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 3. Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-im... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 4. Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 5. The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. Thi... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 6. Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. 
IndicMMLU-Pro is a comprehensive benchmark d... +[29.01.2025 23:09] ******************************************************************************** +[29.01.2025 23:09] Abstract 7. Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant p... +[29.01.2025 23:09] Read previous papers. +[29.01.2025 23:09] Generating reviews via LLM API. +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#reasoning", "#training", "#optimization", "#rl", "#multimodal", "#games"], "emoji": "🧠", "ru": {"title": "RL превосходит SFT в обобщении для мультимодальных задач", "desc": "Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкр +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#optimization", "#training", "#inference"], "emoji": "🔢", "ru": {"title": "FP4: Революция в эффективности обучения языковых моделей", "desc": "Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авт +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#optimization", "#training", "#architecture"], "emoji": "🔤", "ru": {"title": "Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей", "desc": "Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transfo +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#diffusion", "#optimization", "#training", "#dataset", "#3d"], "emoji": "🎨", "ru": {"title": "DiffSplat: Генерация 3D контента на новом уровне", "desc": "DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#interpretability", "#survey"], "emoji": "🧠", "ru": {"title": "Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта", "desc": "Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#inference", "#optimization", "#open_source", "#training", "#low_resource", "#architecture"], "emoji": "🧠", "ru": {"title": "Эффективная настройка крупных языковых моделей для ограниченных ресурсов", "desc": "Эта статья рассматривает проблему больших вычислительных ресурсов, необход +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#reasoning", "#low_resource", "#multilingual", "#benchmark"], "emoji": "🇮🇳", "ru": {"title": "Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков", "desc": "IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает +[29.01.2025 23:09] Using data from previous issue: {"categories": ["#dataset", "#multilingual", "#alignment", "#ethics"], "emoji": "🇫🇷", "ru": {"title": "Французский датасет для морального выравнивания языковых моделей", "desc": "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человечески +[29.01.2025 23:09] Loading Chinese text from previous data. +[29.01.2025 23:09] Renaming data file. +[29.01.2025 23:09] Renaming previous data. 
hf_papers.json to ./d/2025-01-29.json +[29.01.2025 23:09] Saving new data file. +[29.01.2025 23:09] Generating page. +[29.01.2025 23:09] Renaming previous page. +[29.01.2025 23:09] Renaming previous data. index.html to ./d/2025-01-29.html +[29.01.2025 23:09] [Experimental] Generating Chinese page for reading. +[29.01.2025 23:09] Chinese vocab [{'word': '监督', 'pinyin': 'jiàn dū', 'trans': 'supervised'}, {'word': '微调', 'pinyin': 'wēi tiáo', 'trans': 'fine-tuning'}, {'word': '强化学习', 'pinyin': 'qiáng huà xué xí', 'trans': 'reinforcement learning'}, {'word': '基础模型', 'pinyin': 'jī chǔ mó xíng', 'trans': 'foundational model'}, {'word': '作用', 'pinyin': 'zuò yòng', 'trans': 'effect'}, {'word': '泛化', 'pinyin': 'fàn huà', 'trans': 'generalization'}, {'word': '倾向于', 'pinyin': 'qīng xiàng yú', 'trans': 'tend to'}, {'word': '未见过', 'pinyin': 'wèi jiàn guò', 'trans': 'unseen'}, {'word': '变体', 'pinyin': 'biàn tǐ', 'trans': 'variant'}, {'word': '视觉识别', 'pinyin': 'shì jué shí bié', 'trans': 'visual recognition'}, {'word': '不可或缺', 'pinyin': 'bù kě huò quē', 'trans': 'indispensable'}] +[29.01.2025 23:09] Renaming previous Chinese page. +[29.01.2025 23:09] Renaming previous data. zh.html to ./d/2025-01-28_zh_reading_task.html +[29.01.2025 23:09] Writing Chinese reading task. +[29.01.2025 23:09] Writing result. +[29.01.2025 23:09] Renaming log file. +[29.01.2025 23:09] Renaming previous data. log.txt to ./logs/2025-01-29_last_log.txt diff --git a/m/2025-01.html b/m/2025-01.html index 0773b9f17..2b05241d7 100644 --- a/m/2025-01.html +++ b/m/2025-01.html @@ -881,7 +881,7 @@ } } - const articlesData = [{'id': 'https://huggingface.co/papers/2412.18525', 'title': 'Explanatory Instructions: Towards Unified Vision Tasks Understanding and Zero-shot Generalization', 'url': 'https://huggingface.co/papers/2412.18525', 'abstract': "Computer Vision (CV) has yet to fully achieve the zero-shot task generalization observed in Natural Language Processing (NLP), despite following many of the milestones established in NLP, such as large transformer models, extensive pre-training, and the auto-regression paradigm, among others. In this paper, we explore the idea that CV adopts discrete and terminological task definitions (\\eg, ``image segmentation''), which may be a key barrier to zero-shot task generalization. Our hypothesis is that without truly understanding previously-seen tasks--due to these terminological definitions--deep models struggle to generalize to novel tasks. To verify this, we introduce Explanatory Instructions, which provide an intuitive way to define CV task objectives through detailed linguistic transformations from input images to outputs. We create a large-scale dataset comprising 12 million ``image input to explanatory instruction to output'' triplets, and train an auto-regressive-based vision-language model (AR-based VLM) that takes both images and explanatory instructions as input. By learning to follow these instructions, the AR-based VLM achieves instruction-level zero-shot capabilities for previously-seen tasks and demonstrates strong zero-shot generalization for unseen CV tasks. 
Code and dataset will be openly available on our GitHub repository.", 'score': 48, 'issue_id': 1406, 'pub_date': '2024-12-24', 'pub_date_card': {'ru': '24 декабря', 'en': 'December 24', 'zh': '12月24日'}, 'hash': '23f11aceae00534d', 'authors': ['Yang Shen', 'Xiu-Shen Wei', 'Yifan Sun', 'Yuxin Song', 'Tao Yuan', 'Jian Jin', 'Heyang Xu', 'Yazhou Yao', 'Errui Ding'], 'affiliations': ['Baidu', 'Nanjing University of Science and Technology', 'Southeast University'], 'pdf_title_img': 'assets/pdf/title_img/2412.18525.jpg', 'data': {'categories': ['#dataset', '#open_source', '#cv', '#multimodal', '#transfer_learning'], 'emoji': '🔬', 'ru': {'title': 'Лингвистические инструкции - ключ к обобщению в компьютерном зрении', 'desc': "В статье исследуется проблема недостаточной способности моделей компьютерного зрения к обобщению на новые задачи без предварительного обучения. Авторы предлагают использовать подробные лингвистические инструкции для определения задач вместо дискретных терминологических определений. Они создали большой датасет из 12 миллионов примеров 'изображение-инструкция-результат' и обучили авторегрессионную мультимодальную модель следовать этим инструкциям. Эксперименты показали, что такой подход позволяет модели лучше обобщаться на новые задачи компьютерного зрения без дополнительного обучения."}, 'en': {'title': 'Unlocking Zero-Shot Generalization in Computer Vision with Explanatory Instructions', 'desc': "This paper addresses the challenge of zero-shot task generalization in Computer Vision (CV), which has not reached the levels seen in Natural Language Processing (NLP). The authors argue that the use of specific terminological definitions for tasks in CV, like 'image segmentation', limits the models' ability to generalize to new tasks. To overcome this, they propose 'Explanatory Instructions' that transform image inputs into detailed linguistic outputs, helping models understand tasks better. They introduce a large dataset of 12 million triplets and train an auto-regressive vision-language model that successfully demonstrates zero-shot capabilities for both seen and unseen tasks."}, 'zh': {'title': '突破计算机视觉的零样本任务泛化', 'desc': '本文探讨了计算机视觉(CV)在零样本任务泛化方面的挑战,尤其是与自然语言处理(NLP)的对比。我们认为,CV使用的术语性任务定义(如“图像分割”)可能是阻碍零样本任务泛化的关键因素。为了解决这个问题,我们引入了“解释性指令”,通过详细的语言转换来直观地定义CV任务目标。我们创建了一个包含1200万对“图像输入、解释性指令和输出”的大规模数据集,并训练了一个基于自回归的视觉语言模型,实现了对已见任务的指令级零样本能力,并在未见的CV任务上展示了强大的零样本泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2412.20070', 'title': 'On the Compositional Generalization of Multimodal LLMs for Medical Imaging', 'url': 'https://huggingface.co/papers/2412.20070', 'abstract': 'Multimodal large language models (MLLMs) hold significant potential in the medical field, but their capabilities are often limited by insufficient data in certain medical domains, highlighting the need for understanding what kinds of images can be used by MLLMs for generalization. Current research suggests that multi-task training outperforms single-task as different tasks can benefit each other, but they often overlook the internal relationships within these tasks, providing limited guidance on selecting datasets to enhance specific tasks. To analyze this phenomenon, we attempted to employ compositional generalization (CG)-the ability of models to understand novel combinations by recombining learned elements-as a guiding framework. Since medical images can be precisely defined by Modality, Anatomical area, and Task, naturally providing an environment for exploring CG. 
Therefore, we assembled 106 medical datasets to create Med-MAT for comprehensive experiments. The experiments confirmed that MLLMs can use CG to understand unseen medical images and identified CG as one of the main drivers of the generalization observed in multi-task training. Additionally, further studies demonstrated that CG effectively supports datasets with limited data and delivers consistent performance across different backbones, highlighting its versatility and broad applicability. Med-MAT is publicly available at https://github.com/FreedomIntelligence/Med-MAT.', 'score': 36, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': '34f9c6ec4611d6ec', 'authors': ['Zhenyang Cai', 'Junying Chen', 'Rongsheng Wang', 'Weihong Wang', 'Yonglin Deng', 'Dingjie Song', 'Yize Chen', 'Zixu Zhang', 'Benyou Wang'], 'affiliations': ['The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2412.20070.jpg', 'data': {'categories': ['#dataset', '#healthcare', '#open_source', '#multimodal', '#transfer_learning'], 'emoji': '🩺', 'ru': {'title': 'Композиционная генерализация - ключ к пониманию медицинских изображений для MLLM', 'desc': 'Статья исследует возможности мультимодальных больших языковых моделей (MLLM) в медицинской сфере, фокусируясь на композиционной генерализации (CG). Авторы создали набор данных Med-MAT из 106 медицинских датасетов для изучения способности моделей понимать новые комбинации изображений. Эксперименты показали, что MLLM могут использовать CG для интерпретации ранее невиданных медицинских изображений. Исследование также выявило эффективность CG для датасетов с ограниченными данными и стабильность результатов на разных архитектурах моделей.'}, 'en': {'title': 'Unlocking Medical Insights with Compositional Generalization', 'desc': "This paper explores the use of multimodal large language models (MLLMs) in the medical field, focusing on how they can generalize from limited data. It highlights the advantages of multi-task training over single-task training, emphasizing the importance of understanding the relationships between different tasks. The authors introduce compositional generalization (CG) as a framework to enhance the model's ability to interpret new combinations of medical images. They created a dataset called Med-MAT, which consists of 106 medical datasets, and found that CG significantly improves the performance of MLLMs, especially in scenarios with scarce data."}, 'zh': {'title': '组合泛化助力医学图像理解', 'desc': '多模态大型语言模型(MLLMs)在医学领域具有重要潜力,但在某些医学领域的数据不足限制了其能力。当前研究表明,多任务训练优于单任务训练,因为不同任务可以相互促进,但往往忽视了这些任务之间的内部关系。我们采用组合泛化(CG)作为指导框架,分析模型如何理解新组合的能力,并组建了106个医学数据集以创建Med-MAT进行全面实验。实验结果确认,MLLMs能够利用CG理解未见过的医学图像,并且CG是多任务训练中观察到的泛化的主要驱动因素之一。'}}}, {'id': 'https://huggingface.co/papers/2412.20422', 'title': 'Bringing Objects to Life: 4D generation from 3D objects', 'url': 'https://huggingface.co/papers/2412.20422', 'abstract': 'Recent advancements in generative modeling now enable the creation of 4D content (moving 3D objects) controlled with text prompts. 4D generation has large potential in applications like virtual worlds, media, and gaming, but existing methods provide limited control over the appearance and geometry of generated content. In this work, we introduce a method for animating user-provided 3D objects by conditioning on textual prompts to guide 4D generation, enabling custom animations while maintaining the identity of the original object. 
We first convert a 3D mesh into a ``static" 4D Neural Radiance Field (NeRF) that preserves the visual attributes of the input object. Then, we animate the object using an Image-to-Video diffusion model driven by text. To improve motion realism, we introduce an incremental viewpoint selection protocol for sampling perspectives to promote lifelike movement and a masked Score Distillation Sampling (SDS) loss, which leverages attention maps to focus optimization on relevant regions. We evaluate our model in terms of temporal coherence, prompt adherence, and visual fidelity and find that our method outperforms baselines that are based on other approaches, achieving up to threefold improvements in identity preservation measured using LPIPS scores, and effectively balancing visual quality with dynamic content.', 'score': 29, 'issue_id': 1408, 'pub_date': '2024-12-29', 'pub_date_card': {'ru': '29 декабря', 'en': 'December 29', 'zh': '12月29日'}, 'hash': 'de742e56a5ec379f', 'authors': ['Ohad Rahamim', 'Ori Malca', 'Dvir Samuel', 'Gal Chechik'], 'affiliations': ['Bar-Ilan University', 'NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2412.20422.jpg', 'data': {'categories': ['#optimization', '#multimodal', '#games', '#diffusion', '#video', '#3d'], 'emoji': '🎭', 'ru': {'title': 'Оживление 3D-объектов с помощью текста: новый рубеж в генеративном моделировании', 'desc': 'Статья представляет новый метод анимации 3D-объектов с помощью текстовых подсказок. Авторы используют генеративную модель для создания 4D-контента (движущихся 3D-объектов), сохраняя при этом исходный вид объекта. Метод включает преобразование 3D-меша в статическое 4D нейронное радиальное поле (NeRF) и последующую анимацию с помощью диффузионной модели Image-to-Video. Для улучшения реалистичности движения введены протокол выбора ракурсов и маскированная функция потерь Score Distillation Sampling.'}, 'en': {'title': 'Animating 3D Objects with Text Prompts for Realistic 4D Generation', 'desc': "This paper presents a novel approach to generating 4D content by animating 3D objects based on text prompts. The method involves converting a 3D mesh into a static 4D Neural Radiance Field (NeRF) to retain the object's visual characteristics. It then utilizes an Image-to-Video diffusion model to create animations while ensuring the original object's identity is preserved. The authors enhance motion realism through a viewpoint selection protocol and a masked Score Distillation Sampling loss, leading to significant improvements in visual quality and dynamic content generation."}, 'zh': {'title': '文本驱动的4D动画生成新方法', 'desc': '本研究提出了一种新方法,可以通过文本提示来控制4D内容的生成,特别是动画用户提供的3D对象。我们首先将3D网格转换为静态的4D神经辐射场(NeRF),以保留输入对象的视觉特征。然后,利用图像到视频的扩散模型进行动画制作,确保生成的动画与文本提示相符。通过引入增量视角选择协议和掩码评分蒸馏损失,我们提高了运动的真实感,并在多个评估指标上超越了现有方法。'}}}, {'id': 'https://huggingface.co/papers/2412.20993', 'title': 'Efficiently Serving LLM Reasoning Programs with Certaindex', 'url': 'https://huggingface.co/papers/2412.20993', 'abstract': 'The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems fail to adapt to the scaling behaviors of these algorithms or the varying difficulty of queries, leading to inefficient resource use and unmet latency targets. 
We present Dynasor, a system that optimizes inference-time compute for LLM reasoning queries. Unlike traditional engines, Dynasor tracks and schedules requests within reasoning queries and uses Certaindex, a proxy that measures statistical reasoning progress based on model certainty, to guide compute allocation dynamically. Dynasor co-adapts scheduling with reasoning progress: it allocates more compute to hard queries, reduces compute for simpler ones, and terminates unpromising queries early, balancing accuracy, latency, and cost. On diverse datasets and algorithms, Dynasor reduces compute by up to 50% in batch processing and sustaining 3.3x higher query rates or 4.7x tighter latency SLOs in online serving.', 'score': 24, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '7fe76ed90463d977', 'authors': ['Yichao Fu', 'Junda Chen', 'Siqi Zhu', 'Zheyu Fu', 'Zhongdongming Dai', 'Aurick Qiao', 'Hao Zhang'], 'affiliations': ['Snowflake', 'Tsinghua University', 'UC San Diego'], 'pdf_title_img': 'assets/pdf/title_img/2412.20993.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Dynasor: умное распределение ресурсов для эффективных LLM-рассуждений', 'desc': 'Статья представляет систему Dynasor, оптимизирующую вычисления для задач рассуждения с использованием больших языковых моделей (LLM). Dynasor отслеживает и планирует запросы, используя прокси Certaindex для измерения прогресса рассуждений на основе уверенности модели. Система динамически распределяет вычислительные ресурсы, уделяя больше внимания сложным запросам и меньше простым, а также прекращая бесперспективные запросы. Dynasor показывает значительное снижение вычислительных затрат и улучшение производительности на различных наборах данных и алгоритмах.'}, 'en': {'title': 'Dynasor: Smart Compute Allocation for Efficient LLM Reasoning', 'desc': "This paper introduces Dynasor, a system designed to optimize the compute resources used during inference for large language models (LLMs) when handling reasoning queries. It addresses the inefficiencies of existing serving systems that do not adapt to the complexity of different queries or the scaling needs of inference-time reasoning algorithms. Dynasor employs a dynamic scheduling approach that allocates compute resources based on the difficulty of the query, using a proxy called Certaindex to measure the model's certainty in its reasoning. As a result, Dynasor can significantly reduce compute usage while improving query processing rates and meeting latency targets more effectively."}, 'zh': {'title': 'Dynasor:优化推理查询的计算效率', 'desc': '这篇论文介绍了Dynasor系统,它优化了大型语言模型(LLM)在推理查询时的计算效率。Dynasor通过跟踪和调度推理查询中的请求,动态分配计算资源,以应对不同难度的查询。该系统使用Certaindex代理,根据模型的确定性来衡量推理进展,从而指导计算分配。通过在多种数据集和算法上测试,Dynasor在批处理时减少了多达50%的计算需求,同时在在线服务中实现了3.3倍更高的查询速率或4.7倍更严格的延迟服务水平目标。'}}}, {'id': 'https://huggingface.co/papers/2412.21037', 'title': 'TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization', 'url': 'https://huggingface.co/papers/2412.21037', 'abstract': 'We introduce TangoFlux, an efficient Text-to-Audio (TTA) generative model with 515M parameters, capable of generating up to 30 seconds of 44.1kHz audio in just 3.7 seconds on a single A40 GPU. 
A key challenge in aligning TTA models lies in the difficulty of creating preference pairs, as TTA lacks structured mechanisms like verifiable rewards or gold-standard answers available for Large Language Models (LLMs). To address this, we propose CLAP-Ranked Preference Optimization (CRPO), a novel framework that iteratively generates and optimizes preference data to enhance TTA alignment. We demonstrate that the audio preference dataset generated using CRPO outperforms existing alternatives. With this framework, TangoFlux achieves state-of-the-art performance across both objective and subjective benchmarks. We open source all code and models to support further research in TTA generation.', 'score': 19, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'bb669623871df661', 'authors': ['Chia-Yu Hung', 'Navonil Majumder', 'Zhifeng Kong', 'Ambuj Mehrish', 'Rafael Valle', 'Bryan Catanzaro', 'Soujanya Poria'], 'affiliations': ['NVIDIA', 'Singapore University of Technology and Design (SUTD)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21037.jpg', 'data': {'categories': ['#dataset', '#audio', '#open_source', '#benchmark', '#alignment', '#rlhf', '#small_models'], 'emoji': '🎵', 'ru': {'title': 'TangoFlux: Революция в генерации аудио из текста', 'desc': 'TangoFlux - это эффективная генеративная модель для преобразования текста в аудио (Text-to-Audio, TTA) с 515 миллионами параметров. Модель способна генерировать до 30 секунд аудио с частотой 44,1 кГц всего за 3,7 секунды на одном GPU A40. Авторы представляют новую методику CLAP-Ranked Preference Optimization (CRPO) для улучшения согласованности TTA моделей путем итеративной генерации и оптимизации данных о предпочтениях. TangoFlux достигает передовых результатов в объективных и субъективных тестах, а код и модели открыты для дальнейших исследований.'}, 'en': {'title': 'TangoFlux: Revolutionizing Text-to-Audio Generation with CRPO', 'desc': "TangoFlux is a powerful Text-to-Audio generative model that can create high-quality audio quickly and efficiently. It addresses the challenge of aligning TTA models by introducing a new method called CLAP-Ranked Preference Optimization (CRPO), which helps generate and optimize preference data. This approach improves the model's ability to understand and produce audio that aligns with user preferences. The results show that TangoFlux not only meets but exceeds current standards in both objective and subjective evaluations, and the team has made their code and models available for further research."}, 'zh': {'title': 'TangoFlux:高效的文本到音频生成模型', 'desc': '我们介绍了TangoFlux,这是一种高效的文本到音频生成模型,拥有5.15亿个参数,能够在单个A40 GPU上以3.7秒的速度生成最长30秒的44.1kHz音频。TTA模型对齐的一个主要挑战是创建偏好对的困难,因为TTA缺乏像大型语言模型(LLMs)那样的可验证奖励或标准答案的结构化机制。为了解决这个问题,我们提出了CLAP-Ranked Preference Optimization(CRPO),这是一个新颖的框架,通过迭代生成和优化偏好数据来增强TTA的对齐。我们证明了使用CRPO生成的音频偏好数据集在现有替代方案中表现更优,TangoFlux在客观和主观基准测试中都达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.21079', 'title': 'Edicho: Consistent Image Editing in the Wild', 'url': 'https://huggingface.co/papers/2412.21079', 'abstract': 'As a verified need, consistent editing across in-the-wild images remains a technical challenge arising from various unmanageable factors, like object poses, lighting conditions, and photography environments. Edicho steps in with a training-free solution based on diffusion models, featuring a fundamental design principle of using explicit image correspondence to direct editing. 
Specifically, the key components include an attention manipulation module and a carefully refined classifier-free guidance (CFG) denoising strategy, both of which take into account the pre-estimated correspondence. Such an inference-time algorithm enjoys a plug-and-play nature and is compatible to most diffusion-based editing methods, such as ControlNet and BrushNet. Extensive results demonstrate the efficacy of Edicho in consistent cross-image editing under diverse settings. We will release the code to facilitate future studies.', 'score': 17, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '8068418a331b2086', 'authors': ['Qingyan Bai', 'Hao Ouyang', 'Yinghao Xu', 'Qiuyu Wang', 'Ceyuan Yang', 'Ka Leong Cheng', 'Yujun Shen', 'Qifeng Chen'], 'affiliations': ['Ant Group', 'CUHK', 'HKUST', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21079.jpg', 'data': {'categories': ['#cv', '#diffusion', '#open_source', '#inference'], 'emoji': '🖼️', 'ru': {'title': 'Edicho: согласованное редактирование изображений без обучения', 'desc': 'Статья представляет Edicho - решение для согласованного редактирования изображений без обучения, основанное на диффузионных моделях. Ключевые компоненты включают модуль манипуляции вниманием и стратегию шумоподавления без классификатора, использующие предварительно оцененное соответствие между изображениями. Этот алгоритм совместим с большинством методов редактирования на основе диффузии, таких как ControlNet и BrushNet. Результаты демонстрируют эффективность Edicho в согласованном редактировании изображений в различных условиях.'}, 'en': {'title': 'Edicho: Consistent Image Editing Made Easy with Diffusion Models', 'desc': 'This paper introduces Edicho, a novel approach for consistent editing of images that addresses challenges like varying object poses and lighting. It utilizes diffusion models without the need for prior training, focusing on explicit image correspondence to guide the editing process. Key innovations include an attention manipulation module and a refined classifier-free guidance denoising strategy, which enhance the editing quality by considering pre-estimated correspondences. The method is designed to be easily integrated with existing diffusion-based editing techniques, showing strong performance across different scenarios.'}, 'zh': {'title': 'Edicho:无训练一致性图像编辑的新方法', 'desc': 'Edicho 是一种基于扩散模型的无训练解决方案,旨在解决在不同环境下进行一致性图像编辑的挑战。它的设计原则是利用显式图像对应关系来指导编辑,确保在不同的拍摄条件下保持一致性。该方法包括一个注意力操作模块和经过精细调整的无分类器引导去噪策略,能够有效处理预估的对应关系。Edicho 具有即插即用的特性,兼容大多数基于扩散的编辑方法,实验结果显示其在多种设置下的有效性。'}}}, {'id': 'https://huggingface.co/papers/2412.21187', 'title': 'Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs', 'url': 'https://huggingface.co/papers/2412.21187', 'abstract': 'The remarkable performance of models like the OpenAI o1 can be attributed to their ability to emulate human-like long-time thinking during inference. These models employ extended chain-of-thought (CoT) processes, exploring multiple strategies to enhance problem-solving capabilities. However, a critical question remains: How to intelligently and efficiently scale computational resources during testing. This paper presents the first comprehensive study on the prevalent issue of overthinking in these models, where excessive computational resources are allocated for simple problems with minimal benefit. 
We introduce novel efficiency metrics from both outcome and process perspectives to evaluate the rational use of computational resources by o1-like models. Using a self-training paradigm, we propose strategies to mitigate overthinking, streamlining reasoning processes without compromising accuracy. Experimental results show that our approach successfully reduces computational overhead while preserving model performance across a range of testsets with varying difficulty levels, such as GSM8K, MATH500, GPQA, and AIME.', 'score': 11, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '78da22eae14fe26c', 'authors': ['Xingyu Chen', 'Jiahao Xu', 'Tian Liang', 'Zhiwei He', 'Jianhui Pang', 'Dian Yu', 'Linfeng Song', 'Qiuzhi Liu', 'Mengfei Zhou', 'Zhuosheng Zhang', 'Rui Wang', 'Zhaopeng Tu', 'Haitao Mi', 'Dong Yu'], 'affiliations': ['Shanghai Jiao Tong University', 'Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2412.21187.jpg', 'data': {'categories': ['#optimization', '#reasoning', '#training', '#math', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Эффективное мышление ИИ: борьба с избыточными вычислениями', 'desc': 'Статья исследует проблему избыточных вычислений (overthinking) в крупных языковых моделях типа OpenAI o1 при решении задач. Авторы вводят новые метрики эффективности для оценки рационального использования вычислительных ресурсов такими моделями. Предлагается стратегия на основе самообучения для оптимизации рассуждений модели без потери точности. Экспериментальные результаты показывают успешное снижение вычислительных затрат при сохранении производительности на различных наборах тестов.'}, 'en': {'title': 'Streamlining Reasoning: Tackling Overthinking in AI Models', 'desc': "This paper investigates the phenomenon of overthinking in advanced machine learning models, particularly those like OpenAI's o1, which excel at long-term reasoning. It highlights the inefficiencies that arise when these models allocate excessive computational resources to solve simple problems, leading to minimal gains in performance. The authors propose new efficiency metrics to assess how well these models utilize their computational power during inference. By implementing a self-training approach, they present strategies to reduce overthinking, achieving a balance between computational efficiency and model accuracy across various challenging test sets."}, 'zh': {'title': '优化计算资源,提升模型效率', 'desc': '本文探讨了像OpenAI o1这样的模型在推理过程中模拟人类长期思考的能力。研究指出,这些模型在解决问题时常常会过度思考,导致在简单问题上分配过多的计算资源。我们提出了新的效率指标,从结果和过程两个角度评估计算资源的合理使用,并提出了自我训练的策略来减少过度思考。实验结果表明,我们的方法在不同难度的测试集上成功降低了计算开销,同时保持了模型的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.20005', 'title': 'OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System', 'url': 'https://huggingface.co/papers/2412.20005', 'abstract': "We introduce OneKE, a dockerized schema-guided knowledge extraction system, which can extract knowledge from the Web and raw PDF Books, and support various domains (science, news, etc.). Specifically, we design OneKE with multiple agents and a configure knowledge base. Different agents perform their respective roles, enabling support for various extraction scenarios. The configure knowledge base facilitates schema configuration, error case debugging and correction, further improving the performance. 
Empirical evaluations on benchmark datasets demonstrate OneKE's efficacy, while case studies further elucidate its adaptability to diverse tasks across multiple domains, highlighting its potential for broad applications. We have open-sourced the Code at https://github.com/zjunlp/OneKE and released a Video at http://oneke.openkg.cn/demo.mp4.", 'score': 10, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': 'da8469c61421cefb', 'authors': ['Yujie Luo', 'Xiangyuan Ru', 'Kangwei Liu', 'Lin Yuan', 'Mengshu Sun', 'Ningyu Zhang', 'Lei Liang', 'Zhiqiang Zhang', 'Jun Zhou', 'Lanning Wei', 'Da Zheng', 'Haofen Wang', 'Huajun Chen'], 'affiliations': ['Ant Group', 'Tongji University', 'ZJU-Ant Group Joint Research Center for Knowledge Graphs', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2412.20005.jpg', 'data': {'categories': ['#dataset', '#agents', '#open_source', '#benchmark', '#multimodal', '#science'], 'emoji': '🧠', 'ru': {'title': 'OneKE: Универсальный инструмент для извлечения знаний из разнородных источников', 'desc': 'OneKE - это докеризованная система извлечения знаний, управляемая схемой. Она способна извлекать информацию из веб-ресурсов и PDF-книг, поддерживая различные домены, такие как наука и новости. Система использует множество агентов и настраиваемую базу знаний для выполнения различных сценариев извлечения. OneKE демонстрирует высокую эффективность на эталонных наборах данных и адаптируемость к разнообразным задачам в различных областях.'}, 'en': {'title': 'OneKE: Versatile Knowledge Extraction for Diverse Domains', 'desc': "OneKE is a knowledge extraction system designed to gather information from the Web and raw PDF books across various domains like science and news. It utilizes multiple agents, each responsible for specific tasks, which enhances its ability to handle different extraction scenarios effectively. The system includes a configurable knowledge base that aids in schema setup, debugging, and error correction, leading to improved performance. Empirical tests on benchmark datasets confirm OneKE's effectiveness, and case studies showcase its versatility in tackling diverse tasks."}, 'zh': {'title': 'OneKE:多领域知识提取的智能系统', 'desc': 'OneKE是一个基于Docker的知识提取系统,能够从网络和原始PDF书籍中提取知识,支持多个领域(如科学、新闻等)。该系统设计了多个智能代理,各自承担不同的角色,以适应各种提取场景。配置知识库的设计使得模式配置、错误调试和修正变得更加高效,从而提升了系统的性能。通过在基准数据集上的实证评估,OneKE展示了其有效性,并通过案例研究进一步说明了其在多个领域的适应性和广泛应用潜力。'}}}, {'id': 'https://huggingface.co/papers/2412.20631', 'title': "Slow Perception: Let's Perceive Geometric Figures Step-by-step", 'url': 'https://huggingface.co/papers/2412.20631', 'abstract': 'Recently, "visual o1" began to enter people\'s vision, with expectations that this slow-thinking design can solve visual reasoning tasks, especially geometric math problems. However, the reality is that current LVLMs (Large Vision Language Models) can hardly even accurately copy a geometric figure, let alone truly understand the complex inherent logic and spatial relationships within geometric shapes. We believe accurate copying (strong perception) is the first step to visual o1. Accordingly, we introduce the concept of "slow perception" (SP), which guides the model to gradually perceive basic point-line combinations, as our humans, reconstruct complex geometric structures progressively. There are two-fold stages in SP: a) perception decomposition. Perception is not instantaneous. 
In this stage, complex geometric figures are broken down into basic simple units to unify geometry representation. b) perception flow, which acknowledges that accurately tracing a line is not an easy task. This stage aims to avoid "long visual jumps" in regressing line segments by using a proposed "perceptual ruler" to trace each line stroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an inference time scaling law -- the slower, the better. Researchers strive to speed up the model\'s perception in the past, but we slow it down again, allowing the model to read the image step-by-step and carefully.', 'score': 9, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'f99c59b7ef92c667', 'authors': ['Haoran Wei', 'Youyang Yin', 'Yumeng Li', 'Jia Wang', 'Liang Zhao', 'Jianjian Sun', 'Zheng Ge', 'Xiangyu Zhang'], 'affiliations': ['Beihang University', 'Stepfun'], 'pdf_title_img': 'assets/pdf/title_img/2412.20631.jpg', 'data': {'categories': ['#cv', '#math', '#reasoning'], 'emoji': '🔍', 'ru': {'title': 'Медленнее значит лучше: новый подход к компьютерному зрению', 'desc': "Статья представляет концепцию 'медленного восприятия' (slow perception) для улучшения способности моделей компьютерного зрения копировать геометрические фигуры. Авторы предлагают двухэтапный подход: декомпозиция восприятия, разбивающая сложные фигуры на простые элементы, и поток восприятия, использующий 'перцептивную линейку' для точного отслеживания линий. Исследователи обнаружили, что более медленное восприятие приводит к лучшим результатам, что противоречит традиционному стремлению ускорить обработку изображений. Эта методика может стать первым шагом к решению задач визуального рассуждения и геометрических задач большими визуально-языковыми моделями."}, 'en': {'title': 'Slow Down to See Better: Enhancing Visual Reasoning with Slow Perception', 'desc': "This paper introduces the concept of 'slow perception' (SP) to enhance the capabilities of Large Vision Language Models (LVLMs) in visual reasoning tasks, particularly in understanding geometric shapes. SP consists of two stages: perception decomposition, where complex figures are simplified into basic components, and perception flow, which emphasizes careful tracing of lines to avoid errors. The authors argue that this method mimics human cognitive processes, allowing for a more accurate understanding of spatial relationships. Interestingly, they find that a slower, more deliberate approach to perception improves the model's performance, challenging the traditional focus on speed in machine learning."}, 'zh': {'title': '慢感知:逐步理解几何结构的关键', 'desc': '最近,"视觉o1"开始引起人们的关注,期望这种慢思维设计能够解决视觉推理任务,尤其是几何数学问题。然而,当前的大型视觉语言模型(LVLMs)在准确复制几何图形方面几乎无能为力,更不用说真正理解几何形状内在的复杂逻辑和空间关系。我们提出了"慢感知"(SP)的概念,指导模型逐步感知基本的点线组合,像人类一样逐步重建复杂的几何结构。SP包括两个阶段:感知分解和感知流,前者将复杂的几何图形分解为基本单元,后者通过使用"感知尺"逐步追踪每条线段,避免"长视觉跳跃"。'}}}, {'id': 'https://huggingface.co/papers/2412.21140', 'title': 'Facilitating large language model Russian adaptation with Learned Embedding Propagation', 'url': 'https://huggingface.co/papers/2412.21140', 'abstract': 'Rapid advancements of large language model (LLM) technologies led to the introduction of powerful open-source instruction-tuned LLMs that have the same text generation quality as the state-of-the-art counterparts such as GPT-4. 
While the emergence of such models accelerates the adoption of LLM technologies in sensitive-information environments, the authors of such models do not disclose the training data necessary for replication of the results, thus making the achievements model-exclusive. Since those open-source models are also multilingual, this in turn reduces the benefits of training language-specific LLMs, as improved inference computation efficiency becomes the only guaranteed advantage of such a costly procedure. More cost-efficient options such as vocabulary extension and subsequent continued pre-training are also inhibited by the lack of access to high-quality instruction-tuning data, since it is the major factor behind the resulting LLM task-solving capabilities. To address the limitations and cut the costs of the language adaptation pipeline, we propose Learned Embedding Propagation (LEP). Unlike existing approaches, our method has lower training data size requirements due to its minimal impact on existing LLM knowledge, which we reinforce using a novel ad-hoc embedding propagation procedure that allows skipping the instruction-tuning step and instead implanting the new language knowledge directly into any existing instruct-tuned variant. We evaluated four Russian vocabulary adaptations for LLaMa-3-8B and Mistral-7B, showing that LEP is competitive with traditional instruction-tuning methods, achieving performance comparable to OpenChat 3.5 and LLaMa-3-8B-Instruct, with further improvements via self-calibration and continued tuning enhancing task-solving capabilities.', 'score': 9, 'issue_id': 1412, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '093f3929e323d180', 'authors': ['Mikhail Tikhomirov', 'Daniil Chernyshev'], 'affiliations': ['Lomonosov Moscow State University, Moscow, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2412.21140.jpg', 'data': {'categories': ['#data', '#training', '#low_resource', '#transfer_learning', '#dataset', '#open_source', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Эффективная адаптация языковых моделей без масштабного переобучения', 'desc': 'Статья представляет новый метод адаптации больших языковых моделей (LLM) к другим языкам, называемый Learned Embedding Propagation (LEP). Этот подход позволяет эффективно внедрять знания нового языка в существующие инструктированные LLM без необходимости повторного обучения на больших объемах данных. Авторы провели эксперименты с адаптацией моделей LLaMa-3-8B и Mistral-7B к русскому языку, показав, что LEP конкурентоспособен с традиционными методами инструктирования. Результаты демонстрируют, что LEP достигает производительности, сравнимой с OpenChat 3.5 и LLaMa-3-8B-Instruct, с возможностью дальнейшего улучшения через самокалибровку и дополнительную настройку.'}, 'en': {'title': 'Efficient Language Adaptation with Learned Embedding Propagation', 'desc': 'This paper introduces Learned Embedding Propagation (LEP), a novel method for adapting large language models (LLMs) to new languages without the need for extensive instruction-tuning data. LEP minimizes the training data requirements by directly embedding new language knowledge into existing instruct-tuned models, thus bypassing traditional instruction-tuning steps. The authors demonstrate that LEP can effectively adapt LLaMa-3-8B and Mistral-7B for Russian vocabulary, achieving performance on par with state-of-the-art models like OpenChat 3.5.
This approach not only reduces costs but also enhances the efficiency of language adaptation in multilingual contexts.'}, 'zh': {'title': '学习嵌入传播:降低语言适应成本的新方法', 'desc': '这篇论文介绍了一种名为学习嵌入传播(LEP)的方法,旨在降低语言适应过程的成本。LEP方法通过最小化对现有大语言模型(LLM)知识的影响,减少了对训练数据的需求。与传统的指令调优方法相比,LEP能够直接将新的语言知识植入到现有的指令调优模型中,从而跳过指令调优步骤。实验结果表明,LEP在俄语词汇适应方面的表现与传统方法相当,且通过自我校准和持续调优进一步提升了任务解决能力。'}}}, {'id': 'https://huggingface.co/papers/2412.21139', 'title': 'Training Software Engineering Agents and Verifiers with SWE-Gym', 'url': 'https://huggingface.co/papers/2412.21139', 'abstract': 'We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents , achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories.', 'score': 9, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '800bb3f4c48e2cf9', 'authors': ['Jiayi Pan', 'Xingyao Wang', 'Graham Neubig', 'Navdeep Jaitly', 'Heng Ji', 'Alane Suhr', 'Yizhe Zhang'], 'affiliations': ['Apple', 'CMU', 'UC Berkeley', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2412.21139.jpg', 'data': {'categories': ['#dataset', '#open_source', '#agents', '#training'], 'emoji': '🤖', 'ru': {'title': 'SWE-Gym: революция в обучении ИИ-агентов для разработки ПО', 'desc': 'SWE-Gym - это новая среда для обучения агентов программной инженерии на реальных задачах. Она содержит 2438 экземпляров задач на Python с исполняемой средой, юнит-тестами и описанием на естественном языке. Авторы использовали SWE-Gym для обучения агентов на основе языковых моделей, достигнув улучшения на 19% в решении задач из наборов SWE-Bench. Комбинация обученных агентов и верификаторов позволила достичь нового рекорда производительности для открытых моделей в программной инженерии.'}, 'en': {'title': 'Revolutionizing Software Engineering with SWE-Gym', 'desc': 'SWE-Gym is a novel environment designed for training software engineering agents using real-world Python tasks. It includes 2,438 task instances, each with a codebase, executable environment, unit tests, and natural language task descriptions. The paper demonstrates that language model-based agents trained in SWE-Gym can significantly improve their performance, achieving up to 19% higher resolve rates on benchmark tests. 
Additionally, the authors explore scaling inference through verifiers, leading to state-of-the-art results for open-weight software engineering agents, and they provide resources for further research.'}, 'zh': {'title': 'SWE-Gym:软件工程代理的新起点', 'desc': '我们提出了SWE-Gym,这是第一个用于训练真实世界软件工程(SWE)代理的环境。SWE-Gym包含2438个真实的Python任务实例,每个实例都有可执行的运行环境、单元测试和用自然语言指定的任务。通过使用SWE-Gym,我们训练的基于语言模型的SWE代理在流行的SWE-Bench验证和Lite测试集上实现了高达19%的绝对解决率提升。我们还通过在SWE-Gym中采样的代理轨迹训练验证器,进行推理时的扩展,结合我们微调的SWE代理,在SWE-Bench验证和Lite上分别达到了32.0%和26.0%的新状态,成为开放权重SWE代理的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2412.21206', 'title': 'PERSE: Personalized 3D Generative Avatars from A Single Portrait', 'url': 'https://huggingface.co/papers/2412.21206', 'abstract': "We present PERSE, a method for building an animatable personalized generative avatar from a reference portrait. Our avatar model enables facial attribute editing in a continuous and disentangled latent space to control each facial attribute, while preserving the individual's identity. To achieve this, our method begins by synthesizing large-scale synthetic 2D video datasets, where each video contains consistent changes in the facial expression and viewpoint, combined with a variation in a specific facial attribute from the original input. We propose a novel pipeline to produce high-quality, photorealistic 2D videos with facial attribute editing. Leveraging this synthetic attribute dataset, we present a personalized avatar creation method based on the 3D Gaussian Splatting, learning a continuous and disentangled latent space for intuitive facial attribute manipulation. To enforce smooth transitions in this latent space, we introduce a latent space regularization technique by using interpolated 2D faces as supervision. Compared to previous approaches, we demonstrate that PERSE generates high-quality avatars with interpolated attributes while preserving identity of reference person.", 'score': 8, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '334a60a17f9a9477', 'authors': ['Hyunsoo Cha', 'Inhee Lee', 'Hanbyul Joo'], 'affiliations': ['Seoul National University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21206.jpg', 'data': {'categories': ['#3d', '#cv', '#dataset', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Персонализированные аватары с гибким редактированием черт лица', 'desc': 'PERSE - это метод создания анимируемого персонализированного генеративного аватара на основе портрета. Он позволяет редактировать лицевые атрибуты в непрерывном и разделенном латентном пространстве, сохраняя при этом индивидуальность человека. Метод использует синтетические наборы 2D-видео для обучения модели на основе 3D Gaussian Splatting. PERSE демонстрирует высокое качество генерации аватаров с интерполированными атрибутами, сохраняя идентичность исходного человека.'}, 'en': {'title': 'Create Your Unique Avatar with PERSE!', 'desc': "PERSE is a novel method for creating personalized generative avatars from a single reference portrait. It allows users to edit facial attributes in a smooth and controlled manner within a continuous latent space, ensuring that the individual's identity remains intact. The approach involves generating large-scale synthetic 2D video datasets that showcase variations in facial expressions and attributes, which are then used to train the avatar model. 
By employing 3D Gaussian Splatting and a latent space regularization technique, PERSE achieves high-quality, photorealistic avatars with seamless attribute transitions."}, 'zh': {'title': '个性化生成头像的新方法', 'desc': '本文介绍了一种名为PERSE的方法,用于从参考肖像构建可动画的个性化生成头像。该头像模型能够在连续且解耦的潜在空间中编辑面部属性,同时保持个体的身份。我们的方法首先合成大规模的合成2D视频数据集,每个视频包含面部表情和视角的一致变化,并结合原始输入中特定面部属性的变化。通过引入潜在空间正则化技术,我们实现了高质量、逼真的2D视频生成,并在此基础上提出了一种个性化头像创建方法。'}}}, {'id': 'https://huggingface.co/papers/2412.21199', 'title': 'HumanEval Pro and MBPP Pro: Evaluating Large Language Models on Self-invoking Code Generation', 'url': 'https://huggingface.co/papers/2412.21199', 'abstract': "We introduce self-invoking code generation, a new task designed to evaluate the progressive reasoning and problem-solving capabilities of LLMs. In this task, models are presented with a base problem and a related, more complex problem. They must solve the base problem and then utilize its solution to address the more complex one. This work features three key contributions. First, we propose a general recipe for generating more challenging versions of existing benchmarks, resulting in three new benchmarks: HumanEval Pro, MBPP Pro, and BigCodeBench-Lite Pro, specifically designed to assess LLMs on self-invoking code generation. Second, from the analysis of experimental results over twenty LLMs on our benchmarks, we have two important observations: (i) Most LLMs excel in traditional code generation benchmarks like HumanEval and MBPP, but their performance declines on self-invoking tasks. For example, o1-mini achieves 96.2% pass@1 on HumanEval but only 76.2% on HumanEval Pro. (ii) On self-invoking code generation task, the instruction-tuned models demonstrate only marginal improvements compared to the base models. Third, we disclose the types of failure modes that exist in our evaluation results. All these results underscore the need for further advancements in self-invoking code generation tasks and provide a new direction for future research on enhancing LLMs' code reasoning capabilities.", 'score': 6, 'issue_id': 1408, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '9d2cebc8f30f722c', 'authors': ['Zhaojian Yu', 'Yilun Zhao', 'Arman Cohan', 'Xiao-Ping Zhang'], 'affiliations': ['Tsinghua University', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21199.jpg', 'data': {'categories': ['#dataset', '#reasoning', '#training', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Самовызывающийся код: новый рубеж для языковых моделей', 'desc': 'Статья представляет новую задачу для оценки возможностей языковых моделей (LLM) - генерацию самовызывающегося кода. В рамках этой задачи модели должны решить базовую проблему, а затем использовать ее решение для более сложной задачи. Авторы создали три новых бенчмарка: HumanEval Pro, MBPP Pro и BigCodeBench-Lite Pro. Эксперименты показали, что большинство LLM хорошо справляются с традиционными задачами генерации кода, но их производительность снижается на самовызывающихся задачах. Результаты подчеркивают необходимость дальнейших исследований в области улучшения способностей LLM к рассуждению при работе с кодом.'}, 'en': {'title': 'Enhancing LLMs: The Challenge of Self-Invoking Code Generation', 'desc': 'This paper introduces a new task called self-invoking code generation, which tests the reasoning and problem-solving skills of large language models (LLMs). In this task, models first solve a simple problem and then use that solution to tackle a more complex one. 
The authors create three new benchmarks to evaluate LLMs on this task, revealing that while many models perform well on standard code generation tasks, their performance drops significantly on self-invoking tasks. The findings highlight the limitations of current models and suggest that more research is needed to improve their code reasoning abilities.'}, 'zh': {'title': '自调用代码生成:提升LLMs推理能力的新方向', 'desc': '本文介绍了一种新的任务——自调用代码生成,旨在评估大型语言模型(LLMs)的推理和问题解决能力。在这个任务中,模型需要先解决一个基础问题,然后利用其解决方案来处理一个更复杂的问题。研究提出了三项重要贡献,包括生成更具挑战性的基准测试的通用方法,并创建了三个新基准:HumanEval Pro、MBPP Pro和BigCodeBench-Lite Pro。实验结果显示,大多数LLMs在传统代码生成基准上表现良好,但在自调用任务上的表现却有所下降,表明在自调用代码生成任务上仍需进一步的研究和改进。'}}}, {'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 50, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. 
Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 34, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. 
Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. 
When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 25, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. 
Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 23, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. 
Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 18, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. 
Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 17, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. 
Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. 
Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 15, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. 
Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. 
Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 12, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. 
To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 10, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. 
Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 8, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. 
To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 8, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.15368', 'title': 'Baichuan-Omni-1.5 Technical Report', 'url': 'https://huggingface.co/papers/2501.15368', 'abstract': 'We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. 
First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks.', 'score': 35, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'f40b7f7b108c1500', 'authors': ['Yadong Li', 'Jun Liu', 'Tao Zhang', 'Tao Zhang', 'Song Chen', 'Tianpeng Li', 'Zehuan Li', 'Lijun Liu', 'Lingfeng Ming', 'Guosheng Dong', 'Da Pan', 'Chong Li', 'Yuanbo Fang', 'Dongdong Kuang', 'Mingrui Wang', 'Chenglin Zhu', 'Youwei Zhang', 'Hongyu Guo', 'Fengyu Zhang', 'Yuran Wang', 'Bowen Ding', 'Wei Song', 'Xu Li', 'Yuqi Huo', 'Zheng Liang', 'Shusen Zhang', 'Xin Wu', 'Shuai Zhao', 'Linchu Xiong', 'Yozhen Wu', 'Jiahui Ye', 'Wenhao Lu', 'Bowen Li', 'Yan Zhang', 'Yaqi Zhou', 'Xin Chen', 'Lei Su', 'Hongda Zhang', 'Fuzhong Chen', 'Xuezhen Dong', 'Na Nie', 'Zhiying Wu', 'Bin Xiao', 'Ting Li', 'Shunya Dang', 'Ping Zhang', 'Yijia Sun', 'Jincheng Wu', 'Jinjie Yang', 'Xionghai Lin', 'Zhi Ma', 'Kegeng Wu', 'Jia li', 'Aiyuan Yang', 'Hui Liu', 'Jianqiang Zhang', 'Xiaoxi Chen', 'Guangwei Ai', 'Wentao Zhang', 'Yicong Chen', 'Xiaoqin Huang', 'Kun Li', 'Wenjing Luo', 'Yifei Duan', 'Lingling Zhu', 'Ran Xiao', 'Zhe Su', 'Jiani Pu', 'Dian Wang', 'Xu Jia', 'Tianyu Zhang', 'Mengyu Ai', 'Mang Wang', 'Yujing Qiao', 'Lei Zhang', 'Yanjun Shen', 'Fan Yang', 'Miao Zhen', 'Yijie Zhou', 'Mingyang Chen', 'Fei Li', 'Chenzheng Zhu', 'Keer Lu', 'Yaqi Zhao', 'Hao Liang', 'Youquan Li', 'Yanzhao Qin', 'Linzhuang Sun', 'Jianhua Xu', 'Haoze Sun', 'Mingan Lin', 'Zenan Zhou', 'Weipeng Chen'], 'affiliations': ['Baichuan Inc.'], 'pdf_title_img': 'assets/pdf/title_img/2501.15368.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#training', '#audio', '#multimodal'], 'emoji': '🎭', 'ru': {'title': 'Baichuan-Omni-1.5: Прорыв в омнимодальном ИИ', 'desc': 'Baichuan-Omni-1.5 - это омнимодальная модель, обладающая способностями понимания и генерации аудио. Для достижения качественного взаимодействия между модальностями, авторы оптимизировали три ключевых аспекта: создали комплексный пайплайн для обработки мультимодальных данных, разработали аудио-токенизатор для захвата семантической и акустической информации, и применили многоэтапную стратегию обучения. Модель демонстрирует ведущие результаты в омнимодальных возможностях и сравнима с передовыми моделями в различных мультимодальных медицинских бенчмарках.'}, 'en': {'title': 'Revolutionizing Multimodal Interaction with Baichuan-Omni-1.5', 'desc': 'Baichuan-Omni-1.5 is a cutting-edge omni-modal model designed for seamless interaction across text, audio, and visual data. It utilizes a robust data cleaning and synthesis pipeline to process approximately 500 billion high-quality multimodal data points. 
The model features a specialized audio-tokenizer that captures both semantic and acoustic elements, enhancing its compatibility with multimodal large language models (MLLMs). Through a multi-stage training approach, it effectively aligns and fine-tunes across modalities, outperforming existing models in various multimodal tasks, particularly in medical benchmarks.'}, 'zh': {'title': '全模态交互的新纪元', 'desc': '我们介绍了Baichuan-Omni-1.5,这是一种全模态模型,具备全模态理解和端到端音频生成能力。为了实现不同模态之间流畅且高质量的交互,我们优化了三个关键方面。首先,我们建立了一个全面的数据清洗和合成管道,获得了约5000亿条高质量的多模态数据(文本、音频和视觉)。其次,我们设计了一个音频标记器(Baichuan-Audio-Tokenizer),能够捕捉音频的语义和声学信息,从而增强与多模态大语言模型的兼容性。'}}}, {'id': 'https://huggingface.co/papers/2501.15383', 'title': 'Qwen2.5-1M Technical Report', 'url': 'https://huggingface.co/papers/2501.15383', 'abstract': 'We introduce Qwen2.5-1M, a series of models that extend the context length to 1 million tokens. Compared to the previous 128K version, the Qwen2.5-1M series have significantly enhanced long-context capabilities through long-context pre-training and post-training. Key techniques such as long data synthesis, progressive pre-training, and multi-stage supervised fine-tuning are employed to effectively enhance long-context performance while reducing training costs. To promote the use of long-context models among a broader user base, we present and open-source our inference framework. This framework includes a length extrapolation method that can expand the model context lengths by at least four times, or even more, without additional training. To reduce inference costs, we implement a sparse attention method along with chunked prefill optimization for deployment scenarios and a sparsity refinement method to improve precision. Additionally, we detail our optimizations in the inference engine, including kernel optimization, pipeline parallelism, and scheduling optimization, which significantly enhance overall inference performance. By leveraging our inference framework, the Qwen2.5-1M models achieve a remarkable 3x to 7x prefill speedup in scenarios with 1 million tokens of context. This framework provides an efficient and powerful solution for developing applications that require long-context processing using open-source models. The Qwen2.5-1M series currently includes the open-source models Qwen2.5-7B-Instruct-1M and Qwen2.5-14B-Instruct-1M, as well as the API-accessed model Qwen2.5-Turbo. Evaluations show that Qwen2.5-1M models have been greatly improved in long-context tasks without compromising performance in short-context scenarios.
Specifically, the Qwen2.5-14B-Instruct-1M model significantly outperforms GPT-4o-mini in long-context tasks and supports contexts eight times longer.', 'score': 22, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '203817e55fc3eb45', 'authors': ['An Yang', 'Bowen Yu', 'Chengyuan Li', 'Dayiheng Liu', 'Fei Huang', 'Haoyan Huang', 'Jiandong Jiang', 'Jianhong Tu', 'Jianwei Zhang', 'Jingren Zhou', 'Junyang Lin', 'Kai Dang', 'Kexin Yang', 'Le Yu', 'Mei Li', 'Minmin Sun', 'Qin Zhu', 'Rui Men', 'Tao He', 'Weijia Xu', 'Wenbiao Yin', 'Wenyuan Yu', 'Xiafei Qiu', 'Xingzhang Ren', 'Xinlong Yang', 'Yong Li', 'Zhiying Xu', 'Zipeng Zhang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.15383.jpg', 'data': {'categories': ['#architecture', '#inference', '#long_context', '#training', '#open_source'], 'emoji': '🚀', 'ru': {'title': 'Миллион токенов: новый рубеж для языковых моделей', 'desc': 'Статья представляет серию моделей Qwen2.5-1M с контекстным окном в 1 миллион токенов. Авторы применили техники синтеза длинных данных, прогрессивного предобучения и многоэтапной супервизированной донастройки для улучшения работы с длинным контекстом. Разработан фреймворк для инференса, включающий метод экстраполяции длины и оптимизации для ускорения обработки. Модели Qwen2.5-1M демонстрируют значительное улучшение на задачах с длинным контекстом без ухудшения производительности на коротких текстах.'}, 'en': {'title': 'Unlocking the Power of 1 Million Tokens with Qwen2.5-1M', 'desc': 'The Qwen2.5-1M models introduce a significant advancement in handling long-context inputs, extending the context length to 1 million tokens. This is achieved through innovative techniques like long data synthesis and multi-stage supervised fine-tuning, which enhance performance while minimizing training costs. The open-source inference framework allows users to expand context lengths without additional training and includes optimizations for efficient deployment. Overall, these models demonstrate superior performance in long-context tasks compared to existing models, making them a valuable resource for applications requiring extensive context processing.'}, 'zh': {'title': 'Qwen2.5-1M:长上下文处理的新突破', 'desc': '我们介绍了Qwen2.5-1M系列模型,能够处理长达100万标记的上下文。与之前的128K版本相比,Qwen2.5-1M在长上下文能力上有显著提升,采用了长数据合成、渐进式预训练和多阶段监督微调等关键技术。为了降低推理成本,我们实现了稀疏注意力机制和分块预填充优化,同时优化了推理引擎的性能。Qwen2.5-1M模型在处理长上下文任务时表现优异,且在短上下文场景中性能没有下降。'}}}, {'id': 'https://huggingface.co/papers/2501.16142', 'title': 'Towards General-Purpose Model-Free Reinforcement Learning', 'url': 'https://huggingface.co/papers/2501.16142', 'abstract': 'Reinforcement learning (RL) promises a framework for near-universal problem-solving. In practice however, RL algorithms are often tailored to specific benchmarks, relying on carefully tuned hyperparameters and algorithmic choices. Recently, powerful model-based RL methods have shown impressive general results across benchmarks but come at the cost of increased complexity and slow run times, limiting their broader applicability. In this paper, we attempt to find a unifying model-free deep RL algorithm that can address a diverse class of domains and problem settings. To achieve this, we leverage model-based representations that approximately linearize the value function, taking advantage of the denser task objectives used by model-based RL while avoiding the costs associated with planning or simulated trajectories. 
We evaluate our algorithm, MR.Q, on a variety of common RL benchmarks with a single set of hyperparameters and show a competitive performance against domain-specific and general baselines, providing a concrete step towards building general-purpose model-free deep RL algorithms.', 'score': 13, 'issue_id': 1898, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '0cf7cd0c9c1f5964', 'authors': ['Scott Fujimoto', "Pierluca D'Oro", 'Amy Zhang', 'Yuandong Tian', 'Michael Rabbat'], 'affiliations': ['Meta FAIR'], 'pdf_title_img': 'assets/pdf/title_img/2501.16142.jpg', 'data': {'categories': ['#optimization', '#rl', '#benchmark', '#training', '#games'], 'emoji': '🤖', 'ru': {'title': 'MR.Q: На пути к универсальному обучению с подкреплением', 'desc': 'Статья представляет новый алгоритм обучения с подкреплением под названием MR.Q. Этот алгоритм объединяет преимущества модельного и безмодельного подходов, используя представления, линеаризующие функцию ценности. MR.Q показывает конкурентоспособные результаты на различных бенчмарках с единым набором гиперпараметров. Исследование направлено на создание универсального безмодельного алгоритма глубокого обучения с подкреплением.'}, 'en': {'title': 'Towards Universal Problem-Solving with MR.Q in Reinforcement Learning', 'desc': 'This paper presents a new model-free deep reinforcement learning algorithm called MR.Q, which aims to solve a wide range of problems without needing extensive tuning of hyperparameters. The authors utilize model-based representations to simplify the value function, allowing the algorithm to benefit from the advantages of model-based RL while avoiding the complexities of planning. MR.Q is evaluated across various standard RL benchmarks using a single set of hyperparameters, demonstrating competitive performance against both specialized and general algorithms. This work represents a significant advancement towards creating versatile and efficient model-free deep RL solutions.'}, 'zh': {'title': '构建通用的无模型深度强化学习算法', 'desc': '强化学习(RL)提供了一种通用问题解决框架,但在实际应用中,RL算法通常针对特定基准进行调整,依赖于精心调节的超参数和算法选择。最近,强大的基于模型的RL方法在多个基准上表现出色,但其复杂性和较慢的运行时间限制了其更广泛的应用。本文提出了一种统一的无模型深度RL算法MR.Q,旨在解决多样化的领域和问题设置。我们利用基于模型的表示方法,近似线性化价值函数,从而在避免规划或模拟轨迹相关成本的同时,利用基于模型的RL所使用的更密集的任务目标。'}}}, {'id': 'https://huggingface.co/papers/2501.15570', 'title': 'ARWKV: Pretrain is not what we need, an RNN-Attention-Based Language Model Born from Transformer', 'url': 'https://huggingface.co/papers/2501.15570', 'abstract': "As is known, hybrid quadratic and subquadratic attention models in multi-head architectures have surpassed both Transformer and Linear RNN models, with these works primarily focusing on reducing KV complexity and improving efficiency. For further research on expressiveness, we introduce our series of models distilled from Qwen 2.5, based on pure native RWKV-7 attention, which aims to make RNN more expressive and demonstrates state tracking ability beyond transformers. We work with QRWK 32B based on RWKV-6 architecture, another approach that reduces the entire knowledge processing time to just 8 hours using 16 AMD MI300X GPUs while maintaining Qwen 2.5's performance. In fact, the distillation process can utilize any LLM, not just Qwen, and enables knowledge transfer from larger LLMs to smaller ones with fewer tokens. We will explain the detailed process and share our insights on building more powerful foundation models. Please note that this is an ongoing work that will be updated continuously.
The model checkpoints and source code are available at https://github.com/yynil/RWKVInside and https://huggingface.co/RWKV-Red-Team/ARWKV-7B-Preview-0.1.", 'score': 11, 'issue_id': 1900, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '063647dfe2bd7b63', 'authors': ['Lin Yueyu', 'Li Zhiyuan', 'Peter Yue', 'Liu Xiao'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.15570.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#architecture', '#small_models', '#optimization', '#open_source'], 'emoji': '🧠', 'ru': {'title': 'Повышение эффективности и выразительности RNN через дистилляцию знаний', 'desc': 'Статья представляет новые модели, основанные на чистом нативном внимании RWKV-7, дистиллированные из Qwen 2.5. Цель исследования - повысить выразительность RNN и продемонстрировать способность отслеживания состояния, превосходящую трансформеры. Авторы работают с QRWK 32B на архитектуре RWKV-6, что позволяет сократить время обработки знаний до 8 часов на 16 GPU AMD MI300X. Процесс дистилляции может использовать любую большую языковую модель для передачи знаний меньшим моделям с меньшим количеством токенов.'}, 'en': {'title': 'Enhancing RNN Expressiveness with RWKV Attention', 'desc': 'This paper presents a new series of models derived from Qwen 2.5, focusing on enhancing the expressiveness of RNNs through a native RWKV-7 attention mechanism. The authors demonstrate that their hybrid quadratic and subquadratic attention models outperform traditional Transformer and Linear RNN architectures by significantly reducing key-value (KV) complexity. They introduce the QRWK 32B model, which achieves impressive efficiency by processing knowledge in just 8 hours using 16 AMD MI300X GPUs while retaining the performance of Qwen 2.5. Additionally, the distillation process allows for knowledge transfer from larger language models (LLMs) to smaller ones, making it a versatile approach for building more powerful foundation models.'}, 'zh': {'title': '提升RNN表达能力的新模型', 'desc': '本文介绍了一种新型的混合二次和亚二次注意力模型,旨在提高RNN的表达能力。我们基于RWKV-7注意力架构,提出了一系列从Qwen 2.5中提炼的模型,展示了超越Transformer的状态跟踪能力。通过使用16个AMD MI300X GPU,我们的QRWK 32B模型将知识处理时间缩短至仅8小时,同时保持了Qwen 2.5的性能。该提炼过程可以利用任何大型语言模型(LLM),实现从更大模型到更小模型的知识转移。'}}}, {'id': 'https://huggingface.co/papers/2501.15907', 'title': 'Emilia: A Large-Scale, Extensive, Multilingual, and Diverse Dataset for Speech Generation', 'url': 'https://huggingface.co/papers/2501.15907', 'abstract': 'Recent advancements in speech generation have been driven by the large-scale training datasets. However, current models fall short of capturing the spontaneity and variability inherent in real-world human speech, due to their reliance on audiobook datasets limited to formal read-aloud speech styles. To bridge this gap, we introduce Emilia-Pipe, an open-source preprocessing pipeline to extract high-quality training data from valuable yet underexplored in-the-wild data that capture spontaneous human speech in real-world contexts. By leveraging Emilia-Pipe, we construct Emilia, the first multilingual speech generation dataset derived from in-the-wild speech data. This dataset comprises over 101k hours of speech across six languages: English, Chinese, German, French, Japanese, and Korean. Besides, we expand Emilia to Emilia-Large, a dataset exceeding 216k hours, making it the largest open-source speech generation dataset available.
Extensive experiments demonstrate that Emilia significantly outperforms traditional audiobook datasets in generating spontaneous and human-like speech, showcasing superior performance in capturing diverse speaker timbre and speaking styles of real-world human speech. Furthermore, this work underscores the importance of scaling dataset size to advance speech generation research and validates the effectiveness of Emilia for both multilingual and crosslingual speech generation.', 'score': 10, 'issue_id': 1903, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': 'bd221795c86585eb', 'authors': ['Haorui He', 'Zengqiang Shang', 'Chaoren Wang', 'Xuyuan Li', 'Yicheng Gu', 'Hua Hua', 'Liwei Liu', 'Chen Yang', 'Jiaqi Li', 'Peiyang Shi', 'Yuancheng Wang', 'Kai Chen', 'Pengyuan Zhang', 'Zhizheng Wu'], 'affiliations': ['Chinese University of Hong Kong, Shenzhen, China', 'Laboratory of Speech and Intelligent Information Processing, Institute of Acoustics, CAS, Beijing, China', 'Shanghai AI Laboratory, Shanghai, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.15907.jpg', 'data': {'categories': ['#data', '#audio', '#multilingual', '#dataset', '#open_source', '#low_resource'], 'emoji': '🗣️', 'ru': {'title': 'Emilia: новый этап в генерации естественной речи', 'desc': 'Исследователи представили Emilia-Pipe - открытый конвейер предобработки для извлечения высококачественных данных из спонтанной речи в реальных условиях. На его основе создан многоязычный датасет Emilia, содержащий более 101 тысячи часов речи на 6 языках. Расширенная версия Emilia-Large включает более 216 тысяч часов и является крупнейшим открытым датасетом для генерации речи. Эксперименты показали превосходство Emilia над традиционными аудиокнижными датасетами в генерации естественной и спонтанной речи.'}, 'en': {'title': 'Unlocking Spontaneous Speech with Emilia-Pipe', 'desc': 'This paper presents Emilia-Pipe, a preprocessing tool designed to extract high-quality training data from spontaneous human speech in real-world settings. The authors introduce Emilia, a multilingual speech generation dataset that includes over 101k hours of diverse speech data across six languages. They further expand this dataset to Emilia-Large, which contains more than 216k hours, making it the largest open-source resource for speech generation. The results show that models trained on Emilia outperform those trained on traditional audiobook datasets, effectively capturing the variability and naturalness of human speech.'}, 'zh': {'title': '打破传统,捕捉真实语音的多样性', 'desc': '近年来,语音生成的进展主要依赖于大规模的训练数据集。然而,目前的模型在捕捉真实人类语音的自发性和多样性方面存在不足,因为它们依赖于仅限于正式朗读风格的有声书数据集。为了解决这个问题,我们提出了Emilia-Pipe,这是一个开源的预处理管道,用于从有价值但未被充分探索的真实环境数据中提取高质量的训练数据。通过利用Emilia-Pipe,我们构建了Emilia,这是第一个基于真实环境语音数据的多语言语音生成数据集,包含超过101k小时的语音,涵盖六种语言。'}}}, {'id': 'https://huggingface.co/papers/2501.15369', 'title': 'iFormer: Integrating ConvNet and Transformer for Mobile Application', 'url': 'https://huggingface.co/papers/2501.15369', 'abstract': 'We present a new family of mobile hybrid vision networks, called iFormer, with a focus on optimizing latency and accuracy on mobile applications. iFormer effectively integrates the fast local representation capacity of convolution with the efficient global modeling ability of self-attention. The local interactions are derived from transforming a standard convolutional network, i.e., ConvNeXt, to design a more lightweight mobile network. 
Our newly introduced mobile modulation attention removes memory-intensive operations in MHA and employs an efficient modulation mechanism to boost dynamic global representational capacity. We conduct comprehensive experiments demonstrating that iFormer outperforms existing lightweight networks across various tasks. Notably, iFormer achieves an impressive Top-1 accuracy of 80.4\\% on ImageNet-1k with a latency of only 1.10 ms on an iPhone 13, surpassing the recently proposed MobileNetV4 under similar latency constraints. Additionally, our method shows significant improvements in downstream tasks, including COCO object detection, instance segmentation, and ADE20k semantic segmentation, while still maintaining low latency on mobile devices for high-resolution inputs in these scenarios.', 'score': 8, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '50e030854cdc071f', 'authors': ['Chuanyang Zheng'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.15369.jpg', 'data': {'categories': ['#optimization', '#training', '#cv', '#architecture'], 'emoji': '📱', 'ru': {'title': 'iFormer: Эффективные нейросети для мобильного компьютерного зрения', 'desc': 'iFormer - это новое семейство мобильных гибридных сетей компьютерного зрения, оптимизированных для мобильных приложений. Оно сочетает быструю локальную репрезентативную способность свёрточных сетей с эффективным глобальным моделированием механизма внимания. iFormer использует облегченную версию ConvNeXt и новый механизм модуляционного внимания для мобильных устройств. Эксперименты показывают, что iFormer превосходит существующие легковесные сети по точности и скорости работы на различных задачах, включая классификацию изображений, обнаружение объектов и сегментацию.'}, 'en': {'title': 'iFormer: Optimizing Mobile Vision with Speed and Accuracy', 'desc': 'The paper introduces iFormer, a new type of mobile hybrid vision network designed to enhance both speed and accuracy for mobile applications. It combines the quick local processing of convolutional networks with the effective global understanding of self-attention mechanisms. By modifying a standard convolutional architecture, ConvNeXt, iFormer creates a lightweight model that reduces memory usage while improving performance. Experimental results show that iFormer achieves high accuracy on ImageNet-1k and excels in various downstream tasks, all while maintaining low latency on mobile devices.'}, 'zh': {'title': 'iFormer:移动应用中的高效视觉网络', 'desc': '我们提出了一种新的移动混合视觉网络家族,称为iFormer,旨在优化移动应用的延迟和准确性。iFormer有效地结合了卷积的快速局部表示能力和自注意力的高效全局建模能力。通过将标准卷积网络ConvNeXt转化为更轻量级的移动网络,iFormer实现了局部交互的优化。我们的移动调制注意力机制去除了多头自注意力中的内存密集型操作,并采用高效的调制机制来增强动态全局表示能力。'}}}, {'id': 'https://huggingface.co/papers/2501.14723', 'title': 'CodeMonkeys: Scaling Test-Time Compute for Software Engineering', 'url': 'https://huggingface.co/papers/2501.14723', 'abstract': 'Scaling test-time compute is a promising axis for improving LLM capabilities. However, test-time compute can be scaled in a variety of ways, and effectively combining different approaches remains an active area of research. Here, we explore this problem in the context of solving real-world GitHub issues from the SWE-bench dataset. Our system, named CodeMonkeys, allows models to iteratively edit a codebase by jointly generating and running a testing script alongside their draft edit. We sample many of these multi-turn trajectories for every issue to generate a collection of candidate edits. 
This approach lets us scale "serial" test-time compute by increasing the number of iterations per trajectory and "parallel" test-time compute by increasing the number of trajectories per problem. With parallel scaling, we can amortize up-front costs across multiple downstream samples, allowing us to identify relevant codebase context using the simple method of letting an LLM read every file. In order to select between candidate edits, we combine voting using model-generated tests with a final multi-turn trajectory dedicated to selection. Overall, CodeMonkeys resolves 57.4% of issues from SWE-bench Verified using a budget of approximately 2300 USD. Our selection method can also be used to combine candidates from different sources. Selecting over an ensemble of edits from existing top SWE-bench Verified submissions obtains a score of 66.2% and outperforms the best member of the ensemble on its own. We fully release our code and data at https://scalingintelligence.stanford.edu/pubs/codemonkeys.', 'score': 4, 'issue_id': 1912, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '0aee5401febd2bf6', 'authors': ['Ryan Ehrlich', 'Bradley Brown', 'Jordan Juravsky', 'Ronald Clark', 'Christopher Ré', 'Azalia Mirhoseini'], 'affiliations': ['Department of Computer Science, Stanford University', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2501.14723.jpg', 'data': {'categories': ['#data', '#optimization', '#training', '#dataset', '#plp', '#open_source'], 'emoji': '🐒', 'ru': {'title': 'CodeMonkeys: Масштабирование вычислений LLM для решения реальных задач программирования', 'desc': 'Статья представляет систему CodeMonkeys для решения реальных проблем GitHub с помощью больших языковых моделей (LLM). Система позволяет моделям итеративно редактировать кодовую базу, генерируя и запуская тестовые скрипты вместе с черновыми правками. CodeMonkeys использует как последовательное, так и параллельное масштабирование вычислений во время тестирования, что позволяет эффективно идентифицировать релевантный контекст кодовой базы. Метод выбора кандидатов на основе голосования и финальной многоходовой траектории позволил системе решить 57.4% проблем из набора данных SWE-bench Verified.'}, 'en': {'title': 'Enhancing LLMs with Scalable Test-Time Compute for Code Editing', 'desc': 'This paper presents CodeMonkeys, a system designed to enhance the capabilities of large language models (LLMs) by scaling test-time compute during code editing tasks. It combines iterative code generation with testing script execution, allowing models to refine their edits through multiple iterations and trajectories. By leveraging both serial and parallel scaling, CodeMonkeys efficiently identifies relevant code context and selects the best candidate edits through a voting mechanism. 
The system demonstrates effectiveness by resolving over 57% of real-world GitHub issues while optimizing resource usage, and it shows improved performance when combining edits from various sources.'}, 'zh': {'title': '通过CodeMonkeys提升代码编辑能力', 'desc': '本文探讨了如何通过扩展测试时计算来提升大型语言模型(LLM)的能力。我们提出了一个名为CodeMonkeys的系统,它可以通过生成和运行测试脚本来迭代编辑代码库,从而解决实际的GitHub问题。该方法通过增加每个问题的迭代次数和轨迹数量,实现了串行和并行的测试时计算扩展。最终,CodeMonkeys成功解决了57.4%的问题,并且我们的选择方法也能有效结合来自不同来源的候选编辑。'}}}, {'id': 'https://huggingface.co/papers/2403.09193', 'title': 'Are Vision Language Models Texture or Shape Biased and Can We Steer Them?', 'url': 'https://huggingface.co/papers/2403.09193', 'abstract': 'Vision language models (VLMs) have drastically changed the computer vision model landscape in only a few years, opening an exciting array of new applications from zero-shot image classification, over to image captioning, and visual question answering. Unlike pure vision models, they offer an intuitive way to access visual content through language prompting. The wide applicability of such models encourages us to ask whether they also align with human vision - specifically, how far they adopt human-induced visual biases through multimodal fusion, or whether they simply inherit biases from pure vision models. One important visual bias is the texture vs. shape bias, or the dominance of local over global information. In this paper, we study this bias in a wide range of popular VLMs. Interestingly, we find that VLMs are often more shape-biased than their vision encoders, indicating that visual biases are modulated to some extent through text in multimodal models. If text does indeed influence visual biases, this suggests that we may be able to steer visual biases not just through visual input but also through language: a hypothesis that we confirm through extensive experiments. For instance, we are able to steer shape bias from as low as 49% to as high as 72% through prompting alone. For now, the strong human bias towards shape (96%) remains out of reach for all tested VLMs.', 'score': 4, 'issue_id': 1911, 'pub_date': '2025-03-14', 'pub_date_card': {'ru': '14 марта', 'en': 'March 14', 'zh': '3月14日'}, 'hash': 'e5fc94d983fca41c', 'authors': ['Paul Gavrikov', 'Jovita Lukasik', 'Steffen Jung', 'Robert Geirhos', 'Bianca Lamm', 'Muhammad Jehanzeb Mirza', 'Margret Keuper', 'Janis Keuper'], 'affiliations': ['Google DeepMind', 'ICG, Graz University of Technology', 'IMLA, Offenburg University', 'Max Planck Institute for Informatics, Saarland Informatics Campus', 'University of Mannheim', 'University of Siegen'], 'pdf_title_img': 'assets/pdf/title_img/2403.09193.jpg', 'data': {'categories': ['#cv', '#ethics', '#alignment', '#multimodal'], 'emoji': '👁️', 'ru': {'title': 'Текст направляет взгляд: как языковые подсказки влияют на визуальные предубеждения ИИ', 'desc': 'Статья исследует визуальные предубеждения в мультимодальных моделях, объединяющих зрение и язык (VLM). Авторы обнаружили, что VLM часто более ориентированы на форму объектов, чем чисто визуальные модели. Эксперименты показали, что текстовые подсказки могут значительно влиять на баланс между ориентацией на форму и текстуру в VLM. Однако даже после оптимизации, VLM все еще уступают человеческому зрению в ориентации на форму объектов.'}, 'en': {'title': 'Steering Visual Biases with Language in Vision Language Models', 'desc': 'This paper investigates how vision language models (VLMs) incorporate human visual biases, particularly the texture vs. 
shape bias, which refers to the preference for local versus global information in images. The authors find that VLMs tend to be more shape-biased than traditional vision models, suggesting that language prompts can influence visual processing. Through experiments, they demonstrate that the shape bias can be adjusted significantly by changing the text prompts used with the models. However, despite these adjustments, the VLMs still do not fully match the strong human bias towards shape recognition.'}, 'zh': {'title': '通过语言引导视觉偏差的可能性', 'desc': '视觉语言模型(VLMs)在计算机视觉领域带来了显著变化,支持从零样本图像分类到图像描述和视觉问答等多种应用。这些模型通过语言提示提供了一种直观的方式来访问视觉内容。我们研究了VLMs中存在的视觉偏差,特别是纹理与形状偏差,发现VLMs在形状偏差上往往比纯视觉模型更强。这表明,通过文本的多模态融合,视觉偏差可以在一定程度上被调节,且我们可以通过语言来引导视觉偏差。'}}}, {'id': 'https://huggingface.co/papers/2501.16295', 'title': 'Mixture-of-Mamba: Enhancing Multi-Modal State-Space Models with Modality-Aware Sparsity', 'url': 'https://huggingface.co/papers/2501.16295', 'abstract': 'State Space Models (SSMs) have emerged as efficient alternatives to Transformers for sequential modeling, but their inability to leverage modality-specific features limits their performance in multi-modal pretraining. Here, we propose Mixture-of-Mamba, a novel SSM architecture that introduces modality-aware sparsity through modality-specific parameterization of the Mamba block. Building on Mixture-of-Transformers (W. Liang et al. arXiv:2411.04996; 2024), we extend the benefits of modality-aware sparsity to SSMs while preserving their computational efficiency. We evaluate Mixture-of-Mamba across three multi-modal pretraining settings: Transfusion (interleaved text and continuous image tokens with diffusion loss), Chameleon (interleaved text and discrete image tokens), and an extended three-modality framework incorporating speech. Mixture-of-Mamba consistently reaches the same loss values at earlier training steps with significantly reduced computational costs. In the Transfusion setting, Mixture-of-Mamba achieves equivalent image loss using only 34.76% of the training FLOPs at the 1.4B scale. In the Chameleon setting, Mixture-of-Mamba reaches similar image loss with just 42.50% of the FLOPs at the 1.4B scale, and similar text loss with just 65.40% of the FLOPs. In the three-modality setting, MoM matches speech loss at 24.80% of the FLOPs at the 1.4B scale. Our ablation study highlights the synergistic effects of decoupling projection components, where joint decoupling yields greater gains than individual modifications. These results establish modality-aware sparsity as a versatile and effective design principle, extending its impact from Transformers to SSMs and setting new benchmarks in multi-modal pretraining. 
Our code can be accessed at https://github.com/Weixin-Liang/Mixture-of-Mamba', 'score': 4, 'issue_id': 1898, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '011d06607305f0f8', 'authors': ['Weixin Liang', 'Junhong Shen', 'Genghan Zhang', 'Ning Dong', 'Luke Zettlemoyer', 'Lili Yu'], 'affiliations': ['Department of Computer Science, Stanford University', 'FAIR at Meta', 'Machine Learning Department, Carnegie Mellon University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16295.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Mixture-of-Mamba: Эффективное мультимодальное обучение с модальность-специфической разреженностью', 'desc': 'В этой статье представлена новая архитектура модели состояний (SSM) под названием Mixture-of-Mamba, которая вводит модальность-специфическую разреженность для мультимодального предобучения. Mixture-of-Mamba расширяет преимущества модальность-осведомленной разреженности на SSM, сохраняя при этом их вычислительную эффективность. Модель была оценена в трех настройках мультимодального предобучения: Transfusion, Chameleon и расширенной трехмодальной системе, включающей речь. Результаты показывают, что Mixture-of-Mamba достигает тех же значений потерь на более ранних этапах обучения со значительно меньшими вычислительными затратами по сравнению с базовыми моделями.'}, 'en': {'title': 'Revolutionizing Multi-Modal Learning with Efficient SSMs', 'desc': 'This paper introduces Mixture-of-Mamba, a new State Space Model (SSM) that enhances multi-modal pretraining by incorporating modality-aware sparsity. By parameterizing the Mamba block specifically for different modalities, the model efficiently utilizes features from various data types like text, images, and speech. The results show that Mixture-of-Mamba achieves comparable performance to existing models while significantly reducing computational costs, using fewer floating point operations (FLOPs). This work demonstrates the effectiveness of modality-aware sparsity in improving SSMs, setting new benchmarks in the field of multi-modal learning.'}, 'zh': {'title': '模态感知稀疏性:提升SSM的多模态预训练效率', 'desc': '状态空间模型(SSMs)作为序列建模的有效替代方案,面临无法利用特定模态特征的问题。我们提出了一种新颖的SSM架构——Mixture-of-Mamba,通过对Mamba模块进行模态特定参数化,引入了模态感知稀疏性。该模型在多模态预训练中表现出色,能够在较早的训练步骤中达到相同的损失值,同时显著降低计算成本。我们的研究表明,模态感知稀疏性是一个有效的设计原则,能够将其影响从变换器扩展到SSMs,并在多模态预训练中设定新的基准。'}}}, {'id': 'https://huggingface.co/papers/2501.15427', 'title': 'OpenCharacter: Training Customizable Role-Playing LLMs with Large-Scale Synthetic Personas', 'url': 'https://huggingface.co/papers/2501.15427', 'abstract': 'Customizable role-playing in large language models (LLMs), also known as character generalization, is gaining increasing attention for its versatility and cost-efficiency in developing and deploying role-playing dialogue agents. This study explores a large-scale data synthesis approach to equip LLMs with character generalization capabilities. We begin by synthesizing large-scale character profiles using personas from Persona Hub and then explore two strategies: response rewriting and response generation, to create character-aligned instructional responses. To validate the effectiveness of our synthetic instruction tuning data for character generalization, we perform supervised fine-tuning (SFT) using the LLaMA-3 8B model. Our best-performing model strengthens the original LLaMA-3 8B Instruct model and achieves performance comparable to GPT-4o models on role-playing dialogue. 
We release our synthetic characters and instruction-tuning dialogues to support public research.', 'score': 3, 'issue_id': 1910, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'fa7a70d2c9f398b9', 'authors': ['Xiaoyang Wang', 'Hongming Zhang', 'Tao Ge', 'Wenhao Yu', 'Dian Yu', 'Dong Yu'], 'affiliations': ['Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.15427.jpg', 'data': {'categories': ['#training', '#data', '#agents', '#dataset', '#open_source', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Обучение ИИ искусству перевоплощения', 'desc': 'Исследование посвящено обучению больших языковых моделей (LLM) способности к обобщению характеров персонажей. Авторы синтезируют большой набор профилей персонажей и диалогов для инструктивной настройки модели. Используя эти данные, они проводят supervised fine-tuning модели LLaMA-3 8B. Полученная модель показывает результаты, сравнимые с GPT-4 в задачах ролевого диалога.'}, 'en': {'title': 'Empowering LLMs with Character Generalization for Role-Playing', 'desc': 'This paper discusses how to improve large language models (LLMs) for role-playing tasks by enabling them to adopt different character personas. The authors create a large dataset of character profiles and use two methods—response rewriting and response generation—to produce responses that match these characters. They then fine-tune the LLaMA-3 8B model with this synthetic data to enhance its ability to generate character-aligned dialogues. The results show that their improved model performs similarly to advanced models like GPT-4o in role-playing scenarios, and they provide their resources for further research.'}, 'zh': {'title': '增强大型语言模型的角色扮演能力', 'desc': '本文研究了如何通过大规模数据合成来增强大型语言模型(LLMs)的角色扮演能力。我们首先利用Persona Hub合成大量角色档案,然后探索了两种策略:响应重写和响应生成,以创建与角色对齐的指令响应。通过对LLaMA-3 8B模型进行监督微调(SFT),我们验证了合成指令调优数据在角色泛化方面的有效性。最终,我们的最佳模型在角色扮演对话中表现出色,达到了与GPT-4o模型相当的性能,并公开发布了合成角色和指令调优对话以支持公共研究。'}}}, {'id': 'https://huggingface.co/papers/2501.12370', 'title': 'Parameters vs FLOPs: Scaling Laws for Optimal Sparsity for Mixture-of-Experts Language Models', 'url': 'https://huggingface.co/papers/2501.12370', 'abstract': "Scaling the capacity of language models has consistently proven to be a reliable approach for improving performance and unlocking new capabilities. Capacity can be primarily defined by two dimensions: the number of model parameters and the compute per example. While scaling typically involves increasing both, the precise interplay between these factors and their combined contribution to overall capacity remains not fully understood. We explore this relationship in the context of sparse Mixture-of-Experts (MoEs), which allow scaling the number of parameters without proportionally increasing the FLOPs per example. We investigate how varying the sparsity level, i.e., the fraction of inactive parameters, impacts model's performance during pretraining and downstream few-shot evaluation. We find that under different constraints (e.g., parameter size and total training compute), there is an optimal level of sparsity that improves both training efficiency and model performance. 
These results provide a better understanding of the impact of sparsity in scaling laws for MoEs and complement existing works in this area, offering insights for designing more efficient architectures.", 'score': 3, 'issue_id': 1905, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'bffcdc51c572d8f2', 'authors': ['Samira Abnar', 'Harshay Shah', 'Dan Busbridge', 'Alaaeldin Mohamed Elnouby Ali', 'Josh Susskind', 'Vimal Thilak'], 'affiliations': ['Apple', 'MIT'], 'pdf_title_img': 'assets/pdf/title_img/2501.12370.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Оптимальная разреженность - ключ к эффективному масштабированию языковых моделей', 'desc': 'Статья исследует взаимосвязь между количеством параметров и вычислительной мощностью в контексте разреженных моделей Mixture-of-Experts (MoE). Авторы изучают, как изменение уровня разреженности влияет на производительность модели во время предварительного обучения и последующей оценки few-shot. Результаты показывают, что существует оптимальный уровень разреженности, который улучшает как эффективность обучения, так и производительность модели. Это исследование дополняет существующие работы в области масштабирования языковых моделей и предлагает insights для разработки более эффективных архитектур.'}, 'en': {'title': 'Unlocking Efficiency: The Power of Sparsity in Language Models', 'desc': "This paper investigates how to improve language models by scaling their capacity, focusing on two main factors: the number of parameters and the compute required for each example. It specifically looks at sparse Mixture-of-Experts (MoEs), which allow for a larger number of parameters without a corresponding increase in computational load. The authors explore how different levels of sparsity, or the proportion of inactive parameters, affect the model's performance during training and evaluation. Their findings suggest that there is an optimal level of sparsity that enhances both efficiency and performance, providing valuable insights for developing more effective machine learning architectures."}, 'zh': {'title': '优化稀疏性,提升模型性能', 'desc': '本文探讨了语言模型容量的扩展,特别是在稀疏混合专家(MoEs)框架下。容量主要由模型参数数量和每个样本的计算量决定。研究发现,在不同的约束条件下,存在一个最佳的稀疏水平,可以提高训练效率和模型性能。此研究为理解稀疏性在MoEs扩展法则中的影响提供了新的视角,并为设计更高效的架构提供了见解。'}}}, {'id': 'https://huggingface.co/papers/2501.15420', 'title': 'Visual Generation Without Guidance', 'url': 'https://huggingface.co/papers/2501.15420', 'abstract': 'Classifier-Free Guidance (CFG) has been a default technique in various visual generative models, yet it requires inference from both conditional and unconditional models during sampling. We propose to build visual models that are free from guided sampling. The resulting algorithm, Guidance-Free Training (GFT), matches the performance of CFG while reducing sampling to a single model, halving the computational cost. Unlike previous distillation-based approaches that rely on pretrained CFG networks, GFT enables training directly from scratch. GFT is simple to implement. It retains the same maximum likelihood objective as CFG and differs mainly in the parameterization of conditional models. Implementing GFT requires only minimal modifications to existing codebases, as most design choices and hyperparameters are directly inherited from CFG. Our extensive experiments across five distinct visual models demonstrate the effectiveness and versatility of GFT. 
Across domains of diffusion, autoregressive, and masked-prediction modeling, GFT consistently achieves comparable or even lower FID scores, with similar diversity-fidelity trade-offs compared with CFG baselines, all while being guidance-free. Code will be available at https://github.com/thu-ml/GFT.', 'score': 2, 'issue_id': 1912, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'd7e67912a685cbf9', 'authors': ['Huayu Chen', 'Kai Jiang', 'Kaiwen Zheng', 'Jianfei Chen', 'Hang Su', 'Jun Zhu'], 'affiliations': ['Department of Computer Science & Technology, Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.15420.jpg', 'data': {'categories': ['#optimization', '#diffusion', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'GFT: Эффективная генерация изображений без направляющей выборки', 'desc': 'Статья представляет новый метод обучения визуальных генеративных моделей - Guidance-Free Training (GFT). GFT позволяет достичь производительности Classifier-Free Guidance (CFG), но требует вдвое меньше вычислений при генерации изображений. Метод прост в реализации и может применяться для обучения моделей с нуля. Эксперименты показали эффективность GFT для различных типов моделей, включая диффузионные, авторегрессионные и модели с маскированием.'}, 'en': {'title': 'Guidance-Free Training: Simplifying Visual Generative Models', 'desc': 'This paper introduces Guidance-Free Training (GFT), a new approach for visual generative models that eliminates the need for classifier-free guidance during sampling. GFT achieves similar performance to traditional Classifier-Free Guidance (CFG) while only requiring a single model for inference, thus reducing computational costs by half. The method allows for training from scratch, avoiding reliance on pre-trained CFG networks, and retains the same maximum likelihood objective as CFG with minimal changes to existing implementations. Extensive experiments show that GFT performs comparably or better than CFG across various visual modeling domains, maintaining a good balance between diversity and fidelity.'}, 'zh': {'title': '无引导训练:降低计算成本的视觉生成新方法', 'desc': '无引导采样的视觉模型是本研究的核心。我们提出的无引导训练(GFT)算法在性能上与传统的分类器引导(CFG)相当,但只需使用单一模型进行采样,从而减少了计算成本。GFT可以直接从头开始训练,而不依赖于预训练的CFG网络,且实现简单。通过在五种不同的视觉模型上进行广泛实验,我们证明了GFT的有效性和多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.16273', 'title': 'Return of the Encoder: Maximizing Parameter Efficiency for SLMs', 'url': 'https://huggingface.co/papers/2501.16273', 'abstract': "The dominance of large decoder-only language models has overshadowed encoder-decoder architectures, despite their fundamental efficiency advantages in sequence processing. For small language models (SLMs) - those with 1 billion parameters or fewer - our systematic analysis across GPU, CPU, and NPU platforms reveals that encoder-decoder architectures achieve 47% lower first-token latency and 4.7x higher throughput compared to decoder-only models on edge devices. These gains may be attributed to encoder-decoder's one-time input processing and efficient separation of understanding and generation phases. 
We introduce a novel knowledge distillation framework that enables encoder-decoder models to leverage capabilities from large scalable decoder-only teachers while preserving their architectural advantages, achieving up to 6 average performance points improvement across diverse tasks, with significant gains in asymmetric sequence tasks where input and output distributions can benefit from different processing approaches. When combined with modern advances like Rotary Positional Embeddings (RoPE) and Vision encoders, our systematic investigation demonstrates that encoder-decoder architectures provide a more practical path toward deploying capable language models in resource-constrained environments. Our findings challenge the prevailing trend toward decoder-only scaling, showing that architectural choices become increasingly crucial as parameter budgets decrease, particularly for on-device and edge deployments where computational efficiency is paramount.", 'score': 2, 'issue_id': 1910, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': 'bd97733bda9e3557', 'authors': ['Mohamed Elfeki', 'Rui Liu', 'Chad Voegele'], 'affiliations': ['Microsoft'], 'pdf_title_img': 'assets/pdf/title_img/2501.16273.jpg', 'data': {'categories': ['#architecture', '#training', '#small_models', '#optimization', '#transfer_learning'], 'emoji': '🤖', 'ru': {'title': 'Энкодер-декодер: эффективное решение для малых языковых моделей', 'desc': 'Исследование показывает преимущества архитектуры энкодер-декодер для малых языковых моделей (до 1 млрд параметров) по сравнению с декодер-онли моделями. На периферийных устройствах энкодер-декодер модели демонстрируют на 47% меньшую задержку первого токена и в 4,7 раза большую пропускную способность. Предложен новый фреймворк дистилляции знаний, позволяющий энкодер-декодер моделям использовать возможности больших декодер-онли учителей. Результаты исследования ставят под сомнение тренд на масштабирование декодер-онли архитектур, особенно для ресурсно-ограниченных сред.'}, 'en': {'title': 'Unlocking Efficiency: The Power of Encoder-Decoder Models in Small Language Tasks', 'desc': 'This paper highlights the advantages of encoder-decoder architectures over large decoder-only language models, especially for small language models (SLMs) with 1 billion parameters or fewer. The authors demonstrate that encoder-decoder models can achieve significantly lower latency and higher throughput on edge devices due to their efficient processing of input and separation of understanding and generation phases. They introduce a new knowledge distillation framework that allows these models to benefit from the capabilities of larger decoder-only models while maintaining their efficiency. The study concludes that as parameter budgets decrease, the choice of architecture becomes critical for effective deployment in resource-constrained environments.'}, 'zh': {'title': '编码-解码架构的优势与应用', 'desc': '本论文分析了编码-解码架构在小型语言模型(SLMs)中的优势,尤其是在边缘设备上的表现。研究表明,编码-解码模型在首次令牌延迟上比仅解码模型低47%,并且吞吐量提高了4.7倍。这种优势源于编码-解码架构的一次性输入处理和理解与生成阶段的高效分离。我们还提出了一种新的知识蒸馏框架,使编码-解码模型能够利用大型解码教师的能力,同时保持其架构优势。'}}}, {'id': 'https://huggingface.co/papers/2501.14912', 'title': 'Feasible Learning', 'url': 'https://huggingface.co/papers/2501.14912', 'abstract': 'We introduce Feasible Learning (FL), a sample-centric learning paradigm where models are trained by solving a feasibility problem that bounds the loss for each training sample. 
In contrast to the ubiquitous Empirical Risk Minimization (ERM) framework, which optimizes for average performance, FL demands satisfactory performance on every individual data point. Since any model that meets the prescribed performance threshold is a valid FL solution, the choice of optimization algorithm and its dynamics play a crucial role in shaping the properties of the resulting solutions. In particular, we study a primal-dual approach which dynamically re-weights the importance of each sample during training. To address the challenge of setting a meaningful threshold in practice, we introduce a relaxation of FL that incorporates slack variables of minimal norm. Our empirical analysis, spanning image classification, age regression, and preference optimization in large language models, demonstrates that models trained via FL can learn from data while displaying improved tail behavior compared to ERM, with only a marginal impact on average performance.', 'score': 2, 'issue_id': 1898, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '7ded44debecf7694', 'authors': ['Juan Ramirez', 'Ignacio Hounie', 'Juan Elenter', 'Jose Gallego-Posada', 'Meraj Hashemizadeh', 'Alejandro Ribeiro', 'Simon Lacoste-Julien'], 'affiliations': ['Canada CIFAR AI Chair', 'Mila & Université de Montréal', 'Spotify', 'University of Pennsylvania'], 'pdf_title_img': 'assets/pdf/title_img/2501.14912.jpg', 'data': {'categories': ['#training', '#optimization'], 'emoji': '🎯', 'ru': {'title': 'Индивидуальный подход к каждому образцу данных', 'desc': 'В статье представлена новая парадигма обучения моделей машинного обучения - Feasible Learning (FL). В отличие от традиционного подхода минимизации эмпирического риска (ERM), FL стремится обеспечить удовлетворительную производительность для каждого отдельного образца данных. Авторы предлагают примально-двойственный подход, который динамически переопределяет важность каждого образца во время обучения. Эмпирический анализ на задачах классификации изображений, регрессии возраста и оптимизации предпочтений в больших языковых моделях показывает, что модели, обученные с помощью FL, демонстрируют улучшенное поведение на редких случаях по сравнению с ERM.'}, 'en': {'title': 'Ensuring Individual Sample Success with Feasible Learning', 'desc': "Feasible Learning (FL) is a new approach in machine learning that focuses on ensuring each training sample meets a specific performance standard, rather than just optimizing for overall average performance like traditional methods. This paradigm treats the training process as a feasibility problem, where any model that satisfies the performance criteria for all samples is considered valid. The paper explores a primal-dual optimization technique that adjusts the importance of each sample during training, enhancing the model's ability to learn effectively. 
Through various applications, including image classification and language model optimization, FL shows improved performance on challenging cases while maintaining similar average results compared to conventional methods."}, 'zh': {'title': '可行学习:每个样本都要优秀!', 'desc': '我们介绍了一种新的学习范式,称为可行学习(Feasible Learning,FL),它通过解决一个可行性问题来训练模型,从而限制每个训练样本的损失。与传统的经验风险最小化(Empirical Risk Minimization,ERM)框架不同,FL要求每个数据点都能达到满意的性能。FL的有效性依赖于优化算法的选择及其动态调整样本重要性的能力。我们的实证分析表明,使用FL训练的模型在图像分类、年龄回归和大语言模型的偏好优化中,能够在保持平均性能的同时,改善模型在极端情况下的表现。'}}}, {'id': 'https://huggingface.co/papers/2501.08325', 'title': 'GameFactory: Creating New Games with Generative Interactive Videos', 'url': 'https://huggingface.co/papers/2501.08325', 'abstract': 'Generative game engines have the potential to revolutionize game development by autonomously creating new content and reducing manual workload. However, existing video-based game generation methods fail to address the critical challenge of scene generalization, limiting their applicability to existing games with fixed styles and scenes. In this paper, we present GameFactory, a framework focused on exploring scene generalization in game video generation. To enable the creation of entirely new and diverse games, we leverage pre-trained video diffusion models trained on open-domain video data. To bridge the domain gap between open-domain priors and small-scale game dataset, we propose a multi-phase training strategy that decouples game style learning from action control, preserving open-domain generalization while achieving action controllability. Using Minecraft as our data source, we release GF-Minecraft, a high-quality and diversity action-annotated video dataset for research. Furthermore, we extend our framework to enable autoregressive action-controllable game video generation, allowing the production of unlimited-length interactive game videos. Experimental results demonstrate that GameFactory effectively generates open-domain, diverse, and action-controllable game videos, representing a significant step forward in AI-driven game generation. Our dataset and project page are publicly available at https://vvictoryuki.github.io/gamefactory/.', 'score': 47, 'issue_id': 1773, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '0331c9576ced4090', 'authors': ['Jiwen Yu', 'Yiran Qin', 'Xintao Wang', 'Pengfei Wan', 'Di Zhang', 'Xihui Liu'], 'affiliations': ['Kuaishou Technology', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.08325.jpg', 'data': {'categories': ['#dataset', '#video', '#open_source', '#diffusion', '#games', '#training', '#multimodal'], 'emoji': '🎮', 'ru': {'title': 'GameFactory: ИИ-революция в создании видеоигр', 'desc': 'GameFactory - это новая система для генерации видео игр с возможностью обобщения на различные сцены. Она использует предобученные модели диффузии видео на общих данных, что позволяет создавать разнообразные новые игры. Авторы предлагают многоэтапную стратегию обучения, которая разделяет изучение стиля игры и контроль действий. Система также поддерживает авторегрессивную генерацию видео игр с контролем действий неограниченной длины.'}, 'en': {'title': 'Revolutionizing Game Development with Scene Generalization', 'desc': 'This paper introduces GameFactory, a novel framework aimed at enhancing scene generalization in game video generation. 
It addresses the limitations of current methods that struggle with fixed styles and scenes by utilizing pre-trained video diffusion models on diverse video data. The authors propose a multi-phase training strategy that separates game style learning from action control, allowing for better generalization and controllability. The framework is validated using a new dataset, GF-Minecraft, which supports the generation of diverse and interactive game videos, marking a significant advancement in AI-driven game development.'}, 'zh': {'title': 'GameFactory:革命性的游戏视频生成框架', 'desc': '本论文介绍了GameFactory框架,旨在解决游戏视频生成中的场景泛化问题。现有的视频生成方法无法适应不同风格和场景的游戏,限制了其应用。我们利用预训练的视频扩散模型,并提出多阶段训练策略,以实现游戏风格学习与动作控制的解耦。实验结果表明,GameFactory能够有效生成开放域、多样化且可控的游戏视频,推动了AI驱动的游戏生成技术的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.09781', 'title': 'VideoWorld: Exploring Knowledge Learning from Unlabeled Videos', 'url': 'https://huggingface.co/papers/2501.09781', 'abstract': 'This work explores whether a deep generative model can learn complex knowledge solely from visual input, in contrast to the prevalent focus on text-based models like large language models (LLMs). We develop VideoWorld, an auto-regressive video generation model trained on unlabeled video data, and test its knowledge acquisition abilities in video-based Go and robotic control tasks. Our experiments reveal two key findings: (1) video-only training provides sufficient information for learning knowledge, including rules, reasoning and planning capabilities, and (2) the representation of visual change is crucial for knowledge acquisition. To improve both the efficiency and efficacy of this process, we introduce the Latent Dynamics Model (LDM) as a key component of VideoWorld. Remarkably, VideoWorld reaches a 5-dan professional level in the Video-GoBench with just a 300-million-parameter model, without relying on search algorithms or reward mechanisms typical in reinforcement learning. In robotic tasks, VideoWorld effectively learns diverse control operations and generalizes across environments, approaching the performance of oracle models in CALVIN and RLBench. This study opens new avenues for knowledge acquisition from visual data, with all code, data, and models open-sourced for further research.', 'score': 6, 'issue_id': 1779, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'db65df971ed9f199', 'authors': ['Zhongwei Ren', 'Yunchao Wei', 'Xun Guo', 'Yao Zhao', 'Bingyi Kang', 'Jiashi Feng', 'Xiaojie Jin'], 'affiliations': ['Beijing Jiaotong University', 'ByteDance Seed', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09781.jpg', 'data': {'categories': ['#agents', '#video', '#open_source', '#small_models', '#rl', '#games', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'Визуальное обучение: от видео к глубоким знаниям', 'desc': 'Исследование посвящено обучению глубокой генеративной модели сложным знаниям исключительно на основе визуальных данных. Разработана модель VideoWorld, обученная на немаркированных видеоданных, которая тестируется на задачах игры в го и управления роботами. Ключевые выводы: визуальное обучение достаточно для приобретения знаний, включая правила, рассуждения и планирование, а представление визуальных изменений критично для этого процесса. 
Модель достигает уровня профессионала 5 дана в го и эффективно обучается управлению роботами в различных средах.'}, 'en': {'title': "Learning Knowledge from Visuals: VideoWorld's Breakthrough", 'desc': 'This paper investigates the ability of a deep generative model to learn complex knowledge from visual inputs, rather than relying on text-based models. The authors introduce VideoWorld, an auto-regressive model that generates videos and learns from unlabeled video data, demonstrating its effectiveness in tasks like video-based Go and robotic control. Key findings indicate that training solely on video data is sufficient for acquiring knowledge such as rules and reasoning, and that understanding visual changes is essential for this learning process. The introduction of the Latent Dynamics Model enhances the efficiency of knowledge acquisition, allowing VideoWorld to achieve high performance in various tasks without traditional reinforcement learning techniques.'}, 'zh': {'title': '从视觉数据中获取知识的新方法', 'desc': '本研究探讨了深度生成模型是否可以仅通过视觉输入学习复杂知识,而不是依赖于文本模型。我们开发了VideoWorld,这是一个基于自回归的视频生成模型,训练于未标记的视频数据,并测试其在视频围棋和机器人控制任务中的知识获取能力。实验结果表明,视频训练提供了足够的信息来学习规则、推理和规划能力,视觉变化的表示对知识获取至关重要。通过引入潜在动态模型(LDM),VideoWorld在视频围棋基准测试中达到了5段专业水平,且在机器人任务中有效学习了多种控制操作。'}}}, {'id': 'https://huggingface.co/papers/2501.09284', 'title': 'SEAL: Entangled White-box Watermarks on Low-Rank Adaptation', 'url': 'https://huggingface.co/papers/2501.09284', 'abstract': 'Recently, LoRA and its variants have become the de facto strategy for training and sharing task-specific versions of large pretrained models, thanks to their efficiency and simplicity. However, the issue of copyright protection for LoRA weights, especially through watermark-based techniques, remains underexplored. To address this gap, we propose SEAL (SEcure wAtermarking on LoRA weights), the universal whitebox watermarking for LoRA. SEAL embeds a secret, non-trainable matrix between trainable LoRA weights, serving as a passport to claim ownership. SEAL then entangles the passport with the LoRA weights through training, without extra loss for entanglement, and distributes the finetuned weights after hiding the passport. When applying SEAL, we observed no performance degradation across commonsense reasoning, textual/visual instruction tuning, and text-to-image synthesis tasks. We demonstrate that SEAL is robust against a variety of known attacks: removal, obfuscation, and ambiguity attacks.', 'score': 2, 'issue_id': 1782, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '3c8f91b49b49bdd2', 'authors': ['Giyeong Oh', 'Saejin Kim', 'Woohyun Cho', 'Sangkyu Lee', 'Jiwan Chung', 'Dokyung Song', 'Youngjae Yu'], 'affiliations': ['Department of Artificial Intelligence, Yonsei University, Seoul, Republic of Korea', 'Department of Computer Science and Engineering, Yonsei University, Seoul, Republic of Korea'], 'pdf_title_img': 'assets/pdf/title_img/2501.09284.jpg', 'data': {'categories': ['#training', '#architecture', '#security'], 'emoji': '🔐', 'ru': {'title': 'SEAL: Защита авторских прав на LoRA-веса с помощью водяных знаков', 'desc': 'Статья представляет SEAL - универсальный метод водяных знаков для весов LoRA. SEAL встраивает секретную матрицу между обучаемыми весами LoRA, которая служит паспортом для подтверждения авторства. Метод не ухудшает производительность модели на различных задачах обработки естественного языка и компьютерного зрения. 
SEAL демонстрирует устойчивость к известным атакам на водяные знаки, таким как удаление, обфускация и атаки неоднозначности.'}, 'en': {'title': 'SEAL: Safeguarding LoRA Weights with Robust Watermarking', 'desc': "This paper introduces SEAL, a watermarking technique designed to protect LoRA weights used in machine learning. SEAL embeds a secret matrix within the trainable weights, allowing owners to claim their models without affecting performance. The method ensures that the watermark is integrated during training, maintaining the model's effectiveness across various tasks. Additionally, SEAL demonstrates resilience against common attacks aimed at removing or obscuring the watermark."}, 'zh': {'title': '保护LoRA权重的水印技术', 'desc': '最近,LoRA及其变体成为训练和共享特定任务的大型预训练模型的主要策略,因其高效和简单。然而,LoRA权重的版权保护问题,特别是基于水印的技术,仍然未得到充分研究。为了解决这个问题,我们提出了SEAL(LoRA权重的安全水印),这是一种通用的白盒水印技术。SEAL在可训练的LoRA权重之间嵌入一个秘密的、不可训练的矩阵,作为所有权的凭证,并在训练过程中将其与LoRA权重纠缠,确保性能不下降。'}}}, {'id': 'https://huggingface.co/papers/2501.09891', 'title': 'Evolving Deeper LLM Thinking', 'url': 'https://huggingface.co/papers/2501.09891', 'abstract': 'We explore an evolutionary search strategy for scaling inference time compute in Large Language Models. The proposed approach, Mind Evolution, uses a language model to generate, recombine and refine candidate responses. The proposed approach avoids the need to formalize the underlying inference problem whenever a solution evaluator is available. Controlling for inference cost, we find that Mind Evolution significantly outperforms other inference strategies such as Best-of-N and Sequential Revision in natural language planning tasks. In the TravelPlanner and Natural Plan benchmarks, Mind Evolution solves more than 98% of the problem instances using Gemini 1.5 Pro without the use of a formal solver.', 'score': 55, 'issue_id': 1750, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'f2f5bbede5781334', 'authors': ['Kuang-Huei Lee', 'Ian Fischer', 'Yueh-Hua Wu', 'Dave Marwood', 'Shumeet Baluja', 'Dale Schuurmans', 'Xinyun Chen'], 'affiliations': ['Google DeepMind', 'UC San Diego', 'University of Alberta'], 'pdf_title_img': 'assets/pdf/title_img/2501.09891.jpg', 'data': {'categories': ['#benchmark', '#inference', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Эволюция мышления: новый подход к оптимизации вывода в языковых моделях', 'desc': 'Статья представляет эволюционную стратегию поиска для масштабирования вычислений во время вывода в больших языковых моделях. Метод, названный Mind Evolution, использует языковую модель для генерации, рекомбинации и уточнения кандидатов-ответов. Этот подход устраняет необходимость формализации исходной задачи вывода, если доступен оценщик решений. При контроле за стоимостью вычислений, Mind Evolution значительно превосходит другие стратегии вывода в задачах планирования на естественном языке.'}, 'en': {'title': 'Mind Evolution: Revolutionizing Inference in Large Language Models', 'desc': 'This paper presents Mind Evolution, an innovative evolutionary search strategy designed to enhance the inference time of Large Language Models (LLMs). By leveraging a language model, Mind Evolution generates, recombines, and refines potential responses without needing to define the inference problem formally, as long as a solution evaluator is available. The results demonstrate that Mind Evolution significantly outperforms traditional inference methods like Best-of-N and Sequential Revision in natural language planning tasks. 
In benchmarks such as TravelPlanner and Natural Plan, Mind Evolution successfully solves over 98% of instances using Gemini 1.5 Pro, showcasing its effectiveness without relying on a formal solver.'}, 'zh': {'title': 'Mind Evolution:推理效率的新突破', 'desc': '本文探讨了一种用于大语言模型推理时间计算的进化搜索策略,称为Mind Evolution。该方法利用语言模型生成、重组和优化候选响应,避免了在有解决方案评估器的情况下需要形式化推理问题。通过控制推理成本,我们发现Mind Evolution在自然语言规划任务中显著优于其他推理策略,如Best-of-N和Sequential Revision。在TravelPlanner和Natural Plan基准测试中,Mind Evolution在不使用正式求解器的情况下,解决了超过98%的问题实例。'}}}, {'id': 'https://huggingface.co/papers/2501.10120', 'title': 'PaSa: An LLM Agent for Comprehensive Academic Paper Search', 'url': 'https://huggingface.co/papers/2501.10120', 'abstract': 'We introduce PaSa, an advanced Paper Search agent powered by large language models. PaSa can autonomously make a series of decisions, including invoking search tools, reading papers, and selecting relevant references, to ultimately obtain comprehensive and accurate results for complex scholarly queries. We optimize PaSa using reinforcement learning with a synthetic dataset, AutoScholarQuery, which includes 35k fine-grained academic queries and corresponding papers sourced from top-tier AI conference publications. Additionally, we develop RealScholarQuery, a benchmark collecting real-world academic queries to assess PaSa performance in more realistic scenarios. Despite being trained on synthetic data, PaSa significantly outperforms existing baselines on RealScholarQuery, including Google, Google Scholar, Google with GPT-4 for paraphrased queries, chatGPT (search-enabled GPT-4o), GPT-o1, and PaSa-GPT-4o (PaSa implemented by prompting GPT-4o). Notably, PaSa-7B surpasses the best Google-based baseline, Google with GPT-4o, by 37.78% in recall@20 and 39.90% in recall@50. It also exceeds PaSa-GPT-4o by 30.36% in recall and 4.25% in precision. Model, datasets, and code are available at https://github.com/bytedance/pasa.', 'score': 17, 'issue_id': 1750, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'bf3bfc73e6d5b31d', 'authors': ['Yichen He', 'Guanhua Huang', 'Peiyuan Feng', 'Yuan Lin', 'Yuchen Zhang', 'Hang Li', 'Weinan E'], 'affiliations': ['ByteDance Research', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.10120.jpg', 'data': {'categories': ['#agents', '#synthetic', '#benchmark', '#open_source', '#dataset', '#rl', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'PaSa: ИИ-агент для эффективного поиска научных статей', 'desc': 'PaSa - это продвинутый агент для поиска научных статей, основанный на больших языковых моделях. Он способен автономно принимать решения, включая использование поисковых инструментов, чтение статей и выбор релевантных ссылок для получения комплексных и точных результатов по сложным научным запросам. PaSa оптимизирован с помощью обучения с подкреплением на синтетическом наборе данных AutoScholarQuery, содержащем 35 тысяч детализированных академических запросов и соответствующих статей из ведущих конференций по ИИ. Несмотря на обучение на синтетических данных, PaSa значительно превосходит существующие базовые модели на реальном тестовом наборе RealScholarQuery, включая Google и ChatGPT.'}, 'en': {'title': 'Revolutionizing Academic Search with PaSa!', 'desc': 'The paper presents PaSa, a sophisticated Paper Search agent that utilizes large language models to enhance academic research. 
PaSa autonomously navigates the search process by making decisions such as invoking search tools, analyzing papers, and selecting pertinent references to deliver thorough and precise results for complex queries. It is optimized through reinforcement learning using a synthetic dataset called AutoScholarQuery, which contains 35,000 detailed academic queries and related papers from leading AI conferences. The performance of PaSa is evaluated against real-world queries using the RealScholarQuery benchmark, demonstrating significant improvements over existing search tools, including Google and various GPT models.'}, 'zh': {'title': 'PaSa:智能论文搜索的新纪元', 'desc': '本文介绍了一种名为PaSa的先进论文搜索代理,利用大型语言模型进行自主决策。PaSa能够调用搜索工具、阅读论文并选择相关参考文献,以获取复杂学术查询的全面和准确结果。我们通过强化学习优化PaSa,使用了一个包含35,000个细粒度学术查询的合成数据集AutoScholarQuery。尽管在合成数据上训练,PaSa在真实学术查询基准RealScholarQuery上的表现显著优于现有的基线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.09775', 'title': 'Multiple Choice Questions: Reasoning Makes Large Language Models (LLMs) More Self-Confident Even When They Are Wrong', 'url': 'https://huggingface.co/papers/2501.09775', 'abstract': 'One of the most widely used methods to evaluate LLMs are Multiple Choice Question (MCQ) tests. MCQ benchmarks enable the testing of LLM knowledge on almost any topic at scale as the results can be processed automatically. To help the LLM answer, a few examples called few shots can be included in the prompt. Moreover, the LLM can be asked to answer the question directly with the selected option or to first provide the reasoning and then the selected answer, which is known as chain of thought. In addition to checking whether the selected answer is correct, the evaluation can look at the LLM-estimated probability of its response as an indication of the confidence of the LLM in the response. In this paper, we study how the LLM confidence in its answer depends on whether the model has been asked to answer directly or to provide the reasoning before answering. The results of the evaluation of questions on a wide range of topics in seven different models show that LLMs are more confident in their answers when they provide reasoning before the answer. This occurs regardless of whether the selected answer is correct. Our hypothesis is that this behavior is due to the reasoning that modifies the probability of the selected answer, as the LLM predicts the answer based on the input question and the reasoning that supports the selection made. Therefore, LLM estimated probabilities seem to have intrinsic limitations that should be understood in order to use them in evaluation procedures. 
Interestingly, the same behavior has been observed in humans, for whom explaining an answer increases confidence in its correctness.', 'score': 12, 'issue_id': 1756, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'eb8938131508de10', 'authors': ['Tairan Fu', 'Javier Conde', 'Gonzalo Martínez', 'María Grandury', 'Pedro Reviriego'], 'affiliations': ['College of Mechanical and Electrical Engineering Nanjing University of Aeronautics and Astronautics Nanjing, China', 'ETSI de Telecomunicación Universidad Politécnica de Madrid Madrid, Spain', 'SomosNLP/Universidad Politécnica de Madrid Madrid, Spain', 'Universidad Carlos III de Madrid Madrid, Spain'], 'pdf_title_img': 'assets/pdf/title_img/2501.09775.jpg', 'data': {'categories': ['#benchmark', '#hallucinations', '#training', '#reasoning'], 'emoji': '🤔', 'ru': {'title': 'Рассуждения повышают уверенность ИИ, но не точность', 'desc': 'Статья исследует влияние цепочки рассуждений на уверенность языковых моделей в ответах на вопросы с множественным выбором. Авторы обнаружили, что модели более уверены в своих ответах, когда они предоставляют рассуждения перед ответом, независимо от правильности ответа. Это поведение наблюдалось у семи различных моделей на широком спектре тем. Исследователи предполагают, что это связано с тем, как рассуждения модифицируют вероятность выбранного ответа в процессе генерации.'}, 'en': {'title': 'Boosting LLM Confidence Through Reasoning!', 'desc': "This paper investigates how the confidence of large language models (LLMs) in their answers is influenced by the method of response. Specifically, it compares direct answers to those that include reasoning, known as the chain of thought approach. The study finds that LLMs exhibit higher confidence in their answers when they provide reasoning first, regardless of the correctness of the answer. This suggests that the reasoning process alters the model's probability estimates, highlighting potential limitations in using these probabilities for evaluation purposes."}, 'zh': {'title': '推理提升LLM回答信心的秘密', 'desc': '本文研究了大型语言模型(LLM)在回答多项选择题时的信心如何受到回答方式的影响。通过提供推理过程,LLM在选择答案时表现出更高的信心,无论所选答案是否正确。研究表明,推理过程会改变LLM对所选答案的概率估计,这可能是LLM信心的内在限制。类似的现象也在人的回答中观察到,解释答案会提高对其正确性的信心。'}}}, {'id': 'https://huggingface.co/papers/2501.10020', 'title': 'Textoon: Generating Vivid 2D Cartoon Characters from Text Descriptions', 'url': 'https://huggingface.co/papers/2501.10020', 'abstract': 'The 2D cartoon style is a prominent art form in digital character creation, particularly popular among younger audiences. While advancements in digital human technology have spurred extensive research into photorealistic digital humans and 3D characters, interactive 2D cartoon characters have received comparatively less attention. Unlike 3D counterparts, which require sophisticated construction and resource-intensive rendering, Live2D, a widely-used format for 2D cartoon characters, offers a more efficient alternative, which allows to animate 2D characters in a manner that simulates 3D movement without the necessity of building a complete 3D model. Furthermore, Live2D employs lightweight HTML5 (H5) rendering, improving both accessibility and efficiency. In this technical report, we introduce Textoon, an innovative method for generating diverse 2D cartoon characters in the Live2D format based on text descriptions. 
The Textoon leverages cutting-edge language and vision models to comprehend textual intentions and generate 2D appearance, capable of creating a wide variety of stunning and interactive 2D characters within one minute. The project homepage is https://human3daigc.github.io/Textoon_webpage/.', 'score': 12, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '828788f94bccbdc9', 'authors': ['Chao He', 'Jianqiang Ren', 'Liefeng Bo'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10020.jpg', 'data': {'categories': ['#3d', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'Textoon: ИИ создает 2D мультперсонажей по текстовому описанию', 'desc': 'В статье представлен метод Textoon для создания 2D мультипликационных персонажей в формате Live2D на основе текстовых описаний. Textoon использует современные языковые и визуальные модели для понимания текстовых намерений и генерации 2D внешнего вида персонажей. Метод способен создавать разнообразных интерактивных 2D персонажей менее чем за минуту. Live2D предлагает эффективную альтернативу 3D моделям, позволяя анимировать 2D персонажей, имитируя 3D движение, без необходимости создания полной 3D модели.'}, 'en': {'title': 'Transforming Text into 2D Cartoon Characters with Textoon!', 'desc': 'This paper presents Textoon, a novel approach for generating diverse 2D cartoon characters using the Live2D format. By utilizing advanced language and vision models, Textoon interprets text descriptions to create visually appealing and interactive characters efficiently. Unlike traditional 3D character models, Textoon allows for quick generation of 2D characters that simulate 3D movement without extensive resources. The method enhances accessibility and efficiency in digital character creation, catering especially to younger audiences.'}, 'zh': {'title': 'Textoon:快速生成多样化2D卡通角色的创新方法', 'desc': '这篇论文介绍了一种名为Textoon的方法,用于根据文本描述生成多样化的2D卡通角色。与3D角色相比,2D卡通角色的动画制作更为高效,Textoon利用先进的语言和视觉模型来理解文本意图,并生成2D外观。该方法使用Live2D格式,使得角色动画能够模拟3D运动,而无需构建完整的3D模型。Textoon能够在一分钟内创建出多种令人惊叹和互动的2D角色,提升了数字角色创作的效率和可访问性。'}}}, {'id': 'https://huggingface.co/papers/2501.09825', 'title': 'Bridging Language Barriers in Healthcare: A Study on Arabic LLMs', 'url': 'https://huggingface.co/papers/2501.09825', 'abstract': 'This paper investigates the challenges of developing large language models (LLMs) proficient in both multilingual understanding and medical knowledge. We demonstrate that simply translating medical data does not guarantee strong performance on clinical tasks in the target language. Our experiments reveal that the optimal language mix in training data varies significantly across different medical tasks. We find that larger models with carefully calibrated language ratios achieve superior performance on native-language clinical tasks. Furthermore, our results suggest that relying solely on fine-tuning may not be the most effective approach for incorporating new language knowledge into LLMs. Instead, data and computationally intensive pretraining methods may still be necessary to achieve optimal performance in multilingual medical settings. 
These findings provide valuable guidance for building effective and inclusive medical AI systems for diverse linguistic communities.', 'score': 8, 'issue_id': 1758, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'a2bf2d3dc7e978d7', 'authors': ['Nada Saadi', 'Tathagata Raha', 'Clément Christophe', 'Marco AF Pimentel', 'Ronnie Rajan', 'Praveen K Kanithi'], 'affiliations': ['M42 Health, Abu Dhabi, UAE'], 'pdf_title_img': 'assets/pdf/title_img/2501.09825.jpg', 'data': {'categories': ['#healthcare', '#training', '#science', '#low_resource', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Многоязычные медицинские LLM: больше, чем просто перевод', 'desc': 'Статья исследует проблемы разработки больших языковых моделей (LLM), обладающих как многоязычным пониманием, так и медицинскими знаниями. Авторы показывают, что простой перевод медицинских данных не гарантирует высокой производительности на клинических задачах в целевом языке. Эксперименты выявляют, что оптимальное соотношение языков в обучающих данных значительно варьируется для разных медицинских задач. Результаты также указывают на то, что для включения знаний нового языка в LLM может потребоваться ресурсоемкое предобучение, а не только тонкая настройка.'}, 'en': {'title': 'Optimizing Multilingual Medical AI: Beyond Translation and Fine-Tuning', 'desc': 'This paper explores the difficulties in creating large language models (LLMs) that can understand multiple languages and possess medical expertise. It shows that merely translating medical information does not ensure good performance in clinical tasks for different languages. The research indicates that the best combination of languages in training data changes depending on the specific medical task. Additionally, it suggests that larger models with well-balanced language inputs perform better, and that extensive pretraining may be more beneficial than just fine-tuning for integrating new language capabilities.'}, 'zh': {'title': '多语言医学模型的优化之道', 'desc': '本论文探讨了开发能够理解多种语言和医学知识的大型语言模型(LLMs)所面临的挑战。我们证明,仅仅翻译医学数据并不能保证在目标语言的临床任务中表现良好。实验结果显示,不同医学任务对训练数据中的语言组合有显著不同的最佳需求。我们的研究表明,经过精心调整语言比例的大型模型在本土语言的临床任务中表现更佳,而仅依赖微调可能不是将新语言知识有效融入LLMs的最佳方法。'}}}, {'id': 'https://huggingface.co/papers/2501.10021', 'title': 'X-Dyna: Expressive Dynamic Human Image Animation', 'url': 'https://huggingface.co/papers/2501.10021', 'abstract': 'We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. 
Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.', 'score': 5, 'issue_id': 1752, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '4163d7e5ec4b04ce', 'authors': ['Di Chang', 'Hongyi Xu', 'You Xie', 'Yipeng Gao', 'Zhengfei Kuang', 'Shengqu Cai', 'Chenxu Zhang', 'Guoxian Song', 'Chao Wang', 'Yichun Shi', 'Zeyuan Chen', 'Shijie Zhou', 'Linjie Luo', 'Gordon Wetzstein', 'Mohammad Soleymani'], 'affiliations': ['ByteDance', 'Stanford University', 'University of California Los Angeles', 'University of California San Diego', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.10021.jpg', 'data': {'categories': ['#architecture', '#synthetic', '#diffusion', '#cv', '#video', '#multimodal'], 'emoji': '🎭', 'ru': {'title': 'Оживление статичных изображений с помощью ИИ: реалистичная анимация человека и окружения', 'desc': 'X-Dyna - это новый подход к анимации изображений человека с нуля, основанный на диффузионных моделях. Он использует выражения лица и движения тела из видео-драйвера для создания реалистичной динамики как субъекта, так и окружающей среды. В основе X-Dyna лежит модуль Dynamics-Adapter, который интегрирует контекст внешнего вида в пространственное внимание диффузионной модели. Система также включает локальный модуль управления для передачи выражений лица, что повышает реалистичность анимированных сцен.'}, 'en': {'title': 'X-Dyna: Realistic Animation from a Single Image', 'desc': 'X-Dyna is a new method for animating a single human image by using expressions and movements from a video. It improves on previous techniques by maintaining dynamic details, making animations look more realistic. The key part of X-Dyna is the Dynamics-Adapter, which helps blend the appearance of the subject with their movements while keeping the animation smooth. Additionally, it includes a module for accurately transferring facial expressions, resulting in more lifelike and expressive animations.'}, 'zh': {'title': 'X-Dyna:真实感动画的新突破', 'desc': 'X-Dyna是一种新颖的零样本扩散基础管道,能够通过驱动视频中的面部表情和身体动作为单个人物图像生成动画。该方法解决了以往人类姿态控制方法中的动态细节丢失问题,增强了视频动画的真实感。X-Dyna的核心是Dynamics-Adapter模块,它有效地将参考外观上下文整合到扩散模型的空间注意力中,同时保持运动模块合成流畅动态细节的能力。通过连接局部控制模块,X-Dyna能够捕捉与身份无关的面部表情,实现更真实的动画场景中的表情转移。'}}}, {'id': 'https://huggingface.co/papers/2501.10045', 'title': 'HiFi-SR: A Unified Generative Transformer-Convolutional Adversarial Network for High-Fidelity Speech Super-Resolution', 'url': 'https://huggingface.co/papers/2501.10045', 'abstract': 'The application of generative adversarial networks (GANs) has recently advanced speech super-resolution (SR) based on intermediate representations like mel-spectrograms. However, existing SR methods that typically rely on independently trained and concatenated networks may lead to inconsistent representations and poor speech quality, especially in out-of-domain scenarios. In this work, we propose HiFi-SR, a unified network that leverages end-to-end adversarial training to achieve high-fidelity speech super-resolution. Our model features a unified transformer-convolutional generator designed to seamlessly handle both the prediction of latent representations and their conversion into time-domain waveforms. 
The transformer network serves as a powerful encoder, converting low-resolution mel-spectrograms into latent space representations, while the convolutional network upscales these representations into high-resolution waveforms. To enhance high-frequency fidelity, we incorporate a multi-band, multi-scale time-frequency discriminator, along with a multi-scale mel-reconstruction loss in the adversarial training process. HiFi-SR is versatile, capable of upscaling any input speech signal between 4 kHz and 32 kHz to a 48 kHz sampling rate. Experimental results demonstrate that HiFi-SR significantly outperforms existing speech SR methods across both objective metrics and ABX preference tests, for both in-domain and out-of-domain scenarios (https://github.com/modelscope/ClearerVoice-Studio).', 'score': 4, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '8d8cd8e70ad62b51', 'authors': ['Shengkui Zhao', 'Kun Zhou', 'Zexu Pan', 'Yukun Ma', 'Chong Zhang', 'Bin Ma'], 'affiliations': ['Tongyi Lab, Alibaba Group, Singapore'], 'pdf_title_img': 'assets/pdf/title_img/2501.10045.jpg', 'data': {'categories': ['#audio', '#optimization'], 'emoji': '🎙️', 'ru': {'title': 'HiFi-SR: Единая сеть для сверхчеткой речи', 'desc': 'Статья представляет HiFi-SR - унифицированную нейронную сеть для высококачественного повышения разрешения речи. Модель использует единую архитектуру трансформер-сверточной сети для обработки мел-спектрограмм и генерации высококачественных аудиосигналов. Для улучшения качества высоких частот применяется многополосный дискриминатор и многомасштабная функция потерь реконструкции мел-спектрограмм. Экспериментальные результаты показывают превосходство HiFi-SR над существующими методами как по объективным метрикам, так и по субъективным тестам.'}, 'en': {'title': 'HiFi-SR: Elevating Speech Quality with Unified GANs', 'desc': 'This paper introduces HiFi-SR, a novel approach to speech super-resolution using generative adversarial networks (GANs). Unlike traditional methods that use separate networks, HiFi-SR employs a unified transformer-convolutional architecture for end-to-end training, improving the consistency and quality of generated speech. The transformer encodes low-resolution mel-spectrograms into latent representations, while the convolutional network converts these into high-resolution audio waveforms. The model also integrates a multi-band discriminator and a mel-reconstruction loss to enhance high-frequency details, achieving superior performance in various scenarios.'}, 'zh': {'title': 'HiFi-SR:高保真语音超分辨率的新方法', 'desc': '本研究提出了一种名为HiFi-SR的统一网络,用于语音超分辨率(SR),通过端到端的对抗训练实现高保真语音重建。该模型结合了变换器和卷积网络,能够有效地将低分辨率的mel谱图转换为高分辨率的时域波形。为了提高高频细节的保真度,我们在对抗训练中引入了多带宽、多尺度的时频判别器和多尺度mel重建损失。实验结果表明,HiFi-SR在目标指标和ABX偏好测试中显著优于现有的语音超分辨率方法,适用于不同的输入语音信号。'}}}, {'id': 'https://huggingface.co/papers/2501.10132', 'title': 'ComplexFuncBench: Exploring Multi-Step and Constrained Function Calling under Long-Context Scenario', 'url': 'https://huggingface.co/papers/2501.10132', 'abstract': 'Enhancing large language models (LLMs) with real-time APIs can help generate more accurate and up-to-date responses. However, evaluating the function calling abilities of LLMs in real-world scenarios remains under-explored due to the complexity of data collection and evaluation. In this work, we introduce ComplexFuncBench, a benchmark for complex function calling across five real-world scenarios. 
Compared to existing benchmarks, ComplexFuncBench encompasses multi-step and constrained function calling, which requires long-parameter filling, parameter value reasoning, and 128k long context. Additionally, we propose an automatic framework, ComplexEval, for quantitatively evaluating complex function calling tasks. Through comprehensive experiments, we demonstrate the deficiencies of state-of-the-art LLMs in function calling and suggest future directions for optimizing these capabilities. The data and code are available at https://github.com/THUDM/ComplexFuncBench.', 'score': 4, 'issue_id': 1749, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'de405dcc4bfc8efc', 'authors': ['Lucen Zhong', 'Zhengxiao Du', 'Xiaohan Zhang', 'Haiyi Hu', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.10132.jpg', 'data': {'categories': ['#long_context', '#optimization', '#data', '#benchmark'], 'emoji': '🧪', 'ru': {'title': 'Новый бенчмарк для оценки сложных вызовов функций в больших языковых моделях', 'desc': 'Данная статья представляет новый бенчмарк ComplexFuncBench для оценки способностей больших языковых моделей (LLM) вызывать сложные функции в реальных сценариях. Бенчмарк включает в себя многошаговые и ограниченные вызовы функций, требующие заполнения длинных параметров и рассуждений о значениях параметров. Авторы также предлагают автоматическую систему ComplexEval для количественной оценки задач сложного вызова функций. Эксперименты показывают недостатки современных LLM в вызове функций и предлагают направления для оптимизации этих возможностей.'}, 'en': {'title': 'Benchmarking Complex Function Calling in LLMs', 'desc': 'This paper presents ComplexFuncBench, a new benchmark designed to evaluate the function calling abilities of large language models (LLMs) in real-world scenarios. It focuses on complex tasks that involve multi-step and constrained function calling, which require advanced reasoning and handling of long contexts. The authors also introduce ComplexEval, an automatic framework for quantitatively assessing these complex function calling tasks. Through their experiments, they highlight the limitations of current state-of-the-art LLMs and propose directions for improving their performance in this area.'}, 'zh': {'title': '提升LLMs函数调用能力的基准与评估', 'desc': '本论文提出了ComplexFuncBench,这是一个用于评估大型语言模型(LLMs)在复杂函数调用方面的基准测试。该基准涵盖了五种真实场景,涉及多步骤和受限的函数调用,要求模型进行长参数填写和参数值推理。我们还提出了ComplexEval,一个自动化框架,用于定量评估复杂函数调用任务的能力。通过实验,我们展示了当前最先进的LLMs在函数调用方面的不足,并提出了未来优化的方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09978', 'title': 'GaussianAvatar-Editor: Photorealistic Animatable Gaussian Head Avatar Editor', 'url': 'https://huggingface.co/papers/2501.09978', 'abstract': 'We introduce GaussianAvatar-Editor, an innovative framework for text-driven editing of animatable Gaussian head avatars that can be fully controlled in expression, pose, and viewpoint. Unlike static 3D Gaussian editing, editing animatable 4D Gaussian avatars presents challenges related to motion occlusion and spatial-temporal inconsistency. To address these issues, we propose the Weighted Alpha Blending Equation (WABE). This function enhances the blending weight of visible Gaussians while suppressing the influence on non-visible Gaussians, effectively handling motion occlusion during editing. Furthermore, to improve editing quality and ensure 4D consistency, we incorporate conditional adversarial learning into the editing process. 
This strategy helps to refine the edited results and maintain consistency throughout the animation. By integrating these methods, our GaussianAvatar-Editor achieves photorealistic and consistent results in animatable 4D Gaussian editing. We conduct comprehensive experiments across various subjects to validate the effectiveness of our proposed techniques, which demonstrates the superiority of our approach over existing methods. More results and code are available at: [Project Link](https://xiangyueliu.github.io/GaussianAvatar-Editor/).', 'score': 2, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'e5b8603f26a902f9', 'authors': ['Xiangyue Liu', 'Kunming Luo', 'Heng Li', 'Qi Zhang', 'Yuan Liu', 'Li Yi', 'Ping Tan'], 'affiliations': ['Hong Kong University of Science and Technology', 'Tencent AI Lab', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09978.jpg', 'data': {'categories': ['#3d'], 'emoji': '🤖', 'ru': {'title': 'Революция в редактировании анимируемых 3D-аватаров с помощью гауссовых моделей', 'desc': 'Статья представляет GaussianAvatar-Editor - инновационную систему для редактирования анимируемых гауссовых аватаров головы на основе текстовых инструкций. Авторы предлагают функцию Weighted Alpha Blending Equation (WABE) для решения проблем, связанных с окклюзией при движении и пространственно-временной несогласованностью. Система использует условное состязательное обучение для улучшения качества редактирования и обеспечения согласованности в 4D. Эксперименты показывают превосходство предложенного подхода над существующими методами в создании фотореалистичных и согласованных результатов редактирования анимируемых 4D гауссовых аватаров.'}, 'en': {'title': 'Revolutionizing 4D Avatar Editing with GaussianAvatar-Editor', 'desc': 'GaussianAvatar-Editor is a new framework designed for editing animated Gaussian head avatars using text inputs. It tackles challenges like motion occlusion and maintaining spatial-temporal consistency, which are common in 4D animations. The framework introduces the Weighted Alpha Blending Equation (WABE) to improve the blending of visible elements while minimizing the impact of non-visible ones. Additionally, it employs conditional adversarial learning to enhance the quality of edits and ensure consistency throughout the animation process, resulting in photorealistic outputs.'}, 'zh': {'title': '高斯头像编辑的创新之路', 'desc': '我们介绍了GaussianAvatar-Editor,这是一个创新的框架,用于基于文本驱动的可动画高斯头像编辑。与静态3D高斯编辑不同,编辑可动画的4D高斯头像面临运动遮挡和时空不一致等挑战。为了解决这些问题,我们提出了加权阿尔法混合方程(WABE),该函数增强了可见高斯的混合权重,同时抑制了对不可见高斯的影响。通过结合条件对抗学习,我们提高了编辑质量并确保了4D一致性,从而实现了逼真且一致的可动画4D高斯编辑结果。'}}}, {'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. 
We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 27, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. 
This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 12, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. 
Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. Результаты показывают, что увеличение входного словаря стабильно улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. 
In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. 
This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 7, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. 
Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. 
Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. 
Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. 
Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}, {'id': 'https://huggingface.co/papers/2501.01895', 'title': 'EnerVerse: Envisioning Embodied Future Space for Robotics Manipulation', 'url': 'https://huggingface.co/papers/2501.01895', 'abstract': "We introduce EnerVerse, a comprehensive framework for embodied future space generation specifically designed for robotic manipulation tasks. EnerVerse seamlessly integrates convolutional and bidirectional attention mechanisms for inner-chunk space modeling, ensuring low-level consistency and continuity. Recognizing the inherent redundancy in video data, we propose a sparse memory context combined with a chunkwise unidirectional generative paradigm to enable the generation of infinitely long sequences. To further augment robotic capabilities, we introduce the Free Anchor View (FAV) space, which provides flexible perspectives to enhance observation and analysis. The FAV space mitigates motion modeling ambiguity, removes physical constraints in confined environments, and significantly improves the robot's generalization and adaptability across various tasks and settings. To address the prohibitive costs and labor intensity of acquiring multi-camera observations, we present a data engine pipeline that integrates a generative model with 4D Gaussian Splatting (4DGS). This pipeline leverages the generative model's robust generalization capabilities and the spatial constraints provided by 4DGS, enabling an iterative enhancement of data quality and diversity, thus creating a data flywheel effect that effectively narrows the sim-to-real gap. Finally, our experiments demonstrate that the embodied future space generation prior substantially enhances policy predictive capabilities, resulting in improved overall performance, particularly in long-range robotic manipulation tasks.", 'score': 41, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'bae2a6e63f87958d', 'authors': ['Siyuan Huang', 'Liliang Chen', 'Pengfei Zhou', 'Shengcong Chen', 'Zhengkai Jiang', 'Yue Hu', 'Peng Gao', 'Hongsheng Li', 'Maoqing Yao', 'Guanghui Ren'], 'affiliations': ['AgiBot', 'CUHK', 'FDU', 'HIT', 'HKUST', 'SJTU', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.01895.jpg', 'data': {'categories': ['#3d', '#data', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'EnerVerse: Революция в пространственном моделировании для роботов-манипуляторов', 'desc': 'EnerVerse - это комплексная система для генерации пространства будущего в задачах роботизированной манипуляции. Она использует сверточные механизмы и двунаправленное внимание для моделирования внутренних фрагментов пространства, обеспечивая согласованность на низком уровне. Система вводит пространство Free Anchor View для гибких перспектив наблюдения и анализа, улучшая обобщение и адаптивность робота. 
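A rough sketch of the chunkwise unidirectional generation with a sparse memory context that the EnerVerse abstract above describes for arbitrarily long rollouts. Every detail here (stride, chunk length, frame naming) is invented for illustration and is not the paper's implementation.

```python
# Each new chunk is generated conditioned on a small, subsampled set of past
# frames rather than the full history, keeping memory bounded.
def sparse_memory(history, stride=4, recent=2):
    # keep every `stride`-th old frame plus the most recent frames
    return history[::stride] + history[-recent:]

def generate_chunk(memory, chunk_id, chunk_len=3):
    return [f"frame_{chunk_id}_{i}(mem={len(memory)})" for i in range(chunk_len)]

history = []
for chunk_id in range(4):                      # "infinite" rollout in chunks
    history.extend(generate_chunk(sparse_memory(history), chunk_id))
print(len(history), history[-1])
```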
EnerVerse также включает конвейер данных, интегрирующий генеративную модель с 4D Gaussian Splatting для сужения разрыва между симуляцией и реальностью.'}, 'en': {'title': 'Empowering Robots with EnerVerse: A New Era in Space Generation and Manipulation', 'desc': 'EnerVerse is a new framework designed to help robots better understand and manipulate their environments. It uses advanced techniques like convolutional and bidirectional attention mechanisms to create a consistent model of space. By recognizing that video data often has unnecessary information, EnerVerse employs a sparse memory context to generate long sequences efficiently. Additionally, the Free Anchor View (FAV) space allows robots to observe from different angles, improving their ability to adapt and perform tasks in various settings.'}, 'zh': {'title': 'EnerVerse:提升机器人操作的未来空间生成框架', 'desc': '本文介绍了EnerVerse,这是一个专为机器人操作任务设计的未来空间生成框架。EnerVerse结合了卷积和双向注意机制,以确保内部空间建模的一致性和连续性。我们提出了一种稀疏记忆上下文和单向生成范式的结合,能够生成无限长的序列,从而提高机器人的能力。通过引入自由锚视图空间(FAV),我们增强了观察和分析的灵活性,显著改善了机器人在各种任务和环境中的泛化能力和适应性。'}}}, {'id': 'https://huggingface.co/papers/2501.01957', 'title': 'VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction', 'url': 'https://huggingface.co/papers/2501.01957', 'abstract': 'Recent Multimodal Large Language Models (MLLMs) have typically focused on integrating visual and textual modalities, with less emphasis placed on the role of speech in enhancing interaction. However, speech plays a crucial role in multimodal dialogue systems, and implementing high-performance in both vision and speech tasks remains a significant challenge due to the fundamental modality differences. In this paper, we propose a carefully designed multi-stage training methodology that progressively trains LLM to understand both visual and speech information, ultimately enabling fluent vision and speech interaction. Our approach not only preserves strong vision-language capacity, but also enables efficient speech-to-speech dialogue capabilities without separate ASR and TTS modules, significantly accelerating multimodal end-to-end response speed. By comparing our method against state-of-the-art counterparts across benchmarks for image, video, and speech tasks, we demonstrate that our model is equipped with both strong visual and speech capabilities, making near real-time vision and speech interaction.', 'score': 19, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'b6690c7efedf5a39', 'authors': ['Chaoyou Fu', 'Haojia Lin', 'Xiong Wang', 'Yi-Fan Zhang', 'Yunhang Shen', 'Xiaoyu Liu', 'Yangze Li', 'Zuwei Long', 'Heting Gao', 'Ke Li', 'Xiawu Zheng', 'Rongrong Ji', 'Xing Sun', 'Caifeng Shan', 'Ran He'], 'affiliations': ['CASIA', 'NJU', 'Tencent Youtu Lab', 'XMU'], 'pdf_title_img': 'assets/pdf/title_img/2501.01957.jpg', 'data': {'categories': ['#training', '#cv', '#multimodal', '#benchmark', '#audio'], 'emoji': '🗣️', 'ru': {'title': 'Революция в мультимодальном взаимодействии: речь и зрение в одной модели', 'desc': 'В статье представлена новая методология обучения мультимодальных языковых моделей, объединяющая визуальную и речевую модальности. Авторы предлагают поэтапный подход к обучению, который позволяет модели эффективно понимать как визуальную, так и речевую информацию. Модель демонстрирует высокую производительность в задачах обработки изображений, видео и речи, превосходя современные аналоги. 
Этот подход обеспечивает возможность ведения диалога с использованием речи и изображений в режиме, близком к реальному времени.'}, 'en': {'title': 'Enhancing Multimodal Interaction with Speech and Vision Integration', 'desc': 'This paper introduces a novel training methodology for Multimodal Large Language Models (MLLMs) that enhances their ability to process both visual and speech data. The proposed multi-stage training approach allows the model to progressively learn and integrate information from images, videos, and spoken language, facilitating seamless interaction. By eliminating the need for separate Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) modules, the model achieves faster response times in multimodal dialogues. Experimental results show that this method not only maintains strong vision-language performance but also excels in speech tasks, enabling near real-time interactions.'}, 'zh': {'title': '实现流畅的视觉与语音交互', 'desc': '最近的多模态大型语言模型(MLLMs)主要集中在视觉和文本的整合上,而对语音在增强交互中的作用关注较少。然而,语音在多模态对话系统中起着至关重要的作用,如何在视觉和语音任务中实现高性能仍然是一个重大挑战。本文提出了一种精心设计的多阶段训练方法,逐步训练大型语言模型理解视觉和语音信息,从而实现流畅的视觉和语音交互。我们的方法不仅保持了强大的视觉-语言能力,还实现了高效的语音对话能力,显著加快了多模态端到端的响应速度。'}}}, {'id': 'https://huggingface.co/papers/2501.01904', 'title': 'Virgo: A Preliminary Exploration on Reproducing o1-like MLLM', 'url': 'https://huggingface.co/papers/2501.01904', 'abstract': 'Recently, slow-thinking reasoning systems, built upon large language models (LLMs), have garnered widespread attention by scaling the thinking time during inference. There is also growing interest in adapting this capability to multimodal large language models (MLLMs). Given that MLLMs handle more complex data semantics across different modalities, it is intuitively more challenging to implement multimodal slow-thinking systems. To address this issue, in this paper, we explore a straightforward approach by fine-tuning a capable MLLM with a small amount of textual long-form thought data, resulting in a multimodal slow-thinking system, Virgo (Visual reasoning with long thought). We find that these long-form reasoning processes, expressed in natural language, can be effectively transferred to MLLMs. Moreover, it seems that such textual reasoning data can be even more effective than visual reasoning data in eliciting the slow-thinking capacities of MLLMs. While this work is preliminary, it demonstrates that slow-thinking capacities are fundamentally associated with the language model component, which can be transferred across modalities or domains. This finding can be leveraged to guide the development of more powerful slow-thinking reasoning systems. We release our resources at https://github.com/RUCAIBox/Virgo.', 'score': 12, 'issue_id': 1505, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '576423a20b419d0f', 'authors': ['Yifan Du', 'Zikang Liu', 'Yifan Li', 'Wayne Xin Zhao', 'Yuqi Huo', 'Bingning Wang', 'Weipeng Chen', 'Zheng Liu', 'Zhongyuan Wang', 'Ji-Rong Wen'], 'affiliations': ['BAAI', 'Baichuan AI', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01904.jpg', 'data': {'categories': ['#reasoning', '#multimodal', '#transfer_learning', '#training'], 'emoji': '🧠', 'ru': {'title': 'Обучение мультимодальных ИИ длительным рассуждениям через текст', 'desc': 'Статья описывает исследование в области мультимодальных больших языковых моделей (MLLM) и их способности к медленному мышлению. 
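The Virgo entry above describes turning a small amount of long-form textual reasoning into ordinary instruction-tuning examples for a multimodal LLM. The sketch below shows one such training record; the field names and content are illustrative assumptions, not the released data format.

```python
# Long textual "slow thinking" used as the supervision target, even for a
# visually grounded prompt; no visual reasoning annotations are required.
long_thought = (
    "Let me think step by step. First restate the problem... "
    "Check an edge case... Therefore the answer is 42."
)
example = {
    "messages": [
        {"role": "user", "content": "Solve the puzzle in the image."},
        {"role": "assistant", "content": long_thought},  # long reasoning as target
    ],
    "images": ["puzzle_001.png"],  # image kept, but supervision is textual
}
print(len(example["messages"]), example["images"][0])
```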
Авторы предлагают метод Virgo, который позволяет обучить MLLM длительным рассуждениям с помощью небольшого количества текстовых данных. Результаты показывают, что текстовые данные для обучения рассуждениям могут быть даже эффективнее визуальных. Это исследование демонстрирует, что способности к медленному мышлению в основном связаны с языковым компонентом модели и могут переноситься между модальностями.'}, 'en': {'title': 'Unlocking Slow-Thinking in Multimodal Models with Textual Reasoning', 'desc': 'This paper discusses the development of a multimodal slow-thinking reasoning system called Virgo, which is based on fine-tuning a multimodal large language model (MLLM) using long-form textual reasoning data. The authors found that incorporating long-form reasoning in natural language significantly enhances the slow-thinking capabilities of MLLMs, even more so than using visual reasoning data. This suggests that the slow-thinking abilities are closely linked to the language model aspect, allowing for effective transfer across different data modalities. The research indicates a promising direction for creating advanced reasoning systems that can handle complex data semantics.'}, 'zh': {'title': '多模态慢思维推理的探索', 'desc': '最近,基于大型语言模型(LLMs)的慢思维推理系统引起了广泛关注,尤其是在推理过程中延长思考时间的能力。本文探讨了如何将这种能力应用于多模态大型语言模型(MLLMs),尽管处理不同模态的复杂数据语义更具挑战性。我们通过微调一个强大的MLLM,使用少量的长文本思维数据,成功构建了一个多模态慢思维系统,命名为Virgo(视觉推理与长思维)。研究表明,长文本推理过程可以有效转移到MLLMs,并且这种文本推理数据在激发MLLMs的慢思维能力方面,似乎比视觉推理数据更有效。'}}}, {'id': 'https://huggingface.co/papers/2412.21059', 'title': 'VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation', 'url': 'https://huggingface.co/papers/2412.21059', 'abstract': 'We present a general strategy to aligning visual generation models -- both image and video generation -- with human preference. To start with, we build VisionReward -- a fine-grained and multi-dimensional reward model. We decompose human preferences in images and videos into multiple dimensions, each represented by a series of judgment questions, linearly weighted and summed to an interpretable and accurate score. To address the challenges of video quality assessment, we systematically analyze various dynamic features of videos, which helps VisionReward surpass VideoScore by 17.2% and achieve top performance for video preference prediction. Based on VisionReward, we develop a multi-objective preference learning algorithm that effectively addresses the issue of confounding factors within preference data. Our approach significantly outperforms existing image and video scoring methods on both machine metrics and human evaluation. 
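The VisionReward abstract above describes decomposing preference into per-dimension judgment questions that are linearly weighted and summed into an interpretable score. A minimal sketch of that scoring scheme follows; the questions and weights are invented for illustration.

```python
# Interpretable reward: weighted sum of binary judgment answers.
judgments = {"is_sharp": 1, "matches_prompt": 1, "smooth_motion": 0, "no_artifacts": 1}
weights   = {"is_sharp": 0.2, "matches_prompt": 0.4, "smooth_motion": 0.3, "no_artifacts": 0.1}

def vision_reward(answers, w):
    # Linear combination keeps every dimension's contribution inspectable.
    return sum(w[q] * a for q, a in answers.items())

print(vision_reward(judgments, weights))  # 0.2 + 0.4 + 0.0 + 0.1
```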
All code and datasets are provided at https://github.com/THUDM/VisionReward.', 'score': 11, 'issue_id': 1510, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '1f3bb267ffa751d9', 'authors': ['Jiazheng Xu', 'Yu Huang', 'Jiale Cheng', 'Yuanming Yang', 'Jiajun Xu', 'Yuan Wang', 'Wenbo Duan', 'Shen Yang', 'Qunlin Jin', 'Shurun Li', 'Jiayan Teng', 'Zhuoyi Yang', 'Wendi Zheng', 'Xiao Liu', 'Ming Ding', 'Xiaohan Zhang', 'Xiaotao Gu', 'Shiyu Huang', 'Minlie Huang', 'Jie Tang', 'Yuxiao Dong'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2412.21059.jpg', 'data': {'categories': ['#rag', '#training', '#open_source', '#cv', '#video', '#optimization', '#alignment'], 'emoji': '🎥', 'ru': {'title': 'VisionReward: многомерная оценка визуального контента с учетом человеческих предпочтений', 'desc': 'Исследователи представили стратегию для согласования моделей генерации визуального контента с человеческими предпочтениями. Они разработали VisionReward - многомерную модель вознаграждения, которая декомпозирует предпочтения в изображениях и видео на несколько измерений. Для оценки качества видео были проанализированы различные динамические характеристики, что позволило VisionReward превзойти существующие методы на 17.2%. На основе VisionReward был разработан алгоритм многоцелевого обучения предпочтениям, эффективно решающий проблему конфаундинг-факторов в данных о предпочтениях.'}, 'en': {'title': 'Aligning Visual Generation with Human Preferences', 'desc': 'This paper introduces a method for aligning visual generation models, such as those for images and videos, with human preferences. The authors create a reward model called VisionReward, which breaks down human preferences into multiple dimensions assessed through specific judgment questions. They enhance video quality assessment by analyzing dynamic features, leading to a 17.2% improvement over previous methods. Additionally, a multi-objective preference learning algorithm is developed to manage confounding factors in preference data, resulting in superior performance compared to existing scoring methods.'}, 'zh': {'title': '视觉生成模型与人类偏好的完美对齐', 'desc': '本文提出了一种通用策略,用于将视觉生成模型(包括图像和视频生成)与人类偏好对齐。我们构建了VisionReward,这是一个细粒度和多维度的奖励模型,能够将人类对图像和视频的偏好分解为多个维度。通过分析视频的动态特征,VisionReward在视频偏好预测中超越了现有方法,提升了17.2%的性能。基于VisionReward,我们开发了一种多目标偏好学习算法,有效解决了偏好数据中的混淆因素问题。'}}}, {'id': 'https://huggingface.co/papers/2501.01821', 'title': 'SDPO: Segment-Level Direct Preference Optimization for Social Agents', 'url': 'https://huggingface.co/papers/2501.01821', 'abstract': "Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level methods. The turn-level method is overly fine-grained, focusing exclusively on individual turns, while session-level methods are too coarse-grained, often introducing training noise. To address these limitations, we propose Segment-Level Direct Preference Optimization (SDPO), which focuses on specific key segments within interactions to optimize multi-turn agent behavior while minimizing training noise. 
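A rough sketch of a segment-level DPO objective in the spirit of the SDPO entry above: the preference loss is computed from log-probabilities summed only over selected key segments of a multi-turn interaction, not whole turns or sessions. The segment masks, beta, and toy numbers are illustrative assumptions, not the released implementation.

```python
import math

def seq_logprob(token_logps, segment_mask):
    # Sum log-probs of tokens that fall inside the chosen key segments.
    return sum(lp for lp, m in zip(token_logps, segment_mask) if m)

def sdpo_loss(pol_w, pol_l, ref_w, ref_l, mask_w, mask_l, beta=0.1):
    margin = beta * ((seq_logprob(pol_w, mask_w) - seq_logprob(ref_w, mask_w))
                     - (seq_logprob(pol_l, mask_l) - seq_logprob(ref_l, mask_l)))
    return -math.log(1.0 / (1.0 + math.exp(-margin)))  # -log sigmoid(margin)

# Toy chosen vs rejected interaction; masks mark the key segment's tokens.
print(sdpo_loss(pol_w=[-1.0, -0.5, -0.2], pol_l=[-1.2, -1.1, -0.9],
                ref_w=[-1.1, -0.7, -0.4], ref_l=[-1.0, -1.0, -1.0],
                mask_w=[0, 1, 1], mask_l=[0, 1, 1]))
```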
Evaluations on the SOTOPIA benchmark demonstrate that SDPO-tuned agents consistently outperform both existing DPO-based methods and proprietary LLMs like GPT-4o, underscoring SDPO's potential to advance the social intelligence of LLM-based agents. We release our code and data at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/SDPO.", 'score': 10, 'issue_id': 1514, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '499b008b0bce4f74', 'authors': ['Aobo Kong', 'Wentao Ma', 'Shiwan Zhao', 'Yongbin Li', 'Yuchuan Wu', 'Ke Wang', 'Xiaoqian Liu', 'Qicheng Li', 'Yong Qin', 'Fei Huang'], 'affiliations': ['TMCC, CS, Nankai University', 'Tongyi Lab', 'alibaba-inc.com'], 'pdf_title_img': 'assets/pdf/title_img/2501.01821.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#rlhf', '#agents', '#alignment', '#training'], 'emoji': '🤖', 'ru': {'title': 'SDPO: Новый шаг к созданию социально интеллектуальных ИИ-агентов', 'desc': 'В статье представлен новый метод оптимизации поведения языковых моделей (LLM) в сложных многоходовых социальных диалогах - Segment-Level Direct Preference Optimization (SDPO). SDPO фокусируется на ключевых сегментах взаимодействия, что позволяет эффективнее оптимизировать поведение агентов по сравнению с существующими методами. Эксперименты на бенчмарке SOTOPIA показали, что агенты, настроенные с помощью SDPO, превосходят как другие методы на основе DPO, так и проприетарные модели вроде GPT-4. Это демонстрирует потенциал SDPO для повышения социального интеллекта агентов на основе LLM.'}, 'en': {'title': 'Enhancing Social Intelligence in LLMs with SDPO', 'desc': "This paper introduces Segment-Level Direct Preference Optimization (SDPO), a new method for improving the performance of social agents powered by large language models (LLMs) in complex dialogues. Unlike existing methods that either focus too narrowly on individual turns or too broadly on entire sessions, SDPO targets specific key segments of conversations to better align agent behavior with human preferences. The approach reduces training noise and enhances the agent's ability to engage in multi-turn interactions effectively. Evaluations show that agents trained with SDPO outperform both traditional DPO methods and advanced LLMs like GPT-4o, highlighting its effectiveness in enhancing social intelligence."}, 'zh': {'title': '提升社交智能的新方法:分段级直接偏好优化', 'desc': '本论文提出了一种新的方法,称为分段级直接偏好优化(SDPO),旨在提高大型语言模型(LLM)在多轮社交对话中的表现。现有的直接偏好优化(DPO)方法在处理多轮交互时存在细粒度和粗粒度的局限性,导致训练噪声。SDPO通过关注交互中的关键段落,优化代理的多轮行为,从而减少训练噪声。实验结果表明,SDPO调优的代理在SOTOPIA基准测试中表现优于现有的DPO方法和其他大型语言模型,显示出其在提升社交智能方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01073', 'title': 'Graph Generative Pre-trained Transformer', 'url': 'https://huggingface.co/papers/2501.01073', 'abstract': "Graph generation is a critical task in numerous domains, including molecular design and social network analysis, due to its ability to model complex relationships and structured data. While most modern graph generative models utilize adjacency matrix representations, this work revisits an alternative approach that represents graphs as sequences of node set and edge set. We advocate for this approach due to its efficient encoding of graphs and propose a novel representation. Based on this representation, we introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns graph structures via next-token prediction. 
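The G2PT entry above represents a graph as a node-set block followed by an edge-set block, trained with plain next-token prediction. The serialization sketch below illustrates that idea only; the token vocabulary and the tiny molecule-like graph are made up.

```python
def graph_to_tokens(nodes, edges):
    seq = ["<bos>", "<nodes>"]
    seq += [f"n{i}:{label}" for i, label in nodes]        # node set
    seq += ["<edges>"]
    seq += [f"({u},{v}):{bond}" for u, v, bond in edges]  # edge set
    seq.append("<eos>")
    return seq

tokens = graph_to_tokens(nodes=[(0, "C"), (1, "O"), (2, "C")],
                         edges=[(0, 1, "single"), (1, 2, "single")])
print(tokens)
# Training pairs for an auto-regressive model: (prefix, next token).
pairs = [(tokens[:i], tokens[i]) for i in range(1, len(tokens))]
print(pairs[0])
```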
To further exploit G2PT's capabilities as a general-purpose foundation model, we explore fine-tuning strategies for two downstream applications: goal-oriented generation and graph property prediction. We conduct extensive experiments across multiple datasets. Results indicate that G2PT achieves superior generative performance on both generic graph and molecule datasets. Furthermore, G2PT exhibits strong adaptability and versatility in downstream tasks from molecular design to property prediction.", 'score': 9, 'issue_id': 1508, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '596abc88d57e0650', 'authors': ['Xiaohui Chen', 'Yinkai Wang', 'Jiaxing He', 'Yuanqi Du', 'Soha Hassoun', 'Xiaolin Xu', 'Li-Ping Liu'], 'affiliations': ['Cornell University', 'Northeastern University', 'Tufts University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01073.jpg', 'data': {'categories': ['#dataset', '#optimization', '#training', '#architecture', '#data', '#graphs'], 'emoji': '🕸️', 'ru': {'title': 'G2PT: Универсальный трансформер для эффективной генерации графов', 'desc': 'В статье представлена новая модель генерации графов - Graph Generative Pre-trained Transformer (G2PT). G2PT использует альтернативный подход к представлению графов в виде последовательностей множеств узлов и рёбер вместо матриц смежности. Модель обучается предсказывать следующий токен авторегрессивным способом. G2PT показывает превосходные результаты в генерации как общих графов, так и молекул, а также демонстрирует хорошую адаптивность к различным задачам.'}, 'en': {'title': 'Revolutionizing Graph Generation with G2PT', 'desc': 'This paper focuses on improving graph generation, which is important for tasks like designing molecules and analyzing social networks. Instead of using the common adjacency matrix, it proposes a new way to represent graphs as sequences of node and edge sets, making the encoding more efficient. The authors introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns to generate graph structures by predicting the next token in a sequence. Through various experiments, they demonstrate that G2PT outperforms existing models in generating graphs and is effective in applications like molecular design and predicting graph properties.'}, 'zh': {'title': '图生成的创新:G2PT模型', 'desc': '图生成在许多领域中非常重要,比如分子设计和社交网络分析,因为它能够建模复杂的关系和结构化数据。本文提出了一种新的图表示方法,将图表示为节点集和边集的序列,而不是传统的邻接矩阵。基于这种表示,我们引入了图生成预训练变换器(G2PT),这是一种通过下一个标记预测学习图结构的自回归模型。实验结果表明,G2PT在通用图和分子数据集上表现出色,并且在分子设计和属性预测等下游任务中具有很强的适应性和多功能性。'}}}, {'id': 'https://huggingface.co/papers/2501.00874', 'title': 'LUSIFER: Language Universal Space Integration for Enhanced Multilingual Embeddings with Large Language Models', 'url': 'https://huggingface.co/papers/2501.00874', 'abstract': "Recent advancements in large language models (LLMs) based embedding models have established new state-of-the-art benchmarks for text embedding tasks, particularly in dense vector-based retrieval. However, these models predominantly focus on English, leaving multilingual embedding capabilities largely unexplored. To address this limitation, we present LUSIFER, a novel zero-shot approach that adapts LLM-based embedding models for multilingual tasks without requiring multilingual supervision. LUSIFER's architecture combines a multilingual encoder, serving as a language-universal learner, with an LLM-based embedding model optimized for embedding-specific tasks. 
These components are seamlessly integrated through a minimal set of trainable parameters that act as a connector, effectively transferring the multilingual encoder's language understanding capabilities to the specialized embedding model. Additionally, to comprehensively evaluate multilingual embedding performance, we introduce a new benchmark encompassing 5 primary embedding tasks, 123 diverse datasets, and coverage across 14 languages. Extensive experimental results demonstrate that LUSIFER significantly enhances the multilingual performance across various embedding tasks, particularly for medium and low-resource languages, without requiring explicit multilingual training data.", 'score': 7, 'issue_id': 1507, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': '5bdfec436923a2a6', 'authors': ['Hieu Man', 'Nghia Trung Ngo', 'Viet Dac Lai', 'Ryan A. Rossi', 'Franck Dernoncourt', 'Thien Huu Nguyen'], 'affiliations': ['Adobe Research, USA', 'Dept. of Computer Science, University of Oregon, OR, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.00874.jpg', 'data': {'categories': ['#transfer_learning', '#architecture', '#benchmark', '#multilingual', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Универсальные многоязычные эмбеддинги без многоязычного обучения', 'desc': 'LUSIFER - это новый подход к созданию многоязычных эмбеддингов без использования многоязычных обучающих данных. Он объединяет многоязычный энкодер и LLM-модель для эмбеддингов через набор обучаемых параметров. Авторы также представили новый бенчмарк для оценки качества многоязычных эмбеддингов, охватывающий 5 основных задач, 123 датасета и 14 языков. Эксперименты показали, что LUSIFER значительно улучшает многоязычную производительность, особенно для языков с ограниченными ресурсами.'}, 'en': {'title': 'LUSIFER: Bridging Multilingual Gaps in Text Embedding', 'desc': "This paper introduces LUSIFER, a new method that enhances large language models (LLMs) for multilingual text embedding tasks. Unlike existing models that mainly focus on English, LUSIFER uses a zero-shot approach to adapt LLMs for multiple languages without needing multilingual training data. It combines a multilingual encoder with an LLM-based embedding model, allowing for effective language understanding and embedding performance. The authors also present a comprehensive benchmark to evaluate LUSIFER's performance across various languages and tasks, showing significant improvements, especially for less-resourced languages."}, 'zh': {'title': 'LUSIFER:无监督多语言嵌入的新突破', 'desc': '最近,大型语言模型(LLMs)在文本嵌入任务中取得了新的突破,尤其是在基于密集向量的检索方面。然而,这些模型主要集中在英语上,导致多语言嵌入能力尚未得到充分探索。为了解决这个问题,我们提出了LUSIFER,这是一种新颖的零样本方法,可以在不需要多语言监督的情况下,将LLM嵌入模型适应于多语言任务。LUSIFER的架构结合了一个多语言编码器和一个针对嵌入特定任务优化的LLM嵌入模型,通过一组最小的可训练参数实现无缝连接,有效地将多语言编码器的语言理解能力转移到专门的嵌入模型上。'}}}, {'id': 'https://huggingface.co/papers/2501.01540', 'title': 'BoxingGym: Benchmarking Progress in Automated Experimental Design and Model Discovery', 'url': 'https://huggingface.co/papers/2501.01540', 'abstract': "Understanding the world and explaining it with scientific theories is a central aspiration of artificial intelligence research. Proposing theories, designing experiments to test them, and then revising them based on data are fundamental to scientific discovery. Despite the significant promise of LLM-based scientific agents, no benchmarks systematically test LLM's ability to propose scientific models, collect experimental data, and revise them in light of new data. 
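A minimal sketch of the connector idea described in the LUSIFER entry above: a frozen multilingual encoder feeds a small trainable connector whose output is consumed by a frozen LLM-based embedding model, so only the connector would receive gradients. The architecture, dimensions, and module stand-ins below are assumptions, not LUSIFER's actual code.

```python
import numpy as np
rng = np.random.default_rng(0)

def multilingual_encoder(text):      # frozen placeholder for an XLM-R-style model
    return rng.normal(size=768)

def llm_embedder(hidden):            # frozen placeholder for the LLM embedding head
    W_llm = rng.normal(scale=0.02, size=(1024, 4096))
    return W_llm @ hidden

class Connector:                     # the only trainable piece
    def __init__(self):
        self.W = rng.normal(scale=0.02, size=(4096, 768))
    def __call__(self, h):
        return np.maximum(self.W @ h, 0.0)  # linear + ReLU projection

connector = Connector()
emb = llm_embedder(connector(multilingual_encoder("bonjour le monde")))
print(emb.shape)  # (1024,) language-universal embedding
```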
We introduce BoxingGym, a benchmark with 10 environments for systematically evaluating both experimental design (e.g. collecting data to test a scientific theory) and model discovery (e.g. proposing and revising scientific theories). To enable tractable and quantitative evaluation, we implement each environment as a generative probabilistic model with which a scientific agent can run interactive experiments. These probabilistic models are drawn from various real-world scientific domains ranging from psychology to ecology. To quantitatively evaluate a scientific agent's ability to collect informative experimental data, we compute the expected information gain (EIG), an information-theoretic quantity which measures how much an experiment reduces uncertainty about the parameters of a generative model. A good scientific theory is a concise and predictive explanation. Therefore, to quantitatively evaluate model discovery, we ask a scientific agent to explain their model and then assess whether this explanation enables another scientific agent to make reliable predictions about this environment. In addition to this explanation-based evaluation, we compute standard model evaluation metrics such as prediction errors. We find that current LLMs, such as GPT-4o, struggle with both experimental design and model discovery. We find that augmenting the LLM-based agent with an explicit statistical model does not reliably improve these results.", 'score': 4, 'issue_id': 1510, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '0f853b1681ef29b5', 'authors': ['Kanishk Gandhi', 'Michael Y. Li', 'Lyle Goodyear', 'Louise Li', 'Aditi Bhaskar', 'Mohammed Zaman', 'Noah D. Goodman'], 'affiliations': ['Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01540.jpg', 'data': {'categories': ['#benchmark', '#data', '#science', '#agents'], 'emoji': '🧪', 'ru': {'title': 'BoxingGym: новый вызов для ИИ в научном моделировании', 'desc': 'Статья представляет новый бенчмарк BoxingGym для оценки способности языковых моделей (LLM) к научному открытию. Бенчмарк включает 10 сред, моделирующих различные научные области, и позволяет тестировать планирование экспериментов и построение теорий. Для оценки качества экспериментов используется ожидаемый прирост информации (EIG), а для оценки теорий - их способность объяснять и предсказывать. Результаты показывают, что современные LLM, включая GPT-4, пока слабо справляются с этими задачами.'}, 'en': {'title': 'BoxingGym: Evaluating LLMs in Scientific Discovery', 'desc': 'This paper introduces BoxingGym, a benchmark designed to evaluate the capabilities of large language models (LLMs) in scientific discovery tasks. It focuses on two main aspects: experimental design, which involves collecting data to test scientific theories, and model discovery, which includes proposing and revising these theories. The benchmark consists of 10 environments modeled as generative probabilistic models from various scientific fields, allowing for interactive experimentation. 
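The expected information gain used to score experiment designs in the BoxingGym entry can be estimated with a nested Monte Carlo loop: EIG(d) = E over the prior and simulated outcomes of log p(y|theta,d) - log p(y|d). The Bernoulli model below is a stand-in, not one of the benchmark's environments, and the sample sizes are arbitrary.

```python
import math, random
random.seed(0)

def simulate(theta, d):              # generative model: p(y | theta, d)
    p = 1 / (1 + math.exp(-(theta * d)))
    return (1 if random.random() < p else 0), p

def eig(d, n_outer=500, n_inner=100):
    total = 0.0
    for _ in range(n_outer):
        theta = random.gauss(0, 1)                 # prior sample
        y, p = simulate(theta, d)
        lik = p if y == 1 else 1 - p
        # marginal p(y | d) estimated with fresh prior samples
        marg = sum((simulate(random.gauss(0, 1), d)[1] if y == 1
                    else 1 - simulate(random.gauss(0, 1), d)[1])
                   for _ in range(n_inner)) / n_inner
        total += math.log(lik) - math.log(marg)
    return total / n_outer

for design in (0.1, 1.0, 5.0):       # pick the design with the largest EIG
    print(design, round(eig(design), 3))
```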
The study finds that current LLMs, like GPT-4o, face challenges in both areas, and adding a statistical model does not consistently enhance their performance.'}, 'zh': {'title': '评估人工智能在科学研究中的能力', 'desc': '这篇论文探讨了人工智能在科学研究中的应用,特别是大型语言模型(LLM)在提出科学理论和设计实验方面的能力。作者提出了一个名为BoxingGym的基准测试,包含10个环境,用于系统评估实验设计和模型发现的能力。通过计算期望信息增益(EIG),论文量化了科学代理收集实验数据的有效性,并评估其提出的模型是否能进行可靠预测。研究发现,当前的LLM在实验设计和模型发现方面表现不佳,且简单地增加统计模型并未显著改善结果。'}}}, {'id': 'https://huggingface.co/papers/2501.04519', 'title': 'rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking', 'url': 'https://huggingface.co/papers/2501.04519', 'abstract': 'We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data sythesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids na\\"ive step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs\' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% the brightest high school math students. Code and data will be available at https://github.com/microsoft/rStar.', 'score': 100, 'issue_id': 1572, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'b065003de5fa3bde', 'authors': ['Xinyu Guan', 'Li Lyna Zhang', 'Yifei Liu', 'Ning Shang', 'Youran Sun', 'Yi Zhu', 'Fan Yang', 'Mao Yang'], 'affiliations': ['Microsoft', 'Peking University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04519.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#benchmark', '#small_models', '#dataset'], 'emoji': '🧮', 'ru': {'title': 'Малые модели решают большие задачи: rStar-Math превосходит гигантов в математике', 'desc': 'Статья представляет rStar-Math - подход, позволяющий малым языковым моделям (SLM) достичь или превзойти способности крупных моделей в математических рассуждениях. Метод использует поиск по методу Монте-Карло (MCTS) с двумя специально обученными SLM: политикой и моделью вознаграждения. Авторы вводят новые методы синтеза обучающих данных, обучения модели вознаграждения и итеративного улучшения моделей. В результате rStar-Math значительно повышает эффективность SLM на математических тестах, превосходя более крупные модели.'}, 'en': {'title': 'Empowering Small Models to Excel in Math Reasoning', 'desc': 'The paper introduces rStar-Math, a framework that enhances the math reasoning abilities of small language models (SLMs) without relying on larger models. 
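A highly simplified sketch of reward-guided test-time search in the spirit of the rStar-Math entry: a policy model proposes candidate next reasoning steps and a process reward model scores partial trajectories. rStar-Math itself uses MCTS; this greedy beam-style loop only illustrates the division of roles, and both model stand-ins are placeholders.

```python
import random
random.seed(0)

def policy_propose(trajectory, k=3):           # stand-in for the policy SLM
    return [trajectory + [f"step{len(trajectory)}_{i}"] for i in range(k)]

def process_reward(trajectory):                # stand-in for the process reward model
    return random.random() + 0.1 * len(trajectory)

def guided_search(max_depth=4, beam=2):
    frontier = [[]]
    for _ in range(max_depth):
        candidates = [t for traj in frontier for t in policy_propose(traj)]
        candidates.sort(key=process_reward, reverse=True)
        frontier = candidates[:beam]           # keep the best-scored partial solutions
    return frontier[0]

print(guided_search())
```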
It employs Monte Carlo Tree Search (MCTS) to enable deep thinking, allowing the SLM to perform guided search during problem-solving. Key innovations include a code-augmented Chain of Thought (CoT) data synthesis method for generating verified reasoning paths, a refined process preference model (PPM) for better reward training, and a self-evolution strategy for iterative improvement. As a result, rStar-Math significantly boosts the performance of SLMs on math benchmarks, achieving state-of-the-art results in various assessments.'}, 'zh': {'title': '小型语言模型的数学推理新突破', 'desc': 'rStar-Math展示了小型语言模型(SLMs)在数学推理能力上可以与OpenAI的o1相媲美,甚至超越它,而无需从更强大的模型中蒸馏。该方法通过蒙特卡洛树搜索(MCTS)实现“深度思考”,在测试时由SLM驱动的过程奖励模型指导数学策略SLM进行搜索。rStar-Math引入了三项创新来解决训练两个SLM的挑战,包括新颖的代码增强的链式推理数据合成方法和更有效的过程偏好模型(PPM)训练方法。经过四轮自我进化,rStar-Math在747,000个数学问题上生成了数百万个合成解,使SLMs的数学推理能力达到了最先进的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.04682', 'title': 'Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Though', 'url': 'https://huggingface.co/papers/2501.04682', 'abstract': 'We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms. Finally, we outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training. Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms. This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence.', 'score': 42, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '3479f7793755e586', 'authors': ['Violet Xiang', 'Charlie Snell', 'Kanishk Gandhi', 'Alon Albalak', 'Anikait Singh', 'Chase Blagden', 'Duy Phung', 'Rafael Rafailov', 'Nathan Lile', 'Dakota Mahan', 'Louis Castricato', 'Jan-Philipp Franken', 'Nick Haber', 'Chelsea Finn'], 'affiliations': ['Stanford University', 'SynthLabs.ai', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.04682.jpg', 'data': {'categories': ['#synthetic', '#training', '#rlhf', '#rl', '#multimodal', '#optimization', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Meta-CoT: новый уровень рассуждений для ИИ', 'desc': 'Исследователи предлагают новую концепцию под названием Meta Chain-of-Thought (Meta-CoT), которая расширяет традиционный подход Chain-of-Thought. Meta-CoT моделирует базовые рассуждения, необходимые для построения цепочки мыслей. Авторы представляют эмпирические доказательства того, что современные языковые модели демонстрируют поведение, согласующееся с контекстным поиском. Они также описывают конкретный процесс обучения модели для генерации Meta-CoT, включающий инструктивную настройку и обучение с подкреплением.'}, 'en': {'title': 'Empowering AI with Enhanced Reasoning through Meta-CoT', 'desc': 'The paper introduces a new framework called Meta Chain-of-Thought (Meta-CoT), which enhances the traditional Chain-of-Thought (CoT) approach by focusing on the reasoning processes behind generating CoTs. 
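The "linearized search traces" mentioned in the Meta-CoT entry can be pictured as flattening an in-context search (expansions, dead ends, backtracking) into one token stream for instruction tuning. The marker vocabulary below is invented for illustration.

```python
def linearize(search_events):
    out = []
    for kind, payload in search_events:
        if kind == "expand":
            out.append(f"<try> {payload}")
        elif kind == "backtrack":
            out.append(f"<backtrack from {payload}>")
        elif kind == "answer":
            out.append(f"<answer> {payload}")
    return " ".join(out)

trace = [("expand", "x = 3"), ("backtrack", "x = 3"),
         ("expand", "x = 4"), ("answer", "x = 4")]
print(linearize(trace))
# -> "<try> x = 3 <backtrack from x = 3> <try> x = 4 <answer> x = 4"
```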
It provides experimental results from advanced models that show behaviors similar to in-context search, and discusses techniques for creating Meta-CoT through process supervision, synthetic data, and search algorithms. The authors propose a detailed training pipeline that combines instruction tuning with search traces and reinforcement learning to improve the generation of Meta-CoTs. Additionally, the paper raises important questions about scaling, the role of verifiers, and the potential for discovering new reasoning methods, aiming to advance the reasoning capabilities of large language models (LLMs).'}, 'zh': {'title': '推动人工智能推理能力的元思维链', 'desc': '我们提出了一种新颖的框架,称为元思维链(Meta-CoT),它通过明确建模所需的推理过程来扩展传统的思维链(CoT)。我们展示了来自最先进模型的实证证据,这些模型表现出与上下文搜索一致的行为,并探索了通过过程监督、合成数据生成和搜索算法来生成元思维链的方法。最后,我们概述了一个具体的训练流程,结合了指令调优、线性化搜索轨迹和强化学习后训练,以生成元思维链。此项工作为在大型语言模型中实现元思维链提供了理论和实践的路线图,推动了人工智能更强大和更人性化的推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04686', 'title': 'URSA: Understanding and Verifying Chain-of-thought Reasoning in Multimodal Mathematics', 'url': 'https://huggingface.co/papers/2501.04686', 'abstract': 'Chain-of-thought (CoT) reasoning has been widely applied in the mathematical reasoning of Large Language Models (LLMs). Recently, the introduction of derivative process supervision on CoT trajectories has sparked discussions on enhancing scaling capabilities during test time, thereby boosting the potential of these models. However, in multimodal mathematical reasoning, the scarcity of high-quality CoT training data has hindered existing models from achieving high-precision CoT reasoning and has limited the realization of reasoning potential during test time. In this work, we propose a three-module synthesis strategy that integrates CoT distillation, trajectory-format rewriting, and format unification. It results in a high-quality CoT reasoning instruction fine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively validate the state-of-the-art (SOTA) performance of the trained URSA-7B model on multiple multimodal mathematical benchmarks. For test-time scaling, we introduce a data synthesis strategy that automatically generates process annotation datasets, known as DualMath-1.1M, focusing on both interpretation and logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT reasoning capabilities to robust supervision abilities. The trained URSA-RM-7B acts as a verifier, effectively enhancing the performance of URSA-7B at test time. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD) verifying capabilities, showcasing its generalization. Model weights, training data and code will be open-sourced.', 'score': 35, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '089df0fb9a548ce8', 'authors': ['Ruilin Luo', 'Zhuofan Zheng', 'Yifan Wang', 'Yiyao Yu', 'Xinzhe Ni', 'Zicheng Lin', 'Jin Zeng', 'Yujiu Yang'], 'affiliations': ['ByteDance', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04686.jpg', 'data': {'categories': ['#dataset', '#training', '#multimodal', '#data', '#open_source', '#reasoning', '#math', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Усиление мультимодальных математических рассуждений через синтез данных и верификацию', 'desc': 'Статья представляет новый подход к улучшению математических рассуждений в мультимодальных языковых моделях. 
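The verifier role described for URSA-RM in the entry above amounts to test-time selection: sample several chain-of-thought solutions from the reasoning model, score each with the verifier, keep the best. The sampler and scorer below are placeholders, not the released models.

```python
import random
random.seed(0)

def sample_solution(problem):                  # stands in for the reasoning model
    return f"solution_{random.randint(0, 9)} for {problem}"

def verifier_score(problem, solution):         # stands in for the verifier model
    return random.random()

def best_of_n(problem, n=8):
    candidates = [sample_solution(problem) for _ in range(n)]
    return max(candidates, key=lambda s: verifier_score(problem, s))

print(best_of_n("integrate x^2 over [0, 1]"))
```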
Авторы предлагают стратегию синтеза высококачественного набора данных MMathCoT-1M для обучения цепочкам рассуждений. Они также вводят метод DualMath-1.1M для генерации аннотаций процесса рассуждений, что позволяет модели URSA-7B перейти от способности рассуждать к возможности проверять рассуждения. Результаты показывают улучшение производительности и обобщающей способности модели.'}, 'en': {'title': 'Enhancing Multimodal Mathematical Reasoning with CoT Synthesis', 'desc': "This paper discusses improving mathematical reasoning in Large Language Models (LLMs) using a method called Chain-of-Thought (CoT) reasoning. The authors introduce a new dataset, MMathCoT-1M, which is created through a three-module synthesis strategy to enhance the quality of CoT training data in multimodal mathematics. They also present a data synthesis strategy, DualMath-1.1M, that generates additional training data to improve the model's reasoning capabilities during testing. The results show that their model, URSA-RM-7B, significantly enhances performance and generalization in multimodal mathematical tasks."}, 'zh': {'title': '提升多模态数学推理的链式推理能力', 'desc': '本文探讨了链式推理(CoT)在大型语言模型(LLMs)中的应用,特别是在多模态数学推理中的挑战。由于高质量的CoT训练数据稀缺,现有模型在测试时的推理能力受到限制。为了解决这个问题,作者提出了一种三模块合成策略,生成了高质量的多模态数学推理指令微调数据集MMathCoT-1M。通过进一步训练URSA-7B模型,结合生成的数据集DualMath-1.1M,显著提升了模型在测试时的推理能力和验证能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04227', 'title': 'Agent Laboratory: Using LLM Agents as Research Assistants', 'url': 'https://huggingface.co/papers/2501.04227', 'abstract': 'Historically, scientific discovery has been a lengthy and costly process, demanding substantial time and resources from initial conception to final results. To accelerate scientific discovery, reduce research costs, and improve research quality, we introduce Agent Laboratory, an autonomous LLM-based framework capable of completing the entire research process. This framework accepts a human-provided research idea and progresses through three stages--literature review, experimentation, and report writing to produce comprehensive research outputs, including a code repository and a research report, while enabling users to provide feedback and guidance at each stage. We deploy Agent Laboratory with various state-of-the-art LLMs and invite multiple researchers to assess its quality by participating in a survey, providing human feedback to guide the research process, and then evaluate the final paper. We found that: (1) Agent Laboratory driven by o1-preview generates the best research outcomes; (2) The generated machine learning code is able to achieve state-of-the-art performance compared to existing methods; (3) Human involvement, providing feedback at each stage, significantly improves the overall quality of research; (4) Agent Laboratory significantly reduces research expenses, achieving an 84% decrease compared to previous autonomous research methods. 
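A toy sketch of the three-stage flow the Agent Laboratory entry describes, with a human-feedback hook after each stage. The stage functions and hook signature are placeholders, not the released framework.

```python
def literature_review(idea):   return f"related work for: {idea}"
def experimentation(review):   return f"code + results given ({review})"
def report_writing(results):   return f"paper draft from ({results})"

def run_pipeline(idea, get_feedback=lambda stage, out: out):
    out = idea
    for stage in (literature_review, experimentation, report_writing):
        out = stage(out)
        out = get_feedback(stage.__name__, out)  # human-in-the-loop hook
    return out

print(run_pipeline("quantized training of small LLMs"))
```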
We hope Agent Laboratory enables researchers to allocate more effort toward creative ideation rather than low-level coding and writing, ultimately accelerating scientific discovery.', 'score': 34, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'ff592ae1a5a88909', 'authors': ['Samuel Schmidgall', 'Yusheng Su', 'Ze Wang', 'Ximeng Sun', 'Jialian Wu', 'Xiaodong Yu', 'Jiang Liu', 'Zicheng Liu', 'Emad Barsoum'], 'affiliations': ['AMD', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04227.jpg', 'data': {'categories': ['#science', '#training', '#agents', '#rlhf', '#survey'], 'emoji': '🧪', 'ru': {'title': 'Автономная лаборатория ИИ: революция в научных исследованиях', 'desc': 'Статья представляет Agent Laboratory - автономную систему на основе моделей LLM, способную выполнять полный цикл научного исследования. Система проходит через этапы обзора литературы, экспериментов и написания отчета, позволяя пользователям давать обратную связь на каждом этапе. Эксперименты показали, что Agent Laboratory, работающая на модели o1-preview, генерирует лучшие результаты исследований и значительно снижает затраты на исследования. Авторы надеются, что эта система позволит исследователям сосредоточиться на творческом процессе, ускоряя научные открытия.'}, 'en': {'title': 'Accelerating Science with Autonomous Research Frameworks', 'desc': 'The paper presents Agent Laboratory, an autonomous framework that utilizes large language models (LLMs) to streamline the scientific research process. It operates in three stages: conducting a literature review, performing experiments, and writing reports, all while allowing human researchers to provide feedback. The study shows that Agent Laboratory can produce high-quality research outputs, including code that outperforms existing methods, and significantly reduces research costs by 84%. By automating routine tasks, the framework aims to free researchers to focus more on innovative ideas and less on tedious coding and documentation.'}, 'zh': {'title': 'Agent Laboratory:加速科学发现的智能助手', 'desc': '本文介绍了一种名为Agent Laboratory的自主框架,旨在加速科学发现并降低研究成本。该框架基于大型语言模型(LLM),能够完成文献综述、实验和报告撰写等整个研究过程。研究表明,Agent Laboratory在生成研究成果方面表现优异,尤其是在机器学习代码的性能上,达到了最先进的水平。通过人类反馈的参与,研究质量显著提高,同时研究费用减少了84%。'}}}, {'id': 'https://huggingface.co/papers/2501.04306', 'title': 'LLM4SR: A Survey on Large Language Models for Scientific Research', 'url': 'https://huggingface.co/papers/2501.04306', 'abstract': 'In recent years, the rapid advancement of Large Language Models (LLMs) has transformed the landscape of scientific research, offering unprecedented support across various stages of the research cycle. This paper presents the first systematic survey dedicated to exploring how LLMs are revolutionizing the scientific research process. We analyze the unique roles LLMs play across four critical stages of research: hypothesis discovery, experiment planning and implementation, scientific writing, and peer reviewing. Our review comprehensively showcases the task-specific methodologies and evaluation benchmarks. By identifying current challenges and proposing future research directions, this survey not only highlights the transformative potential of LLMs, but also aims to inspire and guide researchers and practitioners in leveraging LLMs to advance scientific inquiry. 
Resources are available at the following repository: https://github.com/du-nlp-lab/LLM4SR', 'score': 17, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'bfb9039780003b6d', 'authors': ['Ziming Luo', 'Zonglin Yang', 'Zexin Xu', 'Wei Yang', 'Xinya Du'], 'affiliations': ['Nanyang Technological University, Singapore', 'University of Texas at Dallas, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04306.jpg', 'data': {'categories': ['#science', '#survey', '#multimodal', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'LLM как революционный инструмент в научных исследованиях', 'desc': 'Эта статья представляет собой первый систематический обзор роли больших языковых моделей (LLM) в научных исследованиях. Авторы анализируют, как LLM используются на четырех ключевых этапах исследовательского процесса: формирование гипотез, планирование и проведение экспериментов, научное письмо и рецензирование. В работе рассматриваются специфические методологии и критерии оценки для каждой задачи. Статья также обсуждает текущие проблемы и предлагает направления для будущих исследований в этой области.'}, 'en': {'title': 'Revolutionizing Research: The Power of Large Language Models', 'desc': 'This paper systematically surveys the impact of Large Language Models (LLMs) on the scientific research process. It identifies how LLMs assist in four key stages: generating hypotheses, planning and conducting experiments, writing scientific papers, and facilitating peer reviews. The authors discuss specific methodologies and evaluation benchmarks for each task, highlighting the transformative potential of LLMs in enhancing research efficiency. Additionally, the paper addresses current challenges and suggests future research directions to further integrate LLMs into scientific inquiry.'}, 'zh': {'title': '大型语言模型:科学研究的变革者', 'desc': '近年来,大型语言模型(LLMs)的快速发展改变了科学研究的格局,为研究周期的各个阶段提供了前所未有的支持。本文首次系统性地调查了LLMs如何革新科学研究过程,分析了它们在假设发现、实验规划与实施、科学写作和同行评审等四个关键阶段的独特作用。我们的综述全面展示了任务特定的方法论和评估基准,并识别了当前面临的挑战,提出了未来的研究方向。通过强调LLMs的变革潜力,本文旨在激励和指导研究人员和从业者利用LLMs推动科学探索。'}}}, {'id': 'https://huggingface.co/papers/2501.04575', 'title': 'InfiGUIAgent: A Multimodal Generalist GUI Agent with Native Reasoning and Reflection', 'url': 'https://huggingface.co/papers/2501.04575', 'abstract': 'Graphical User Interface (GUI) Agents, powered by multimodal large language models (MLLMs), have shown great potential for task automation on computing devices such as computers and mobile phones. However, existing agents face challenges in multi-step reasoning and reliance on textual annotations, limiting their effectiveness. We introduce InfiGUIAgent, an MLLM-based GUI Agent trained with a two-stage supervised fine-tuning pipeline. Stage 1 enhances fundamental skills such as GUI understanding and grounding, while Stage 2 integrates hierarchical reasoning and expectation-reflection reasoning skills using synthesized data to enable native reasoning abilities of the agents. InfiGUIAgent achieves competitive performance on several GUI benchmarks, highlighting the impact of native reasoning skills in enhancing GUI interaction for automation tasks. 
Resources are available at https://github.com/Reallm-Labs/InfiGUIAgent.', 'score': 14, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '501c7ba58ede235b', 'authors': ['Yuhang Liu', 'Pengxiang Li', 'Zishu Wei', 'Congkai Xie', 'Xueyu Hu', 'Xinchen Xu', 'Shengyu Zhang', 'Xiaotian Han', 'Hongxia Yang', 'Fei Wu'], 'affiliations': ['ByteDance Inc', 'Dalian University of Technology', 'Reallm Labs', 'The Hong Kong Polytechnic University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04575.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#training', '#agents', '#multimodal', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Умный агент GUI: новый уровень автоматизации интерфейсов', 'desc': 'InfiGUIAgent - это агент графического пользовательского интерфейса, основанный на мультимодальных больших языковых моделях (MLLM). Он обучается с помощью двухэтапного процесса точной настройки, который улучшает базовые навыки понимания GUI и развивает способности к иерархическому рассуждению. InfiGUIAgent демонстрирует высокую эффективность в автоматизации задач взаимодействия с GUI, превосходя существующие подходы. Разработка направлена на преодоление ограничений, связанных с многошаговыми рассуждениями и зависимостью от текстовых аннотаций.'}, 'en': {'title': 'Empowering GUI Agents with Native Reasoning Skills', 'desc': "InfiGUIAgent is a new type of Graphical User Interface (GUI) agent that uses multimodal large language models (MLLMs) to improve task automation on devices like computers and smartphones. This agent addresses the limitations of existing systems by employing a two-stage supervised fine-tuning process. The first stage focuses on developing basic skills such as understanding and interacting with GUIs, while the second stage enhances the agent's ability to perform complex reasoning tasks. As a result, InfiGUIAgent demonstrates strong performance on various GUI benchmarks, showcasing the importance of advanced reasoning capabilities in automating GUI interactions."}, 'zh': {'title': '提升GUI交互的原生推理能力', 'desc': '本文介绍了一种名为InfiGUIAgent的图形用户界面(GUI)代理,它基于多模态大型语言模型(MLLM)进行任务自动化。InfiGUIAgent通过两阶段的监督微调流程进行训练,第一阶段提升了GUI理解和基础技能,第二阶段则通过合成数据整合了层次推理和期望反思推理能力。该代理在多个GUI基准测试中表现出色,显示了原生推理能力在增强GUI交互中的重要性。此研究为提高计算设备上的自动化任务提供了新的思路和方法。'}}}, {'id': 'https://huggingface.co/papers/2501.02772', 'title': 'GeAR: Generation Augmented Retrieval', 'url': 'https://huggingface.co/papers/2501.02772', 'abstract': 'Document retrieval techniques form the foundation for the development of large-scale information systems. The prevailing methodology is to construct a bi-encoder and compute the semantic similarity. However, such scalar similarity is difficult to reflect enough information and impedes our comprehension of the retrieval results. In addition, this computational process mainly emphasizes the global semantics and ignores the fine-grained semantic relationship between the query and the complex text in the document. In this paper, we propose a new method called Generation Augmented Retrieval (GeAR) that incorporates well-designed fusion and decoding modules. This enables GeAR to generate the relevant text from documents based on the fused representation of the query and the document, thus learning to "focus on" the fine-grained information. Also when used as a retriever, GeAR does not add any computational burden over bi-encoders. 
To support the training of the new framework, we have introduced a pipeline to efficiently synthesize high-quality data by utilizing large language models. GeAR exhibits competitive retrieval and localization performance across diverse scenarios and datasets. Moreover, the qualitative analysis and the results generated by GeAR provide novel insights into the interpretation of retrieval results. The code, data, and models will be released after completing technical review to facilitate future research.', 'score': 11, 'issue_id': 1572, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'dafa87428ce906b5', 'authors': ['Haoyu Liu', 'Shaohan Huang', 'Jianfeng Liu', 'Yuefeng Zhan', 'Hao Sun', 'Weiwei Deng', 'Feng Sun', 'Furu Wei', 'Qi Zhang'], 'affiliations': ['Microsoft Corporation'], 'pdf_title_img': 'assets/pdf/title_img/2501.02772.jpg', 'data': {'categories': ['#interpretability', '#data', '#rag', '#synthetic', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'GeAR: Новый взгляд на извлечение документов через генерацию', 'desc': 'Статья предлагает новый метод извлечения документов под названием Generation Augmented Retrieval (GeAR). В отличие от традиционных би-энкодеров, GeAR использует модули слияния и декодирования для генерации релевантного текста на основе запроса и документа. Это позволяет модели фокусироваться на детальной информации, не увеличивая вычислительную нагрузку. Авторы также разработали конвейер для синтеза качественных данных с помощью больших языковых моделей для обучения GeAR.'}, 'en': {'title': 'GeAR: Enhancing Document Retrieval with Fine-Grained Semantic Focus', 'desc': 'This paper introduces a new method called Generation Augmented Retrieval (GeAR) that enhances document retrieval techniques by focusing on fine-grained semantic relationships. Unlike traditional bi-encoders that primarily assess global semantics, GeAR generates relevant text from documents by fusing the query and document representations. This approach allows for a deeper understanding of retrieval results without increasing computational costs. Additionally, the authors provide a pipeline for synthesizing high-quality training data using large language models, leading to improved performance across various datasets.'}, 'zh': {'title': '生成增强检索:关注细粒度信息的创新方法', 'desc': '本文提出了一种新的文档检索方法,称为生成增强检索(GeAR)。GeAR通过融合查询和文档的表示,生成相关文本,从而关注细粒度信息。与传统的双编码器方法相比,GeAR在检索时不会增加计算负担,同时在多种场景和数据集上表现出竞争力的检索和定位性能。该方法还通过利用大型语言模型合成高质量数据,支持新框架的训练。'}}}, {'id': 'https://huggingface.co/papers/2501.04144', 'title': 'Chirpy3D: Continuous Part Latents for Creative 3D Bird Generation', 'url': 'https://huggingface.co/papers/2501.04144', 'abstract': 'In this paper, we push the boundaries of fine-grained 3D generation into truly creative territory. Current methods either lack intricate details or simply mimic existing objects -- we enable both. By lifting 2D fine-grained understanding into 3D through multi-view diffusion and modeling part latents as continuous distributions, we unlock the ability to generate entirely new, yet plausible parts through interpolation and sampling. A self-supervised feature consistency loss further ensures stable generation of these unseen parts. The result is the first system capable of creating novel 3D objects with species-specific details that transcend existing examples. While we demonstrate our approach on birds, the underlying framework extends beyond things that can chirp! 
Code will be released at https://github.com/kamwoh/chirpy3d.', 'score': 9, 'issue_id': 1578, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '89e2fad397bf0684', 'authors': ['Kam Woh Ng', 'Jing Yang', 'Jia Wei Sii', 'Jiankang Deng', 'Chee Seng Chan', 'Yi-Zhe Song', 'Tao Xiang', 'Xiatian Zhu'], 'affiliations': ['Imperial College London', 'Universiti Malaya', 'University of Cambridge', 'University of Surrey'], 'pdf_title_img': 'assets/pdf/title_img/2501.04144.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🐦', 'ru': {'title': 'Генерация креативных 3D-моделей с беспрецедентной детализацией', 'desc': 'Эта статья представляет новый метод генерации детализированных 3D-объектов, выходящий за рамки простого копирования существующих примеров. Авторы используют мультиракурсную диффузию и моделирование латентных представлений частей объекта как непрерывных распределений. Это позволяет создавать совершенно новые, но правдоподобные части объектов путем интерполяции и сэмплирования. Самоконтролируемая функция потерь обеспечивает стабильную генерацию этих невиданных ранее частей.'}, 'en': {'title': 'Unlocking Creative 3D Generation with Fine-Grained Detail', 'desc': 'This paper introduces a novel approach to generating detailed 3D objects that are not just replicas of existing items. By utilizing multi-view diffusion and treating part latents as continuous distributions, the authors enable the creation of new and realistic 3D parts through interpolation and sampling techniques. A self-supervised feature consistency loss is implemented to maintain stability in generating these novel parts. The system is demonstrated on birds, showcasing its ability to produce unique species-specific details, while the framework is applicable to a broader range of objects.'}, 'zh': {'title': '突破性细粒度3D生成,创造全新物体!', 'desc': '本文提出了一种创新的细粒度3D生成方法,能够创造出全新的3D物体,而不仅仅是模仿现有物体。我们通过多视角扩散将2D细粒度理解提升到3D,并将部分潜变量建模为连续分布,从而实现了新部件的插值和采样生成。自监督特征一致性损失确保了这些未见部件的稳定生成。我们的系统能够生成具有特定物种细节的全新3D对象,超越了现有的示例。'}}}, {'id': 'https://huggingface.co/papers/2501.04689', 'title': 'SPAR3D: Stable Point-Aware Reconstruction of 3D Objects from Single Images', 'url': 'https://huggingface.co/papers/2501.04689', 'abstract': 'We study the problem of single-image 3D object reconstruction. Recent works have diverged into two directions: regression-based modeling and generative modeling. Regression methods efficiently infer visible surfaces, but struggle with occluded regions. Generative methods handle uncertain regions better by modeling distributions, but are computationally expensive and the generation is often misaligned with visible surfaces. In this paper, we present SPAR3D, a novel two-stage approach aiming to take the best of both directions. The first stage of SPAR3D generates sparse 3D point clouds using a lightweight point diffusion model, which has a fast sampling speed. The second stage uses both the sampled point cloud and the input image to create highly detailed meshes. Our two-stage design enables probabilistic modeling of the ill-posed single-image 3D task while maintaining high computational efficiency and great output fidelity. Using point clouds as an intermediate representation further allows for interactive user edits. Evaluated on diverse datasets, SPAR3D demonstrates superior performance over previous state-of-the-art methods, at an inference speed of 0.7 seconds. 
Project page with code and model: https://spar3d.github.io', 'score': 9, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '00474027a65aa27c', 'authors': ['Zixuan Huang', 'Mark Boss', 'Aaryaman Vasishta', 'James M. Rehg', 'Varun Jampani'], 'affiliations': ['Stability AI', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.04689.jpg', 'data': {'categories': ['#3d'], 'emoji': '🧊', 'ru': {'title': 'SPAR3D: Эффективная реконструкция 3D-объектов с использованием облаков точек', 'desc': 'В статье представлен новый двухэтапный подход SPAR3D для реконструкции 3D-объектов по одному изображению. На первом этапе генерируется разреженное облако точек с помощью легковесной модели диффузии точек. На втором этапе используются сгенерированное облако точек и исходное изображение для создания детализированных 3D-моделей. Этот метод сочетает преимущества регрессионного и генеративного моделирования, обеспечивая высокую вычислительную эффективность и качество результатов.'}, 'en': {'title': 'SPAR3D: Efficient and Detailed 3D Reconstruction from a Single Image', 'desc': 'This paper introduces SPAR3D, a new method for reconstructing 3D objects from a single image. It combines regression and generative modeling to efficiently create 3D point clouds and detailed meshes. The first stage generates sparse point clouds quickly, while the second stage refines these into high-quality meshes using the input image. SPAR3D achieves high fidelity and speed, outperforming existing methods and allowing for user interaction with the 3D output.'}, 'zh': {'title': 'SPAR3D:高效的单图像三维重建新方法', 'desc': '我们研究了单幅图像的三维物体重建问题。最近的研究分为两种方向:基于回归的建模和生成建模。回归方法能够有效推断可见表面,但在处理遮挡区域时表现不佳;而生成方法通过建模分布更好地处理不确定区域,但计算开销大且生成结果常常与可见表面不对齐。本文提出了SPAR3D,这是一种新颖的两阶段方法,旨在结合两种方法的优点,快速生成稀疏的三维点云,并利用输入图像创建高细节的网格。'}}}, {'id': 'https://huggingface.co/papers/2501.03271', 'title': 'DPO Kernels: A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich Paradigm for Direct Preference Optimization', 'url': 'https://huggingface.co/papers/2501.03271', 'abstract': 'The rapid rise of large language models (LLMs) has unlocked many applications but also underscores the challenge of aligning them with diverse values and preferences. Direct Preference Optimization (DPO) is central to alignment but constrained by fixed divergences and limited feature transformations. We propose DPO-Kernels, which integrates kernel methods to address these issues through four key contributions: (i) Kernelized Representations with polynomial, RBF, Mahalanobis, and spectral kernels for richer transformations, plus a hybrid loss combining embedding-based and probability-based objectives; (ii) Divergence Alternatives (Jensen-Shannon, Hellinger, Renyi, Bhattacharyya, Wasserstein, and f-divergences) for greater stability; (iii) Data-Driven Selection metrics that automatically choose the best kernel-divergence pair; and (iv) a Hierarchical Mixture of Kernels for both local precision and global modeling. Evaluations on 12 datasets demonstrate state-of-the-art performance in factuality, safety, reasoning, and instruction following. 
Grounded in Heavy-Tailed Self-Regularization, DPO-Kernels maintains robust generalization for LLMs, offering a comprehensive resource for further alignment research.', 'score': 5, 'issue_id': 1576, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '33d1640aee045ed5', 'authors': ['Amitava Das', 'Suranjana Trivedy', 'Danush Khanna', 'Rajarshi Roy', 'Gurpreet Singh', 'Basab Ghosh', 'Yaswanth Narsupalli', 'Vinija Jain', 'Vasu Sharma', 'Aishwarya Naresh Reganti', 'Aman Chadha'], 'affiliations': ['Amazon AI, USA', 'Artificial Intelligence Institute, University of South Carolina, USA', 'Meta AI, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03271.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#reasoning', '#dataset', '#training'], 'emoji': '🧠', 'ru': {'title': 'DPO-Kernels: Новый подход к выравниванию языковых моделей', 'desc': 'Статья представляет новый метод под названием DPO-Kernels для улучшения выравнивания больших языковых моделей (LLM) с различными ценностями и предпочтениями. Авторы предлагают использовать методы ядер для расширения возможностей прямой оптимизации предпочтений (DPO), включая кернелизованные представления, альтернативные дивергенции и data-driven выбор наилучшей комбинации ядра и дивергенции. DPO-Kernels демонстрирует улучшенные результаты в задачах фактологичности, безопасности, рассуждений и следования инструкциям на 12 наборах данных. Метод основан на саморегуляризации с тяжелыми хвостами и обеспечивает надежную генерализацию для LLM.'}, 'en': {'title': 'Enhancing LLM Alignment with DPO-Kernels', 'desc': 'This paper introduces DPO-Kernels, a method designed to improve the alignment of large language models (LLMs) with diverse user values. It enhances Direct Preference Optimization (DPO) by incorporating kernel methods, allowing for more flexible feature transformations and better divergence measures. The approach includes a hybrid loss function, various divergence alternatives, and data-driven selection metrics to optimize performance. Evaluations show that DPO-Kernels achieves state-of-the-art results in key areas such as factuality and safety across multiple datasets.'}, 'zh': {'title': 'DPO-Kernels:提升大型语言模型对齐的创新方法', 'desc': '大型语言模型(LLMs)的快速发展带来了许多应用,但也突显了与多样化价值观和偏好对齐的挑战。直接偏好优化(DPO)是对齐的核心,但受到固定散度和有限特征变换的限制。我们提出了DPO-Kernels,通过四个关键贡献来解决这些问题,包括使用多项式、RBF、Mahalanobis和谱核的核化表示,以及结合嵌入基础和基于概率的目标的混合损失。我们的评估在12个数据集上展示了在事实性、安全性、推理和指令遵循方面的最先进性能,DPO-Kernels为进一步的对齐研究提供了全面的资源。'}}}, {'id': 'https://huggingface.co/papers/2501.04694', 'title': 'EpiCoder: Encompassing Diversity and Complexity in Code Generation', 'url': 'https://huggingface.co/papers/2501.04694', 'abstract': 'Effective instruction tuning is indispensable for optimizing code LLMs, aligning model behavior with user expectations and enhancing model performance in real-world applications. However, most existing methods focus on code snippets, which are limited to specific functionalities and rigid structures, restricting the complexity and diversity of the synthesized data. To address these limitations, we introduce a novel feature tree-based synthesis framework inspired by Abstract Syntax Trees (AST). Unlike AST, which captures syntactic structure of code, our framework models semantic relationships between code elements, enabling the generation of more nuanced and diverse data. The feature tree is constructed from raw data and refined iteratively to increase the quantity and diversity of the extracted features. 
This process enables the identification of more complex patterns and relationships within the code. By sampling subtrees with controlled depth and breadth, our framework allows precise adjustments to the complexity of the generated code, supporting a wide range of tasks from simple function-level operations to intricate multi-file scenarios. We fine-tuned widely-used base models to create the EpiCoder series, achieving state-of-the-art performance at both the function and file levels across multiple benchmarks. Notably, empirical evidence indicates that our approach shows significant potential in synthesizing highly complex repository-level code data. Further analysis elucidates the merits of this approach by rigorously assessing data complexity and diversity through software engineering principles and LLM-as-a-judge method.', 'score': 4, 'issue_id': 1581, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c1ef93cdfc23c2f', 'authors': ['Yaoxiang Wang', 'Haoling Li', 'Xin Zhang', 'Jie Wu', 'Xiao Liu', 'Wenxiang Hu', 'Zhongxin Guo', 'Yangyu Huang', 'Ying Xin', 'Yujiu Yang', 'Jinsong Su', 'Qi Chen', 'Scarlett Li'], 'affiliations': ['Microsoft', 'Tsinghua University', 'Xiamen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04694.jpg', 'data': {'categories': ['#dataset', '#data', '#synthetic', '#training', '#optimization', '#alignment', '#architecture'], 'emoji': '🌳', 'ru': {'title': 'Дерево признаков: новый путь к улучшению языковых моделей для кода', 'desc': 'Статья представляет новый подход к улучшению языковых моделей для программирования с использованием дерева признаков, вдохновленного абстрактными синтаксическими деревьями. Этот метод позволяет генерировать более сложные и разнообразные обучающие данные, моделируя семантические связи между элементами кода. Авторы создали серию моделей EpiCoder, достигших высоких результатов в нескольких бенчмарках. Эмпирические данные показывают потенциал метода для синтеза сложных репозиториев кода.'}, 'en': {'title': 'Unlocking Code Complexity with Feature Trees', 'desc': 'This paper presents a new framework for instruction tuning in code language models (LLMs) that enhances their performance by generating more complex and diverse code data. The proposed feature tree-based synthesis framework goes beyond traditional code snippet methods by modeling semantic relationships between code elements, inspired by Abstract Syntax Trees (AST). By iteratively refining the feature tree, the framework captures intricate patterns and relationships, allowing for the generation of code that ranges from simple functions to complex multi-file scenarios. The authors demonstrate that their fine-tuned EpiCoder models achieve state-of-the-art results across various benchmarks, highlighting the effectiveness of their approach in synthesizing complex repository-level code data.'}, 'zh': {'title': '特征树框架:提升代码生成的复杂性与多样性', 'desc': '本论文提出了一种新的特征树合成框架,用于优化代码大语言模型(LLMs)的指令调优。该框架通过建模代码元素之间的语义关系,克服了现有方法在功能和结构上的局限性,从而生成更复杂和多样化的数据。特征树从原始数据构建,并通过迭代精炼,增加提取特征的数量和多样性。最终,我们通过微调广泛使用的基础模型,创建了EpiCoder系列,在多个基准测试中实现了函数和文件级别的最先进性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04652', 'title': 'Multi-task retriever fine-tuning for domain-specific and efficient RAG', 'url': 'https://huggingface.co/papers/2501.04652', 'abstract': 'Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying Large Language Models (LLMs), as it can address typical limitations such as generating hallucinated or outdated information. 
However, when building real-world RAG applications, practical issues arise. First, the retrieved information is generally domain-specific. Since it is computationally expensive to fine-tune LLMs, it is more feasible to fine-tune the retriever to improve the quality of the data included in the LLM input. Second, as more applications are deployed in the same real-world system, one cannot afford to deploy separate retrievers. Moreover, these RAG applications normally retrieve different kinds of data. Our solution is to instruction fine-tune a small retriever encoder on a variety of domain-specific tasks to allow us to deploy one encoder that can serve many use cases, thereby achieving low-cost, scalability, and speed. We show how this encoder generalizes to out-of-domain settings as well as to an unseen retrieval task on real-world enterprise use cases.', 'score': 1, 'issue_id': 1584, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c906eb3ec9e3da5', 'authors': ['Patrice Béchard', 'Orlando Marquez Ayala'], 'affiliations': ['ServiceNow'], 'pdf_title_img': 'assets/pdf/title_img/2501.04652.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#hallucinations', '#rag', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Универсальный извлекатель информации для эффективного RAG', 'desc': 'Данная статья представляет новый подход к улучшению систем извлечения информации для крупных языковых моделей. Авторы предлагают дообучать небольшой энкодер для извлечения информации на различных доменно-специфичных задачах. Это позволяет использовать один энкодер для множества приложений, обеспечивая масштабируемость и эффективность. Исследование показывает, что такой подход хорошо обобщается на новые домены и задачи извлечения информации в реальных корпоративных сценариях.'}, 'en': {'title': 'One Retriever to Rule Them All: Scalable RAG Solutions', 'desc': 'This paper discusses the challenges of using Retrieval-Augmented Generation (RAG) with Large Language Models (LLMs), particularly the issues of domain-specific information retrieval and the high cost of fine-tuning LLMs. The authors propose a solution that involves instruction fine-tuning a small retriever encoder on multiple domain-specific tasks, allowing it to serve various applications without needing separate retrievers. This approach enhances the quality of data fed into the LLM while maintaining low costs and scalability. The results demonstrate that the fine-tuned encoder can effectively generalize to new, unseen tasks in real-world scenarios.'}, 'zh': {'title': '一个编码器,多种应用,低成本高效能', 'desc': '检索增强生成(RAG)在部署大型语言模型(LLM)时变得非常普遍,因为它可以解决生成虚假或过时信息的典型问题。本文提出了一种解决方案,通过对小型检索器编码器进行指令微调,使其能够在多种特定领域任务上工作,从而实现一个编码器服务多个用例。这样可以降低成本,提高可扩展性和速度,同时避免为每个应用程序部署单独的检索器。我们的实验表明,该编码器在不同领域设置和未见过的检索任务中也能很好地泛化。'}}}, {'id': 'https://huggingface.co/papers/2501.00958', 'title': '2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining', 'url': 'https://huggingface.co/papers/2501.00958', 'abstract': 'Compared to image-text pair data, interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. However, such existing datasets are crawled from webpage, facing challenges like low knowledge density, loose image-text relations, and poor logical coherence between images. 
On the other hand, the internet hosts vast instructional videos (e.g., online geometry courses) that are widely used by humans to learn foundational subjects, yet these valuable resources remain underexplored in VLM training. In this paper, we introduce a high-quality multimodal textbook corpus with richer foundational knowledge for VLM pretraining. It collects over 2.5 years of instructional videos, totaling 22,000 class hours. We first use an LLM-proposed taxonomy to systematically gather instructional videos. Then we progressively extract and refine visual (keyframes), audio (ASR), and textual knowledge (OCR) from the videos, and organize them as an image-text interleaved corpus based on temporal order. Compared to its counterparts, our video-centric textbook offers more coherent context, richer knowledge, and better image-text alignment. Experiments demonstrate its superb pretraining performance, particularly in knowledge- and reasoning-intensive tasks like ScienceQA and MathVista. Moreover, VLMs pre-trained on our textbook exhibit outstanding interleaved context awareness, leveraging visual and textual cues in their few-shot context for task solving. Our code is available at https://github.com/DAMO-NLP-SG/multimodal_textbook.', 'score': 68, 'issue_id': 1475, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'b10f0cd62f6334fc', 'authors': ['Wenqi Zhang', 'Hang Zhang', 'Xin Li', 'Jiashuo Sun', 'Yongliang Shen', 'Weiming Lu', 'Deli Zhao', 'Yueting Zhuang', 'Lidong Bing'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University', 'DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.00958.jpg', 'data': {'categories': ['#science', '#dataset', '#reasoning', '#multimodal', '#cv', '#video'], 'emoji': '📚', 'ru': {'title': 'Мультимодальный учебник: новый стандарт для обучения VLM', 'desc': 'Эта статья представляет новый подход к обучению моделей компьютерного зрения и обработки естественного языка (VLM) с использованием мультимодального учебного корпуса. Авторы создали базу данных из 22 000 часов обучающих видео, систематически собранных с помощью таксономии, предложенной языковой моделью (LLM). Этот корпус отличается более высокой плотностью знаний, лучшей связью между изображениями и текстом, а также логической согласованностью по сравнению с существующими наборами данных. Эксперименты показывают превосходную производительность предобучения на этом корпусе, особенно в задачах, требующих глубоких знаний и рассуждений.'}, 'en': {'title': 'Harnessing Instructional Videos for Superior Vision-Language Model Training', 'desc': 'This paper presents a new approach to training Vision-Language Models (VLMs) using a multimodal textbook corpus derived from instructional videos. Unlike traditional datasets that often suffer from low knowledge density and weak image-text relationships, this corpus offers a richer and more coherent context for VLM pretraining. The authors systematically extract visual, audio, and textual information from over 22,000 hours of instructional content, enhancing the alignment between images and text. 
Experiments show that VLMs trained on this video-centric dataset perform significantly better on knowledge-intensive tasks, demonstrating improved reasoning and context awareness.'}, 'zh': {'title': '视频教材:提升视觉语言模型的知识与推理能力', 'desc': '本文提出了一种高质量的多模态教材语料库,旨在为视觉语言模型(VLM)提供更丰富的基础知识。该语料库收集了超过2.5年的教学视频,总计22,000小时,系统性地提取了视频中的视觉、音频和文本知识。与现有的数据集相比,这种视频中心的教材提供了更连贯的上下文、更丰富的知识和更好的图像-文本对齐。实验结果表明,基于该教材预训练的VLM在知识和推理密集型任务中表现优异,尤其在ScienceQA和MathVista等任务中。'}}}, {'id': 'https://huggingface.co/papers/2501.01427', 'title': 'VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control', 'url': 'https://huggingface.co/papers/2501.01427', 'abstract': 'Despite significant advancements in video generation, inserting a given object into videos remains a challenging task. The difficulty lies in preserving the appearance details of the reference object and accurately modeling coherent motions at the same time. In this paper, we propose VideoAnydoor, a zero-shot video object insertion framework with high-fidelity detail preservation and precise motion control. Starting from a text-to-video model, we utilize an ID extractor to inject the global identity and leverage a box sequence to control the overall motion. To preserve the detailed appearance and meanwhile support fine-grained motion control, we design a pixel warper. It takes the reference image with arbitrary key-points and the corresponding key-point trajectories as inputs. It warps the pixel details according to the trajectories and fuses the warped features with the diffusion U-Net, thus improving detail preservation and supporting users in manipulating the motion trajectories. In addition, we propose a training strategy involving both videos and static images with a reweight reconstruction loss to enhance insertion quality. VideoAnydoor demonstrates significant superiority over existing methods and naturally supports various downstream applications (e.g., talking head generation, video virtual try-on, multi-region editing) without task-specific fine-tuning.', 'score': 39, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '4c67f688775a3eca', 'authors': ['Yuanpeng Tu', 'Hao Luo', 'Xi Chen', 'Sihui Ji', 'Xiang Bai', 'Hengshuang Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group', 'HUST', 'Hupan Lab', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.01427.jpg', 'data': {'categories': ['#diffusion', '#games', '#video'], 'emoji': '🎬', 'ru': {'title': 'Точная вставка объектов в видео с сохранением деталей', 'desc': 'В этой статье представлен VideoAnydoor - фреймворк для вставки объектов в видео без предварительного обучения. Он использует экстрактор идентификаторов и последовательность ограничивающих рамок для контроля движения объекта. Ключевым компонентом является пиксельный варпер, который сохраняет детали внешнего вида и позволяет точно управлять движением. Предложенная стратегия обучения с использованием видео и статических изображений улучшает качество вставки объектов.'}, 'en': {'title': 'Seamless Object Insertion in Videos with VideoAnydoor', 'desc': 'This paper introduces VideoAnydoor, a novel framework for zero-shot video object insertion that excels in maintaining high-fidelity details and precise motion control. The approach begins with a text-to-video model and incorporates an ID extractor to ensure consistent object identity while using a box sequence for motion management. 
A key innovation is the pixel warper, which adjusts pixel details based on key-point trajectories, enhancing both detail preservation and user control over motion. The proposed training strategy, which combines videos and static images with a reweighted reconstruction loss, significantly improves the quality of object insertion, making VideoAnydoor versatile for various applications without needing specific fine-tuning.'}, 'zh': {'title': '高保真视频对象插入的新突破', 'desc': '尽管视频生成技术取得了显著进展,但将特定对象插入视频仍然是一项具有挑战性的任务。本文提出了VideoAnydoor,这是一个零-shot视频对象插入框架,能够高保真地保留细节并精确控制运动。我们设计了一种像素变形器,能够根据关键点轨迹扭曲像素细节,并与扩散U-Net融合,从而提高细节保留能力。VideoAnydoor在现有方法中表现出显著优势,并支持多种下游应用,无需特定任务的微调。'}}}, {'id': 'https://huggingface.co/papers/2501.01257', 'title': 'CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings', 'url': 'https://huggingface.co/papers/2501.01257', 'abstract': 'With the increasing code reasoning capabilities of existing large language models (LLMs) and breakthroughs in reasoning models like OpenAI o1 and o3, there is a growing need to develop more challenging and comprehensive benchmarks that effectively test their sophisticated competition-level coding abilities. Existing benchmarks, like LiveCodeBench and USACO, fall short due to the unavailability of private test cases, lack of support for special judges, and misaligned execution environments. To bridge this gap, we introduce CodeElo, a standardized competition-level code generation benchmark that effectively addresses all these challenges for the first time. CodeElo benchmark is mainly based on the official CodeForces platform and tries to align with the platform as much as possible. We compile the recent six months of contest problems on CodeForces with detailed information such as contest divisions, problem difficulty ratings, and problem algorithm tags. We introduce a unique judging method in which problems are submitted directly to the platform and develop a reliable Elo rating calculation system that aligns with the platform and is comparable with human participants but has lower variance. By testing on our CodeElo, we provide the Elo ratings of 30 existing popular open-source and 3 proprietary LLMs for the first time. The results show that o1-mini and QwQ-32B-Preview stand out significantly, achieving Elo ratings of 1578 and 1261, respectively, while other models struggle even with the easiest problems, placing in the lowest 20 percent among all human participants. 
Detailed analysis experiments are also conducted to provide insights into performance across algorithms and comparisons between using C++ and Python, which can suggest directions for future studies.', 'score': 36, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'e31430bb6ba5dfc8', 'authors': ['Shanghaoran Quan', 'Jiaxi Yang', 'Bowen Yu', 'Bo Zheng', 'Dayiheng Liu', 'An Yang', 'Xuancheng Ren', 'Bofei Gao', 'Yibo Miao', 'Yunlong Feng', 'Zekun Wang', 'Jian Yang', 'Zeyu Cui', 'Yang Fan', 'Yichang Zhang', 'Binyuan Hui', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.01257.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#reasoning', '#optimization', '#open_source'], 'emoji': '🏆', 'ru': {'title': 'CodeElo: новый стандарт оценки LLM в соревновательном программировании', 'desc': 'Статья представляет новый бенчмарк CodeElo для оценки способностей больших языковых моделей (LLM) в решении задач по программированию соревновательного уровня. CodeElo основан на платформе CodeForces и включает проблемы с детальной информацией о сложности и алгоритмических тегах. Авторы разработали систему расчета рейтинга Эло, сопоставимую с рейтингами человеческих участников. Результаты тестирования 33 LLM показали, что модели o1-mini и QwQ-32B-Preview значительно превосходят остальные, достигая рейтингов 1578 и 1261 соответственно.'}, 'en': {'title': 'CodeElo: Elevating Code Generation Benchmarks for LLMs', 'desc': 'This paper presents CodeElo, a new benchmark designed to evaluate the coding abilities of large language models (LLMs) in a competitive setting. Unlike existing benchmarks, CodeElo addresses limitations such as the lack of private test cases and misaligned execution environments by utilizing the CodeForces platform. The benchmark includes a unique judging method and an Elo rating system that allows for fair comparisons between LLMs and human participants. Results indicate that certain models, like o1-mini, perform significantly better than others, highlighting the varying capabilities of LLMs in code generation tasks.'}, 'zh': {'title': 'CodeElo:提升代码生成能力的标准化基准测试', 'desc': '随着大型语言模型(LLMs)在代码推理能力上的提升,开发更具挑战性和全面性的基准测试变得愈发重要。现有的基准测试如LiveCodeBench和USACO存在一些不足,例如缺乏私有测试用例和特殊评判支持。为了解决这些问题,我们提出了CodeElo,这是一个标准化的竞赛级代码生成基准,首次有效应对这些挑战。通过在CodeForces平台上编译最近六个月的竞赛问题,我们为30个流行的开源和3个专有LLMs提供了Elo评分,结果显示o1-mini和QwQ-32B-Preview表现突出。'}}}, {'id': 'https://huggingface.co/papers/2501.00599', 'title': 'VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM', 'url': 'https://huggingface.co/papers/2501.00599', 'abstract': 'Video Large Language Models (Video LLMs) have recently exhibited remarkable capabilities in general video understanding. However, they mainly focus on holistic comprehension and struggle with capturing fine-grained spatial and temporal details. Besides, the lack of high-quality object-level video instruction data and a comprehensive benchmark further hinders their advancements. To tackle these challenges, we introduce the VideoRefer Suite to empower Video LLM for finer-level spatial-temporal video understanding, i.e., enabling perception and reasoning on any objects throughout the video. Specifically, we thoroughly develop VideoRefer Suite across three essential aspects: dataset, model, and benchmark. 
Firstly, we introduce a multi-agent data engine to meticulously curate a large-scale, high-quality object-level video instruction dataset, termed VideoRefer-700K. Next, we present the VideoRefer model, which equips a versatile spatial-temporal object encoder to capture precise regional and sequential representations. Finally, we meticulously create a VideoRefer-Bench to comprehensively assess the spatial-temporal understanding capability of a Video LLM, evaluating it across various aspects. Extensive experiments and analyses demonstrate that our VideoRefer model not only achieves promising performance on video referring benchmarks but also facilitates general video understanding capabilities.', 'score': 31, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'daee687ce36ef3db', 'authors': ['Yuqian Yuan', 'Hang Zhang', 'Wentong Li', 'Zesen Cheng', 'Boqiang Zhang', 'Long Li', 'Xin Li', 'Deli Zhao', 'Wenqiao Zhang', 'Yueting Zhuang', 'Jianke Zhu', 'Lidong Bing'], 'affiliations': ['DAMO Academy, Alibaba Group', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00599.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#dataset', '#optimization', '#video'], 'emoji': '🎥', 'ru': {'title': 'Точное пространственно-временное понимание видео с помощью VideoRefer Suite', 'desc': 'Статья представляет VideoRefer Suite - комплексный подход к улучшению пространственно-временного понимания видео большими языковыми моделями. Авторы разработали масштабный набор данных VideoRefer-700K с инструкциями на уровне объектов, созданный с помощью мультиагентного движка. Они также представили модель VideoRefer с универсальным пространственно-временным кодировщиком объектов. Для оценки возможностей видео-LLM был создан бенчмарк VideoRefer-Bench, охватывающий различные аспекты понимания видео.'}, 'en': {'title': 'Empowering Video LLMs for Fine-Grained Understanding', 'desc': 'This paper introduces the VideoRefer Suite, which enhances Video Large Language Models (Video LLMs) for better understanding of videos by focusing on fine-grained spatial and temporal details. It addresses the limitations of existing models that primarily focus on overall comprehension and lack high-quality object-level instruction data. The suite includes a new dataset called VideoRefer-700K, a specialized VideoRefer model with a spatial-temporal object encoder, and a benchmark for evaluating video understanding capabilities. Experimental results show that the VideoRefer model significantly improves performance on video referring tasks while also enhancing general video comprehension.'}, 'zh': {'title': '提升视频理解,细致捕捉空间与时间', 'desc': '视频大型语言模型(Video LLMs)在视频理解方面展现了出色的能力,但在捕捉细粒度的空间和时间细节上存在困难。为了应对这些挑战,我们提出了VideoRefer Suite,以增强视频LLM在空间-时间视频理解方面的能力。我们开发了一个多代理数据引擎,创建了一个高质量的对象级视频指令数据集VideoRefer-700K,并提出了VideoRefer模型,配备了多功能的空间-时间对象编码器。最后,我们创建了VideoRefer-Bench,以全面评估视频LLM的空间-时间理解能力,实验结果表明我们的模型在视频引用基准上表现优异。'}}}, {'id': 'https://huggingface.co/papers/2501.01423', 'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models', 'url': 'https://huggingface.co/papers/2501.01423', 'abstract': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. 
However, recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality, it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently, existing systems often settle for sub-optimal solutions, either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this, we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models, enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE, we build an enhanced DiT baseline with improved training strategies and architecture designs, termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. Models and codes are available at: https://github.com/hustvl/LightningDiT.', 'score': 30, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '173fa21b6e47d04c', 'authors': ['Jingfeng Yao', 'Xinggang Wang'], 'affiliations': ['Huazhong University of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.01423.jpg', 'data': {'categories': ['#training', '#optimization', '#cv', '#architecture', '#diffusion'], 'emoji': '⚡', 'ru': {'title': 'Революция в латентных диффузионных моделях: быстрее, лучше, эффективнее', 'desc': 'Статья представляет новый подход к улучшению латентных диффузионных моделей с архитектурой Трансформер для генерации изображений высокого качества. Авторы предлагают метод VA-VAE, который выравнивает латентное пространство с предобученными моделями компьютерного зрения. Это позволяет значительно расширить границы реконструкции-генерации и ускорить сходимость Диффузионных Трансформеров в высокоразмерных латентных пространствах. На основе VA-VAE авторы создали улучшенную модель LightningDiT, достигающую современного уровня производительности на задаче генерации изображений ImageNet 256x256.'}, 'en': {'title': 'Accelerating Image Generation with Aligned Latent Spaces', 'desc': 'This paper discusses the challenges faced by latent diffusion models, particularly when using Transformer architectures for image generation. It highlights an optimization issue where increasing the feature dimensions in visual tokenizers can lead to larger models and longer training times, often resulting in sub-optimal image quality. The authors propose a solution by aligning the latent space with pre-trained vision models, introducing a new framework called VA-VAE to enhance the training process. 
Their improved model, LightningDiT, achieves state-of-the-art performance in image generation while significantly speeding up the training process.'}, 'zh': {'title': '优化潜在扩散模型,提升图像生成效率', 'desc': '本论文探讨了潜在扩散模型与变换器架构在生成高质量图像时的优化困境。研究表明,虽然增加视觉标记器中的每个标记特征维度可以提高重建质量,但这也导致需要更大的扩散模型和更多的训练迭代。为了解决这一问题,作者提出将潜在空间与预训练的视觉基础模型对齐,从而提高训练效率。最终,提出的VA-VAE模型显著提升了潜在扩散模型的重建生成能力,并在ImageNet数据集上实现了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00103', 'title': 'LTX-Video: Realtime Video Latent Diffusion', 'url': 'https://huggingface.co/papers/2501.00103', 'abstract': "We introduce LTX-Video, a transformer-based latent diffusion model that adopts a holistic approach to video generation by seamlessly integrating the responsibilities of the Video-VAE and the denoising transformer. Unlike existing methods, which treat these components as independent, LTX-Video aims to optimize their interaction for improved efficiency and quality. At its core is a carefully designed Video-VAE that achieves a high compression ratio of 1:192, with spatiotemporal downscaling of 32 x 32 x 8 pixels per token, enabled by relocating the patchifying operation from the transformer's input to the VAE's input. Operating in this highly compressed latent space enables the transformer to efficiently perform full spatiotemporal self-attention, which is essential for generating high-resolution videos with temporal consistency. However, the high compression inherently limits the representation of fine details. To address this, our VAE decoder is tasked with both latent-to-pixel conversion and the final denoising step, producing the clean result directly in pixel space. This approach preserves the ability to generate fine details without incurring the runtime cost of a separate upsampling module. Our model supports diverse use cases, including text-to-video and image-to-video generation, with both capabilities trained simultaneously. It achieves faster-than-real-time generation, producing 5 seconds of 24 fps video at 768x512 resolution in just 2 seconds on an Nvidia H100 GPU, outperforming all existing models of similar scale. The source code and pre-trained models are publicly available, setting a new benchmark for accessible and scalable video generation.", 'score': 29, 'issue_id': 1484, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'a2358f7cf156ff08', 'authors': ['Yoav HaCohen', 'Nisan Chiprut', 'Benny Brazowski', 'Daniel Shalem', 'Dudu Moshe', 'Eitan Richardson', 'Eran Levin', 'Guy Shiran', 'Nir Zabari', 'Ori Gordon', 'Poriya Panet', 'Sapir Weissbuch', 'Victor Kulikov', 'Yaki Bitterman', 'Zeev Melumian', 'Ofir Bibi'], 'affiliations': ['Lightricks'], 'pdf_title_img': 'assets/pdf/title_img/2501.00103.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#video', '#diffusion'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: быстрее реального времени', 'desc': 'LTX-Video - это трансформерная модель латентной диффузии для генерации видео. Она объединяет функции Video-VAE и шумоподавляющего трансформера, оптимизируя их взаимодействие. Модель использует сильно сжатое латентное пространство, позволяя трансформеру эффективно выполнять полное пространственно-временное самовнимание. 
LTX-Video поддерживает генерацию видео из текста и изображений, превосходя существующие модели по скорости и качеству.'}, 'en': {'title': 'Revolutionizing Video Generation with LTX-Video', 'desc': "LTX-Video is a novel transformer-based latent diffusion model designed for efficient video generation by integrating the roles of Video-VAE and denoising transformers. It achieves a high compression ratio of 1:192, allowing the model to operate in a compressed latent space while maintaining spatiotemporal self-attention for generating high-resolution videos. The model's VAE decoder performs both latent-to-pixel conversion and denoising, enabling the generation of fine details without the need for a separate upsampling module. With capabilities for text-to-video and image-to-video generation, LTX-Video produces videos faster than real-time, setting a new standard in the field."}, 'zh': {'title': 'LTX-Video:高效视频生成的新标准', 'desc': 'LTX-Video是一种基于变换器的潜在扩散模型,旨在通过整合视频生成中的Video-VAE和去噪变换器的功能来提高效率和质量。该模型的核心是一个高压缩比的Video-VAE,能够在压缩的潜在空间中高效执行时空自注意力,从而生成高分辨率且具有时间一致性的视频。为了克服高压缩带来的细节损失,VAE解码器同时负责潜在到像素的转换和最终的去噪步骤,直接在像素空间中生成清晰的结果。LTX-Video支持多种应用场景,包括文本到视频和图像到视频的生成,并且在Nvidia H100 GPU上以超实时速度生成视频,设立了视频生成的新基准。'}}}, {'id': 'https://huggingface.co/papers/2501.01264', 'title': 'ProgCo: Program Helps Self-Correction of Large Language Models', 'url': 'https://huggingface.co/papers/2501.01264', 'abstract': 'Self-Correction aims to enable large language models (LLMs) to self-verify and self-refine their initial responses without external feedback. However, LLMs often fail to effectively self-verify and generate correct feedback, further misleading refinement and leading to the failure of self-correction, especially in complex reasoning tasks. In this paper, we propose Program-driven Self-Correction (ProgCo). First, program-driven verification (ProgVe) achieves complex verification logic and extensive validation through self-generated, self-executing verification pseudo-programs. Then, program-driven refinement (ProgRe) receives feedback from ProgVe, conducts dual reflection and refinement on both responses and verification programs to mitigate the misleading effects of incorrect feedback in complex reasoning tasks. Experiments on three instruction-following and mathematical benchmarks indicate that ProgCo achieves effective self-correction, and can further enhance performance when combined with real program tools.', 'score': 22, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'bda3f96e83319526', 'authors': ['Xiaoshuai Song', 'Yanan Wu', 'Weixun Wang', 'Jiaheng Liu', 'Wenbo Su', 'Bo Zheng'], 'affiliations': ['Taobao & Tmall Group of Alibaba'], 'pdf_title_img': 'assets/pdf/title_img/2501.01264.jpg', 'data': {'categories': ['#training', '#math', '#reasoning', '#interpretability', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'ProgCo: Самокоррекция языковых моделей через программно-управляемую верификацию и уточнение', 'desc': 'Эта статья представляет новый подход к самокоррекции больших языковых моделей (LLM) под названием Program-driven Self-Correction (ProgCo). Метод включает в себя программно-управляемую верификацию (ProgVe), которая использует самогенерируемые и самовыполняющиеся псевдопрограммы для сложной логики проверки. Затем программно-управляемое уточнение (ProgRe) проводит двойную рефлексию и улучшение как ответов, так и программ верификации. 
Эксперименты показали, что ProgCo эффективен в самокоррекции и может дополнительно улучшить производительность при комбинировании с реальными программными инструментами.'}, 'en': {'title': 'Empowering LLMs with Program-Driven Self-Correction', 'desc': 'This paper introduces Program-driven Self-Correction (ProgCo) to improve the self-verification and self-refinement capabilities of large language models (LLMs). It addresses the common issue where LLMs struggle to provide accurate feedback, which can lead to incorrect refinements, particularly in complex reasoning tasks. ProgCo utilizes program-driven verification (ProgVe) to create self-executing verification pseudo-programs that enhance the verification process. Additionally, program-driven refinement (ProgRe) allows the model to reflect on and refine both its responses and the verification programs, leading to more reliable self-correction outcomes.'}, 'zh': {'title': '基于程序的自我纠正:提升语言模型的自我验证能力', 'desc': '自我纠正旨在使大型语言模型(LLMs)能够在没有外部反馈的情况下自我验证和自我完善其初始响应。然而,LLMs往往无法有效自我验证并生成正确的反馈,这会进一步误导其完善过程,尤其是在复杂推理任务中。本文提出了基于程序的自我纠正(ProgCo),通过自生成、自执行的验证伪程序实现复杂的验证逻辑和广泛的验证。实验结果表明,ProgCo在三个指令遵循和数学基准测试中实现了有效的自我纠正,并且与真实程序工具结合时可以进一步提升性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00316', 'title': 'MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models', 'url': 'https://huggingface.co/papers/2501.00316', 'abstract': "Recent advancements in foundation models have enhanced AI systems' capabilities in autonomous tool usage and reasoning. However, their ability in location or map-based reasoning - which improves daily life by optimizing navigation, facilitating resource discovery, and streamlining logistics - has not been systematically studied. To bridge this gap, we introduce MapEval, a benchmark designed to assess diverse and complex map-based user queries with geo-spatial reasoning. MapEval features three task types (textual, API-based, and visual) that require collecting world information via map tools, processing heterogeneous geo-spatial contexts (e.g., named entities, travel distances, user reviews or ratings, images), and compositional reasoning, which all state-of-the-art foundation models find challenging. Comprising 700 unique multiple-choice questions about locations across 180 cities and 54 countries, MapEval evaluates foundation models' ability to handle spatial relationships, map infographics, travel planning, and navigation challenges. Using MapEval, we conducted a comprehensive evaluation of 28 prominent foundation models. While no single model excelled across all tasks, Claude-3.5-Sonnet, GPT-4o, and Gemini-1.5-Pro achieved competitive performance overall. However, substantial performance gaps emerged, particularly in MapEval, where agents with Claude-3.5-Sonnet outperformed GPT-4o and Gemini-1.5-Pro by 16% and 21%, respectively, and the gaps became even more amplified when compared to open-source LLMs. Our detailed analyses provide insights into the strengths and weaknesses of current models, though all models still fall short of human performance by more than 20% on average, struggling with complex map images and rigorous geo-spatial reasoning. 
This gap highlights MapEval's critical role in advancing general-purpose foundation models with stronger geo-spatial understanding.", 'score': 20, 'issue_id': 1477, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'a4e45c6bd9d30ff4', 'authors': ['Mahir Labib Dihan', 'Md Tanvir Hassan', 'Md Tanvir Parvez', 'Md Hasebul Hasan', 'Md Almash Alam', 'Muhammad Aamir Cheema', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Bangladesh Computer Council (BCC)', 'Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Monash University', 'Qatar Computing Research Institute (QCRI)', 'Statistics, Islamic University Bangladesh'], 'pdf_title_img': 'assets/pdf/title_img/2501.00316.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#multimodal', '#survey'], 'emoji': '🗺️', 'ru': {'title': 'MapEval: Новый рубеж в геопространственном ИИ', 'desc': 'Статья представляет MapEval - новый бенчмарк для оценки способностей моделей искусственного интеллекта в области пространственных рассуждений и работы с картами. MapEval включает 700 вопросов с множественным выбором, охватывающих 180 городов и 54 страны, и оценивает навыки моделей в понимании пространственных отношений, инфографики карт, планирования путешествий и навигации. Авторы провели оценку 28 ведущих фундаментальных моделей, выявив значительные различия в производительности, при этом все модели все еще отстают от человеческого уровня более чем на 20%. Результаты исследования подчеркивают важность MapEval для развития моделей с более сильным геопространственным пониманием.'}, 'en': {'title': "Enhancing AI's Geo-Spatial Reasoning with MapEval", 'desc': 'This paper introduces MapEval, a benchmark designed to evaluate the performance of foundation models in map-based reasoning tasks. It focuses on assessing how well these models can handle complex geo-spatial queries, which are essential for navigation and resource discovery. The benchmark includes various task types that require models to process diverse information, such as travel distances and user reviews, and perform compositional reasoning. The evaluation reveals that while some models perform competitively, they still lag behind human capabilities, indicating a need for further advancements in geo-spatial understanding within AI systems.'}, 'zh': {'title': '提升地图推理能力的基准评估', 'desc': '最近基础模型的进展提升了人工智能系统在自主工具使用和推理方面的能力。然而,它们在基于位置或地图的推理能力上尚未得到系统研究,这对于优化导航、资源发现和物流管理至关重要。为了解决这个问题,我们引入了MapEval,一个旨在评估复杂地图用户查询的基准,涉及地理空间推理。MapEval包含700个关于180个城市和54个国家的独特多项选择题,评估基础模型在处理空间关系、地图信息、旅行规划和导航挑战方面的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.01149', 'title': 'A3: Android Agent Arena for Mobile GUI Agents', 'url': 'https://huggingface.co/papers/2501.01149', 'abstract': 'AI agents have become increasingly prevalent in recent years, driven by significant advancements in the field of large language models (LLMs). Mobile GUI agents, a subset of AI agents, are designed to autonomously perform tasks on mobile devices. While numerous studies have introduced agents, datasets, and benchmarks to advance mobile GUI agent research, many existing datasets focus on static frame evaluations and fail to provide a comprehensive platform for assessing performance on real-world, in-the-wild tasks. To address this gap, we present Android Agent Arena (A3), a novel evaluation platform. 
Unlike existing in-the-wild systems, A3 offers: (1) meaningful and practical tasks, such as real-time online information retrieval and operational instructions; (2) a larger, more flexible action space, enabling compatibility with agents trained on any dataset; and (3) automated business-level LLM-based evaluation process. A3 includes 21 widely used general third-party apps and 201 tasks representative of common user scenarios, providing a robust foundation for evaluating mobile GUI agents in real-world situations and a new autonomous evaluation process for less human labor and coding expertise. The project is available at https://yuxiangchai.github.io/Android-Agent-Arena/.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '050f155aa526c100', 'authors': ['Yuxiang Chai', 'Hanhao Li', 'Jiayu Zhang', 'Liang Liu', 'Guozhi Wang', 'Shuai Ren', 'Siyuan Huang', 'Hongsheng Li'], 'affiliations': ['EE department @ CUHK', 'MMLab @ CUHK'], 'pdf_title_img': 'assets/pdf/title_img/2501.01149.jpg', 'data': {'categories': ['#benchmark', '#dataset', '#agents'], 'emoji': '🤖', 'ru': {'title': 'A3: Арена для тестирования мобильных AI-агентов в реальном мире', 'desc': 'Статья представляет новую платформу для оценки мобильных GUI-агентов под названием Android Agent Arena (A3). A3 предлагает реалистичные задачи, широкое пространство действий и автоматизированную оценку на основе больших языковых моделей. Платформа включает 21 популярное стороннее приложение и 201 задачу, отражающую типичные пользовательские сценарии. A3 позволяет оценивать производительность агентов в реальных условиях, что отличает её от существующих статических наборов данных.'}, 'en': {'title': 'Revolutionizing Mobile GUI Agent Evaluation with A3', 'desc': 'This paper introduces the Android Agent Arena (A3), a new evaluation platform for mobile GUI agents that addresses limitations in existing datasets. A3 focuses on real-world tasks, providing a larger action space that accommodates agents trained on various datasets. It features 21 popular third-party apps and 201 tasks that reflect common user scenarios, enhancing the assessment of agent performance. Additionally, A3 incorporates an automated evaluation process using large language models, reducing the need for extensive human involvement and coding skills.'}, 'zh': {'title': 'Android Agent Arena:移动GUI代理的新评估平台', 'desc': '近年来,人工智能代理的应用越来越广泛,尤其是在大型语言模型(LLMs)领域的进步推动下。移动图形用户界面(GUI)代理是人工智能代理的一种,旨在自主执行移动设备上的任务。现有的研究虽然提出了许多代理、数据集和基准,但大多数数据集仅关注静态框架评估,无法全面评估真实世界中的任务表现。为了解决这一问题,我们提出了Android Agent Arena(A3),这是一个新颖的评估平台,提供了实际的任务和更灵活的操作空间,支持基于LLM的自动化评估过程。'}}}, {'id': 'https://huggingface.co/papers/2501.00192', 'title': 'MLLM-as-a-Judge for Image Safety without Human Labeling', 'url': 'https://huggingface.co/papers/2501.00192', 'abstract': 'Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which however brings a series of drawbacks. 
First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose an MLLM-based method that includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experimental results demonstrate that our method is highly effective for zero-shot image safety judgment tasks.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '2a62bcbb87c1b7a5', 'authors': ['Zhenting Wang', 'Shuming Hu', 'Shiyu Zhao', 'Xiaowen Lin', 'Felix Juefei-Xu', 'Zhuowei Li', 'Ligong Han', 'Harihar Subramanyam', 'Li Chen', 'Jianfa Chen', 'Nan Jiang', 'Lingjuan Lyu', 'Shiqing Ma', 'Dimitris N. Metaxas', 'Ankit Jain'], 'affiliations': ['GenAI @ Meta', 'Rutgers University', 'UMass Amherst', 'Westlake University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00192.jpg', 'data': {'categories': ['#reasoning', '#training', '#ethics', '#cv', '#multimodal'], 'emoji': '🛡️', 'ru': {'title': 'Интеллектуальная защита: Zero-shot оценка безопасности изображений с помощью MLLM', 'desc': 'Статья представляет метод определения безопасности изображений с использованием мультимодальных больших языковых моделей (MLLM) в режиме zero-shot. Авторы предлагают подход, включающий объективизацию правил безопасности, оценку релевантности между правилами и изображениями, и быстрое принятие решений на основе дебиасированных вероятностей токенов. Метод также включает каскадные цепочки рассуждений для более глубокого анализа при необходимости. Эксперименты показывают высокую эффективность предложенного метода для задач оценки безопасности изображений без предварительного обучения.'}, 'en': {'title': 'Zero-Shot Image Safety Detection with MLLMs', 'desc': 'This paper addresses the challenge of identifying unsafe images in the context of AI-generated content using Multimodal Large Language Models (MLLMs). The authors propose a novel approach that allows for zero-shot detection of harmful images by utilizing predefined safety rules without the need for extensive human labeling. They highlight the limitations of traditional methods, such as the subjectivity of safety rules and the biases present in models. 
The proposed method enhances safety judgment by objectifying rules, assessing their relevance to images, and employing a reasoning process that simplifies complex safety guidelines.'}, 'zh': {'title': '利用MLLMs实现零样本图像安全判断', 'desc': '随着在线平台视觉媒体的兴起,图像内容安全成为一个重要挑战。许多图像生成模型能够产生有害内容,因此识别不安全图像变得至关重要。我们提出了一种基于预训练多模态大语言模型(MLLMs)的方法,通过查询这些模型来检测不安全图像,而无需依赖人工标注。实验结果表明,我们的方法在零样本图像安全判断任务中非常有效。'}}}, {'id': 'https://huggingface.co/papers/2501.01426', 'title': 'Unifying Specialized Visual Encoders for Video Language Models', 'url': 'https://huggingface.co/papers/2501.01426', 'abstract': 'The recent advent of Large Language Models (LLMs) has ushered sophisticated reasoning capabilities into the realm of video through Video Large Language Models (VideoLLMs). However, VideoLLMs currently rely on a single vision encoder for all of their visual processing, which limits the amount and type of visual information that can be conveyed to the LLM. Our method, MERV, Multi-Encoder Representation of Videos, instead leverages multiple frozen visual encoders to create a unified representation of a video, providing the VideoLLM with a comprehensive set of specialized visual knowledge. Spatio-temporally aligning the features from each encoder allows us to tackle a wider range of open-ended and multiple-choice video understanding questions and outperform prior state-of-the-art works. MERV is up to 3.7% better in accuracy than Video-LLaVA across the standard suite video understanding benchmarks, while also having a better Video-ChatGPT score. We also improve upon SeViLA, the previous best on zero-shot Perception Test accuracy, by 2.2%. MERV introduces minimal extra parameters and trains faster than equivalent single-encoder methods while parallelizing the visual processing. Finally, we provide qualitative evidence that MERV successfully captures domain knowledge from each of its encoders. Our results offer promising directions in utilizing multiple vision encoders for comprehensive video understanding.', 'score': 19, 'issue_id': 1488, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'c868a7ebcbafa704', 'authors': ['Jihoon Chung', 'Tyler Zhu', 'Max Gonzalez Saez-Diez', 'Juan Carlos Niebles', 'Honglu Zhou', 'Olga Russakovsky'], 'affiliations': ['Princeton University', 'Salesforce Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.01426.jpg', 'data': {'categories': ['#architecture', '#reasoning', '#video', '#benchmark', '#multimodal', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'MERV: Многоэнкодерное представление видео для улучшенного машинного понимания', 'desc': 'Статья представляет MERV - новый метод для улучшения понимания видео с помощью больших языковых моделей. MERV использует несколько замороженных визуальных энкодеров для создания единого представления видео, что позволяет охватить больший объем визуальной информации. Этот подход превосходит предыдущие методы в точности на стандартных тестах понимания видео. MERV вводит минимальное количество дополнительных параметров и обучается быстрее, чем эквивалентные методы с одним энкодером.'}, 'en': {'title': 'Unlocking Video Understanding with Multi-Encoder Magic!', 'desc': 'This paper introduces MERV, a method that enhances Video Large Language Models (VideoLLMs) by using multiple visual encoders instead of just one. By combining the outputs of these encoders, MERV creates a richer representation of videos, which helps the model understand complex video content better. 
The approach allows for improved performance on various video understanding tasks, achieving higher accuracy than previous models. Additionally, MERV is efficient, requiring fewer parameters and training time while effectively leveraging the strengths of each encoder.'}, 'zh': {'title': '多编码器提升视频理解能力', 'desc': '本文介绍了一种名为MERV(多编码器视频表示)的方法,旨在提升视频理解的能力。MERV通过使用多个冻结的视觉编码器,创建视频的统一表示,从而为视频大型语言模型(VideoLLM)提供更全面的视觉知识。通过时空对齐每个编码器的特征,MERV能够更好地处理开放式和多选的视频理解问题,且在准确性上超越了之前的最佳模型。该方法不仅提高了性能,还在参数和训练速度上优于单编码器方法,展示了多视觉编码器在视频理解中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01054', 'title': 'Dynamic Scaling of Unit Tests for Code Reward Modeling', 'url': 'https://huggingface.co/papers/2501.01054', 'abstract': 'Current large language models (LLMs) often struggle to produce accurate responses on the first attempt for complex reasoning tasks like code generation. Prior research tackles this challenge by generating multiple candidate solutions and validating them with LLM-generated unit tests. The execution results of unit tests serve as reward signals to identify correct solutions. As LLMs always confidently make mistakes, these unit tests are not reliable, thereby diminishing the quality of reward signals. Motivated by the observation that scaling the number of solutions improves LLM performance, we explore the impact of scaling unit tests to enhance reward signal quality. Our pioneer experiment reveals a positive correlation between the number of unit tests and reward signal quality, with greater benefits observed in more challenging problems. Based on these insights, we propose CodeRM-8B, a lightweight yet effective unit test generator that enables efficient and high-quality unit test scaling. Additionally, we implement a dynamic scaling mechanism that adapts the number of unit tests based on problem difficulty, further improving efficiency. Experimental results show that our approach significantly improves performance across various models on three benchmarks (e.g., with gains of 18.43% for Llama3-8B and 3.42% for GPT-4o-mini on HumanEval Plus).', 'score': 15, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '33b9590f2acb0e48', 'authors': ['Zeyao Ma', 'Xiaokang Zhang', 'Jing Zhang', 'Jifan Yu', 'Sijia Luo', 'Jie Tang'], 'affiliations': ['Key Laboratory of Data Engineering and Knowledge Engineering, Beijing, China', 'School of Information, Renmin University of China', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01054.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#training', '#small_models', '#rlhf', '#optimization'], 'emoji': '🧪', 'ru': {'title': 'Масштабирование юнит-тестов для повышения точности LLM в сложных задачах', 'desc': 'Эта статья посвящена улучшению точности больших языковых моделей (LLM) в задачах сложного мышления, таких как генерация кода. Авторы предлагают метод масштабирования юнит-тестов для повышения качества сигналов вознаграждения при оценке решений. Они разработали легковесный генератор юнит-тестов CodeRM-8B и механизм динамического масштабирования, адаптирующийся к сложности задачи. Эксперименты показали значительное улучшение производительности различных моделей на нескольких тестовых наборах.'}, 'en': {'title': 'Enhancing LLM Performance through Scaled Unit Testing', 'desc': 'This paper addresses the limitations of large language models (LLMs) in generating accurate responses for complex tasks like code generation. 
It highlights the issue of unreliable reward signals from LLM-generated unit tests, which can lead to incorrect solutions. The authors propose a novel approach, CodeRM-8B, which generates a larger number of unit tests to improve the quality of these reward signals. Their experiments demonstrate that scaling unit tests enhances LLM performance, particularly for more challenging problems, leading to significant improvements across various models.'}, 'zh': {'title': '提升单元测试质量,增强模型性能', 'desc': '当前的大型语言模型(LLMs)在复杂推理任务(如代码生成)中,往往难以在第一次尝试时产生准确的响应。以往的研究通过生成多个候选解决方案并使用LLM生成的单元测试进行验证来应对这一挑战。单元测试的执行结果作为奖励信号,用于识别正确的解决方案。然而,由于LLMs常常自信地犯错,这些单元测试的可靠性不足,从而降低了奖励信号的质量。我们提出了CodeRM-8B,一个轻量级且有效的单元测试生成器,能够高效地扩展单元测试,并根据问题的难度动态调整单元测试的数量,从而进一步提高效率。'}}}, {'id': 'https://huggingface.co/papers/2501.01320', 'title': 'SeedVR: Seeding Infinity in Diffusion Transformer Towards Generic Video Restoration', 'url': 'https://huggingface.co/papers/2501.01320', 'abstract': "Video restoration poses non-trivial challenges in maintaining fidelity while recovering temporally consistent details from unknown degradations in the wild. Despite recent advances in diffusion-based restoration, these methods often face limitations in generation capability and sampling efficiency. In this work, we present SeedVR, a diffusion transformer designed to handle real-world video restoration with arbitrary length and resolution. The core design of SeedVR lies in the shifted window attention that facilitates effective restoration on long video sequences. SeedVR further supports variable-sized windows near the boundary of both spatial and temporal dimensions, overcoming the resolution constraints of traditional window attention. Equipped with contemporary practices, including causal video autoencoder, mixed image and video training, and progressive training, SeedVR achieves highly-competitive performance on both synthetic and real-world benchmarks, as well as AI-generated videos. Extensive experiments demonstrate SeedVR's superiority over existing methods for generic video restoration.", 'score': 8, 'issue_id': 1479, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'fa277e5baed864a4', 'authors': ['Jianyi Wang', 'Zhijie Lin', 'Meng Wei', 'Yang Zhao', 'Ceyuan Yang', 'Chen Change Loy', 'Lu Jiang'], 'affiliations': ['ByteDance', 'Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01320.jpg', 'data': {'categories': ['#architecture', '#benchmark', '#long_context', '#video', '#training', '#diffusion', '#synthetic'], 'emoji': '🎥', 'ru': {'title': 'SeedVR: Восстановление видео нового поколения с помощью диффузионных трансформеров', 'desc': 'SeedVR - это диффузионный трансформер для восстановления видео в реальных условиях. Он использует сдвинутое оконное внимание для эффективной обработки длинных видеопоследовательностей. SeedVR поддерживает окна переменного размера на границах пространственных и временных измерений, преодолевая ограничения традиционного оконного внимания. Благодаря современным практикам, таким как каузальный видеоавтоэнкодер и прогрессивное обучение, SeedVR достигает высоких результатов на синтетических и реальных тестовых наборах.'}, 'en': {'title': 'SeedVR: Revolutionizing Video Restoration with Diffusion Transformers', 'desc': 'This paper introduces SeedVR, a novel diffusion transformer aimed at improving video restoration by effectively managing long sequences and varying resolutions. 
It utilizes shifted window attention to enhance the restoration process, allowing for better handling of temporal consistency and fidelity in videos. SeedVR incorporates advanced techniques such as causal video autoencoders and mixed training strategies to boost its performance on both synthetic and real-world datasets. The results show that SeedVR outperforms existing video restoration methods, making it a significant advancement in the field.'}, 'zh': {'title': 'SeedVR:高效的视频修复新方法', 'desc': '视频修复面临着在恢复未知退化的同时保持细节一致性的挑战。尽管基于扩散的修复方法有所进展,但它们在生成能力和采样效率上仍存在局限性。本文提出了SeedVR,这是一种专为处理任意长度和分辨率的真实视频修复而设计的扩散变换器。SeedVR通过移动窗口注意力机制,有效地处理长视频序列,并在空间和时间维度的边界附近支持可变大小的窗口,克服了传统窗口注意力的分辨率限制。'}}}, {'id': 'https://huggingface.co/papers/2412.21015', 'title': 'MapQaTor: A System for Efficient Annotation of Map Query Datasets', 'url': 'https://huggingface.co/papers/2412.21015', 'abstract': 'Mapping and navigation services like Google Maps, Apple Maps, Openstreet Maps, are essential for accessing various location-based data, yet they often struggle to handle natural language geospatial queries. Recent advancements in Large Language Models (LLMs) show promise in question answering (QA), but creating reliable geospatial QA datasets from map services remains challenging. We introduce MapQaTor, a web application that streamlines the creation of reproducible, traceable map-based QA datasets. With its plug-and-play architecture, MapQaTor enables seamless integration with any maps API, allowing users to gather and visualize data from diverse sources with minimal setup. By caching API responses, the platform ensures consistent ground truth, enhancing the reliability of the data even as real-world information evolves. MapQaTor centralizes data retrieval, annotation, and visualization within a single platform, offering a unique opportunity to evaluate the current state of LLM-based geospatial reasoning while advancing their capabilities for improved geospatial understanding. Evaluation metrics show that, MapQaTor speeds up the annotation process by at least 30 times compared to manual methods, underscoring its potential for developing geospatial resources, such as complex map reasoning datasets. The website is live at: https://mapqator.github.io/ and a demo video is available at: https://youtu.be/7_aV9Wmhs6Q.', 'score': 8, 'issue_id': 1477, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '0d1081756b5bc4f7', 'authors': ['Mahir Labib Dihan', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Qatar Computing Research Institute (QCRI)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21015.jpg', 'data': {'categories': ['#dataset', '#science', '#reasoning', '#data', '#benchmark'], 'emoji': '🗺️', 'ru': {'title': 'MapQaTor: Революция в создании геопространственных данных для ИИ', 'desc': 'MapQaTor - это веб-приложение, которое упрощает создание воспроизводимых наборов данных для вопросно-ответных систем на основе карт. Оно интегрируется с любым картографическим API и позволяет собирать и визуализировать данные из различных источников. MapQaTor кэширует ответы API, обеспечивая согласованность данных, и централизует процессы сбора, аннотации и визуализации. 
Приложение ускоряет процесс аннотации в 30 раз по сравнению с ручными методами, что делает его полезным инструментом для развития геопространственных ресурсов и оценки возможностей больших языковых моделей в области геопространственных рассуждений.'}, 'en': {'title': 'Streamlining Geospatial QA with MapQaTor', 'desc': 'This paper presents MapQaTor, a web application designed to facilitate the creation of geospatial question answering (QA) datasets using map services. It leverages recent advancements in Large Language Models (LLMs) to improve the handling of natural language queries related to locations. The platform features a plug-and-play architecture that integrates with various maps APIs, allowing users to efficiently gather, annotate, and visualize geospatial data. By caching API responses, MapQaTor ensures consistent and reliable data, significantly speeding up the annotation process and enhancing the evaluation of LLM-based geospatial reasoning capabilities.'}, 'zh': {'title': 'MapQaTor:提升地图问答数据集创建效率的利器', 'desc': '本文介绍了MapQaTor,一个用于创建地图问答数据集的网络应用程序。它利用大型语言模型的优势,简化了从地图服务生成可重复和可追溯的数据集的过程。MapQaTor支持与任何地图API的无缝集成,并通过缓存API响应来确保数据的一致性。该平台显著提高了数据标注的效率,展示了在地理空间推理方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01407', 'title': 'Nested Attention: Semantic-aware Attention Values for Concept Personalization', 'url': 'https://huggingface.co/papers/2501.01407', 'abstract': "Personalizing text-to-image models to generate images of specific subjects across diverse scenes and styles is a rapidly advancing field. Current approaches often face challenges in maintaining a balance between identity preservation and alignment with the input text prompt. Some methods rely on a single textual token to represent a subject, which limits expressiveness, while others employ richer representations but disrupt the model's prior, diminishing prompt alignment. In this work, we introduce Nested Attention, a novel mechanism that injects a rich and expressive image representation into the model's existing cross-attention layers. Our key idea is to generate query-dependent subject values, derived from nested attention layers that learn to select relevant subject features for each region in the generated image. We integrate these nested layers into an encoder-based personalization method, and show that they enable high identity preservation while adhering to input text prompts. Our approach is general and can be trained on various domains. Additionally, its prior preservation allows us to combine multiple personalized subjects from different domains in a single image.", 'score': 7, 'issue_id': 1487, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '537e7bcc16fb17f5', 'authors': ['Or Patashnik', 'Rinon Gal', 'Daniil Ostashev', 'Sergey Tulyakov', 'Kfir Aberman', 'Daniel Cohen-Or'], 'affiliations': ['Snap Research', 'Tel Aviv University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01407.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#cv'], 'emoji': '🎨', 'ru': {'title': 'Nested Attention: новый подход к персонализации генерации изображений', 'desc': "Статья представляет новый метод под названием 'Nested Attention' для персонализации моделей text-to-image. Этот механизм внедряет богатое и выразительное представление изображения в существующие слои кросс-внимания модели. Ключевая идея заключается в генерации зависимых от запроса значений субъекта, полученных из вложенных слоев внимания. 
Метод позволяет достичь высокого сохранения идентичности при соблюдении входных текстовых подсказок."}, 'en': {'title': 'Nested Attention: Balancing Identity and Text Alignment in Image Generation', 'desc': 'This paper presents a new method called Nested Attention for personalizing text-to-image models. The method addresses the challenge of balancing identity preservation of subjects with the alignment to text prompts. By using query-dependent subject values from nested attention layers, the model can effectively select relevant features for each part of the generated image. This approach not only maintains high identity fidelity but also allows for the integration of multiple personalized subjects from different domains into a single image.'}, 'zh': {'title': '嵌套注意力:个性化图像生成的新方法', 'desc': '本文介绍了一种新的机制,称为嵌套注意力,用于个性化文本到图像模型。该方法通过在模型的交叉注意力层中注入丰富的图像表示,解决了身份保留与文本提示对齐之间的平衡问题。嵌套注意力层能够为生成图像的每个区域选择相关的主题特征,从而实现高效的个性化。我们的研究表明,这种方法可以在多个领域进行训练,并允许在单个图像中结合来自不同领域的多个个性化主题。'}}}, {'id': 'https://huggingface.co/papers/2501.00658', 'title': 'Understanding and Mitigating Bottlenecks of State Space Models through the Lens of Recency and Over-smoothing', 'url': 'https://huggingface.co/papers/2501.00658', 'abstract': "Structured State Space Models (SSMs) have emerged as alternatives to transformers. While SSMs are often regarded as effective in capturing long-sequence dependencies, we rigorously demonstrate that they are inherently limited by strong recency bias. Our empirical studies also reveal that this bias impairs the models' ability to recall distant information and introduces robustness issues. Our scaling experiments then discovered that deeper structures in SSMs can facilitate the learning of long contexts. However, subsequent theoretical analysis reveals that as SSMs increase in depth, they exhibit another inevitable tendency toward over-smoothing, e.g., token representations becoming increasingly indistinguishable. This fundamental dilemma between recency and over-smoothing hinders the scalability of existing SSMs. Inspired by our theoretical findings, we propose to polarize two channels of the state transition matrices in SSMs, setting them to zero and one, respectively, simultaneously addressing recency bias and over-smoothing. Experiments demonstrate that our polarization technique consistently enhances the associative recall accuracy of long-range tokens and unlocks SSMs to benefit further from deeper architectures. All source codes are released at https://github.com/VITA-Group/SSM-Bottleneck.", 'score': 6, 'issue_id': 1476, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '253304ea64defbe0', 'authors': ['Peihao Wang', 'Ruisi Cai', 'Yuehao Wang', 'Jiajun Zhu', 'Pragya Srivastava', 'Zhangyang Wang', 'Pan Li'], 'affiliations': ['Georgia Tech', 'Google DeepMind', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00658.jpg', 'data': {'categories': ['#training', '#open_source', '#long_context', '#optimization', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Преодоление ограничений SSM: баланс между недавностью и сглаживанием', 'desc': 'Структурированные модели пространства состояний (SSM) рассматриваются как альтернатива трансформерам в обработке длинных последовательностей. Исследование показало, что SSM имеют существенное ограничение в виде сильного смещения к недавним данным, что затрудняет запоминание отдаленной информации. 
Увеличение глубины SSM улучшает обработку длинных контекстов, но приводит к проблеме чрезмерного сглаживания. Авторы предлагают метод поляризации каналов матриц перехода состояний для решения этих проблем, что улучшает точность ассоциативного извлечения дальних токенов.'}, 'en': {'title': 'Balancing Recency and Over-Smoothing in SSMs', 'desc': "This paper discusses Structured State Space Models (SSMs) as alternatives to transformers, highlighting their limitations due to strong recency bias. This bias affects the models' ability to remember distant information and creates robustness issues. The authors propose a solution by polarizing the state transition matrices, which helps mitigate both recency bias and over-smoothing that occurs with deeper architectures. Their experiments show that this new approach improves the accuracy of recalling long-range tokens, allowing SSMs to effectively utilize deeper structures."}, 'zh': {'title': '解决近期偏见与过平滑的双重挑战', 'desc': '结构状态空间模型(SSMs)作为变换器的替代方案,虽然在捕捉长序列依赖性方面表现出色,但存在强烈的近期偏见限制。我们的实证研究表明,这种偏见影响了模型对远程信息的回忆能力,并引入了鲁棒性问题。通过扩展实验,我们发现SSMs的深层结构可以促进长上下文的学习,但理论分析显示,随着深度增加,模型会出现过平滑的趋势,使得标记表示变得难以区分。我们提出的极化技术通过将状态转移矩阵的两个通道设置为零和一,解决了近期偏见和过平滑的问题,显著提高了长距离标记的关联回忆准确性。'}}}, {'id': 'https://huggingface.co/papers/2501.01245', 'title': 'SeFAR: Semi-supervised Fine-grained Action Recognition with Temporal Perturbation and Learning Stabilization', 'url': 'https://huggingface.co/papers/2501.01245', 'abstract': 'Human action understanding is crucial for the advancement of multimodal systems. While recent developments, driven by powerful large language models (LLMs), aim to be general enough to cover a wide range of categories, they often overlook the need for more specific capabilities. In this work, we address the more challenging task of Fine-grained Action Recognition (FAR), which focuses on detailed semantic labels within shorter temporal duration (e.g., "salto backward tucked with 1 turn"). Given the high costs of annotating fine-grained labels and the substantial data needed for fine-tuning LLMs, we propose to adopt semi-supervised learning (SSL). Our framework, SeFAR, incorporates several innovative designs to tackle these challenges. Specifically, to capture sufficient visual details, we construct Dual-level temporal elements as more effective representations, based on which we design a new strong augmentation strategy for the Teacher-Student learning paradigm through involving moderate temporal perturbation. Furthermore, to handle the high uncertainty within the teacher model\'s predictions for FAR, we propose the Adaptive Regulation to stabilize the learning process. Experiments show that SeFAR achieves state-of-the-art performance on two FAR datasets, FineGym and FineDiving, across various data scopes. It also outperforms other semi-supervised methods on two classical coarse-grained datasets, UCF101 and HMDB51. Further analysis and ablation studies validate the effectiveness of our designs. 
Additionally, we show that the features extracted by our SeFAR could largely promote the ability of multimodal foundation models to understand fine-grained and domain-specific semantics.', 'score': 5, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '30d94590a5c78569', 'authors': ['Yongle Huang', 'Haodong Chen', 'Zhenbang Xu', 'Zihan Jia', 'Haozhou Sun', 'Dian Shao'], 'affiliations': ['School of Automation, Northwestern Polytechnical University, Xian, China', 'School of Computer Science, Northwestern Polytechnical University, Xian, China', 'School of Software, Northwestern Polytechnical University, Xian, China', 'Unmanned System Research Institute, Northwestern Polytechnical University, Xian, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01245.jpg', 'data': {'categories': ['#dataset', '#transfer_learning', '#multimodal', '#optimization', '#training'], 'emoji': '🤸', 'ru': {'title': 'SeFAR: Прорыв в распознавании детализированных действий с помощью полу-контролируемого обучения', 'desc': 'Статья представляет новый подход к задаче распознавания детализированных действий (Fine-grained Action Recognition, FAR) с использованием полу-контролируемого обучения. Авторы предлагают фреймворк SeFAR, который включает в себя двухуровневые временные элементы для более эффективного представления действий и новую стратегию аугментации данных. SeFAR также использует адаптивную регуляцию для стабилизации процесса обучения при работе с неопределенностью в предсказаниях модели-учителя. Эксперименты показывают, что SeFAR достигает лучших результатов на нескольких наборах данных FAR и классических наборах данных для распознавания действий.'}, 'en': {'title': 'SeFAR: Elevating Fine-grained Action Recognition with Semi-supervised Learning', 'desc': "This paper focuses on improving Fine-grained Action Recognition (FAR), which identifies specific actions in short time frames. The authors introduce a semi-supervised learning framework called SeFAR, which uses innovative techniques to enhance the learning process despite the challenges of limited labeled data. They develop Dual-level temporal elements for better visual representation and implement a strong augmentation strategy within a Teacher-Student learning setup. The results demonstrate that SeFAR achieves top performance on FAR datasets and enhances multimodal models' understanding of detailed actions."}, 'zh': {'title': '细粒度动作识别的新突破', 'desc': '人类动作理解对多模态系统的发展至关重要。本文提出了一种新的框架SeFAR,专注于细粒度动作识别(FAR),旨在处理短时间内的详细语义标签。我们采用半监督学习(SSL)来减少对大量标注数据的需求,并通过构建双层时间元素和新的强增强策略来提高模型的表现。实验结果表明,SeFAR在多个数据集上达到了最先进的性能,证明了我们设计的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.00910', 'title': 'Population Aware Diffusion for Time Series Generation', 'url': 'https://huggingface.co/papers/2501.00910', 'abstract': 'Diffusion models have shown promising ability in generating high-quality time series (TS) data. Despite the initial success, existing works mostly focus on the authenticity of data at the individual level, but pay less attention to preserving the population-level properties on the entire dataset. Such population-level properties include value distributions for each dimension and distributions of certain functional dependencies (e.g., cross-correlation, CC) between different dimensions. For instance, when generating house energy consumption TS data, the value distributions of the outside temperature and the kitchen temperature should be preserved, as well as the distribution of CC between them. 
Preserving such TS population-level properties is critical in maintaining the statistical insights of the datasets, mitigating model bias, and augmenting downstream tasks like TS prediction. Yet, it is often overlooked by existing models. Hence, data generated by existing models often bear distribution shifts from the original data. We propose Population-aware Diffusion for Time Series (PaD-TS), a new TS generation model that better preserves the population-level properties. The key novelties of PaD-TS include 1) a new training method explicitly incorporating TS population-level property preservation, and 2) a new dual-channel encoder model architecture that better captures the TS data structure. Empirical results in major benchmark datasets show that PaD-TS can improve the average CC distribution shift score between real and synthetic data by 5.9x while maintaining a performance comparable to state-of-the-art models on individual-level authenticity.', 'score': 4, 'issue_id': 1486, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'cd3f9282d55e15f2', 'authors': ['Yang Li', 'Han Meng', 'Zhenyu Bi', 'Ingolv T. Urnes', 'Haipeng Chen'], 'affiliations': ['Generated Health', 'Virginia Tech', 'William & Mary'], 'pdf_title_img': 'assets/pdf/title_img/2501.00910.jpg', 'data': {'categories': ['#synthetic', '#benchmark', '#dataset', '#data', '#training', '#architecture', '#diffusion'], 'emoji': '📊', 'ru': {'title': 'Генерация временных рядов с сохранением свойств популяции', 'desc': 'Статья представляет новую модель генерации временных рядов под названием PaD-TS (Population-aware Diffusion for Time Series). Модель нацелена на сохранение свойств на уровне популяции, таких как распределения значений и функциональные зависимости между измерениями. PaD-TS использует новый метод обучения, явно включающий сохранение свойств временных рядов на уровне популяции, и новую архитектуру модели с двухканальным энкодером. Эмпирические результаты показывают значительное улучшение в сохранении распределения кросс-корреляций при сравнимой аутентичности на индивидуальном уровне.'}, 'en': {'title': 'Preserving Population Insights in Time Series Generation', 'desc': 'This paper introduces a new model called Population-aware Diffusion for Time Series (PaD-TS) that focuses on generating time series data while preserving important population-level properties. Unlike previous models that mainly ensure individual data authenticity, PaD-TS emphasizes maintaining the overall statistical characteristics of the dataset, such as value distributions and cross-correlations between different dimensions. The model employs a novel training method and a dual-channel encoder architecture to effectively capture the structure of time series data. 
Experimental results demonstrate that PaD-TS significantly reduces distribution shifts in generated data while achieving comparable performance in individual-level authenticity to existing state-of-the-art models.'}, 'zh': {'title': '保留人口级特性,提升时间序列生成质量', 'desc': '扩散模型在生成高质量时间序列数据方面表现出色。然而,现有研究主要关注个体数据的真实性,而忽视了整个数据集的人口级特性。我们提出了一种新的时间序列生成模型PaD-TS,旨在更好地保留这些人口级特性,包括值分布和不同维度之间的交叉相关性。实验结果表明,PaD-TS在保持个体级真实性的同时,显著改善了真实数据与合成数据之间的分布差异。'}}}, {'id': 'https://huggingface.co/papers/2501.00712', 'title': 'Rethinking Addressing in Language Models via Contexualized Equivariant Positional Encoding', 'url': 'https://huggingface.co/papers/2501.00712', 'abstract': 'Transformers rely on both content-based and position-based addressing mechanisms to make predictions, but existing positional encoding techniques often diminish the effectiveness of position-based addressing. Many current methods enforce rigid patterns in attention maps, limiting the ability to model long-range dependencies and adapt to diverse tasks. Additionally, most positional encodings are learned as general biases, lacking the specialization required for different instances within a dataset. To address this, we propose conTextualized equivariAnt Position Embedding (TAPE), a novel framework that enhances positional embeddings by incorporating sequence content across layers. TAPE introduces dynamic, context-aware positional encodings, overcoming the constraints of traditional fixed patterns. By enforcing permutation and orthogonal equivariance, TAPE ensures the stability of positional encodings during updates, improving robustness and adaptability. Our method can be easily integrated into pre-trained transformers, offering parameter-efficient fine-tuning with minimal overhead. Extensive experiments shows that TAPE achieves superior performance in language modeling, arithmetic reasoning, and long-context retrieval tasks compared to existing positional embedding techniques.', 'score': 4, 'issue_id': 1485, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'e5119d0e83ce2af2', 'authors': ['Jiajun Zhu', 'Peihao Wang', 'Ruisi Cai', 'Jason D. Lee', 'Pan Li', 'Zhangyang Wang'], 'affiliations': ['Georgia Tech', 'Princeton University', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00712.jpg', 'data': {'categories': ['#long_context', '#optimization', '#training', '#architecture', '#reasoning'], 'emoji': '🔀', 'ru': {'title': 'Динамические позиционные эмбеддинги для улучшения работы трансформеров', 'desc': 'Авторы предлагают новый метод позиционного кодирования для трансформеров под названием TAPE. Этот подход учитывает контекст последовательности и создает динамические позиционные эмбеддинги, адаптированные к конкретным задачам. TAPE обеспечивает стабильность кодирования благодаря свойствам перестановочной и ортогональной эквивариантности. Метод легко интегрируется в предобученные модели и показывает превосходные результаты в задачах языкового моделирования, арифметических рассуждений и поиска в длинных контекстах.'}, 'en': {'title': 'Enhancing Transformers with Context-Aware Positional Embeddings', 'desc': "This paper introduces a new method called conTextualized equivariAnt Position Embedding (TAPE) to improve how transformers use positional information. Traditional positional encodings often restrict the model's ability to understand long-range relationships in data. 
TAPE enhances these encodings by making them dynamic and context-aware, allowing them to adapt to different sequences and tasks. The method shows better performance in various applications, such as language modeling and reasoning, while being easy to integrate into existing transformer models."}, 'zh': {'title': '提升变换器模型的位置信息处理能力', 'desc': '本文提出了一种新的位置编码方法,称为TAPE(conTextualized equivariAnt Position Embedding),旨在提高变换器模型的预测能力。传统的位置编码方法往往限制了模型对长距离依赖关系的建模能力,而TAPE通过引入动态的、上下文感知的位置编码来克服这一问题。该方法确保了位置编码在更新过程中的稳定性,从而提高了模型的鲁棒性和适应性。实验结果表明,TAPE在语言建模、算术推理和长上下文检索任务中表现优于现有的位置编码技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05441', 'title': 'The GAN is dead; long live the GAN! A Modern GAN Baseline', 'url': 'https://huggingface.co/papers/2501.05441', 'abstract': 'There is a widely-spread claim that GANs are difficult to train, and GAN architectures in the literature are littered with empirical tricks. We provide evidence against this claim and build a modern GAN baseline in a more principled manner. First, we derive a well-behaved regularized relativistic GAN loss that addresses issues of mode dropping and non-convergence that were previously tackled via a bag of ad-hoc tricks. We analyze our loss mathematically and prove that it admits local convergence guarantees, unlike most existing relativistic losses. Second, our new loss allows us to discard all ad-hoc tricks and replace outdated backbones used in common GANs with modern architectures. Using StyleGAN2 as an example, we present a roadmap of simplification and modernization that results in a new minimalist baseline -- R3GAN. Despite being simple, our approach surpasses StyleGAN2 on FFHQ, ImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against state-of-the-art GANs and diffusion models.', 'score': 51, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'eb1cd90c4d5cb0ef', 'authors': ['Yiwen Huang', 'Aaron Gokaslan', 'Volodymyr Kuleshov', 'James Tompkin'], 'affiliations': ['Brown University', 'Cornell University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05441.jpg', 'data': {'categories': ['#training', '#architecture', '#diffusion', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Упрощение и модернизация GAN: новый взгляд на обучение генеративных моделей', 'desc': 'Исследователи опровергают распространенное мнение о сложности обучения генеративно-состязательных сетей (GAN). Они разработали новый регуляризованный релятивистский GAN-лосс, который решает проблемы потери мод и отсутствия сходимости. Авторы математически доказывают, что их лосс обеспечивает локальную сходимость, в отличие от большинства существующих релятивистских лоссов. На основе этого подхода они создали минималистичную базовую модель R3GAN, которая превосходит StyleGAN2 и другие современные GAN на нескольких наборах данных.'}, 'en': {'title': 'Simplifying GAN Training with R3GAN: A New Era of Efficiency', 'desc': 'This paper challenges the common belief that Generative Adversarial Networks (GANs) are inherently difficult to train. It introduces a new GAN loss function called the regularized relativistic GAN loss, which effectively addresses issues like mode dropping and non-convergence without relying on numerous empirical tricks. The authors provide mathematical analysis showing that their loss function guarantees local convergence, which is a significant improvement over existing methods. 
By applying this new loss to modern architectures like StyleGAN2, they create a simplified and efficient GAN model named R3GAN, which outperforms previous models on several benchmark datasets.'}, 'zh': {'title': '简化GAN训练,超越传统架构', 'desc': '这篇论文探讨了生成对抗网络(GAN)训练的难点,并提出了一种新的方法来简化这一过程。作者提出了一种正则化的相对GAN损失函数,解决了模式丢失和非收敛的问题。通过数学分析,证明了这种损失函数具有局部收敛的保证,优于现有的相对损失函数。最终,作者展示了一个新的简约基线R3GAN,其在多个数据集上的表现超过了StyleGAN2,并与最先进的GAN和扩散模型相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.05032', 'title': 'Enhancing Human-Like Responses in Large Language Models', 'url': 'https://huggingface.co/papers/2501.05032', 'abstract': 'This paper explores the advancements in making large language models (LLMs) more human-like. We focus on techniques that enhance natural language understanding, conversational coherence, and emotional intelligence in AI systems. The study evaluates various approaches, including fine-tuning with diverse datasets, incorporating psychological principles, and designing models that better mimic human reasoning patterns. Our findings demonstrate that these enhancements not only improve user interactions but also open new possibilities for AI applications across different domains. Future work will address the ethical implications and potential biases introduced by these human-like attributes.', 'score': 28, 'issue_id': 1609, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '64e14687fd1e5dab', 'authors': ['Ethem Yağız Çalık', 'Talha Rüzgar Akkuş'], 'affiliations': ['Hugging Face'], 'pdf_title_img': 'assets/pdf/title_img/2501.05032.jpg', 'data': {'categories': ['#training', '#alignment', '#rlhf', '#ethics', '#multimodal'], 'emoji': '🤖', 'ru': {'title': 'Путь к человекоподобному ИИ: улучшение больших языковых моделей', 'desc': 'Статья исследует методы повышения человекоподобности больших языковых моделей (LLM). Авторы рассматривают техники улучшения понимания естественного языка, связности диалогов и эмоционального интеллекта в системах искусственного интеллекта. Исследование оценивает различные подходы, включая дообучение на разнообразных датасетах, внедрение психологических принципов и разработку моделей, лучше имитирующих человеческие паттерны мышления. Результаты показывают, что эти улучшения не только совершенствуют взаимодействие с пользователем, но и открывают новые возможности для применения ИИ в различных областях.'}, 'en': {'title': 'Enhancing AI: Making Language Models More Human-Like', 'desc': 'This paper investigates how to make large language models (LLMs) behave more like humans. It emphasizes improving natural language understanding, making conversations more coherent, and increasing emotional intelligence in AI. The research assesses methods such as fine-tuning models with varied datasets and applying psychological principles to enhance human-like reasoning. The results show that these improvements lead to better user experiences and expand the potential uses of AI, while also highlighting the need to consider ethical issues and biases that may arise.'}, 'zh': {'title': '让人工智能更像人类的未来', 'desc': '本文探讨了使大型语言模型(LLMs)更具人性化的进展。我们重点关注增强自然语言理解、对话连贯性和情感智能的技术。研究评估了多种方法,包括使用多样化数据集进行微调、融入心理学原理,以及设计更好模拟人类推理模式的模型。我们的发现表明,这些增强不仅改善了用户互动,还为不同领域的人工智能应用开辟了新可能。'}}}, {'id': 'https://huggingface.co/papers/2501.05453', 'title': 'An Empirical Study of Autoregressive Pre-training from Videos', 'url': 'https://huggingface.co/papers/2501.05453', 'abstract': 'We empirically study autoregressive pre-training from videos. 
To perform our study, we construct a series of autoregressive video models, called Toto. We treat videos as sequences of visual tokens and train transformer models to autoregressively predict future tokens. Our models are pre-trained on a diverse dataset of videos and images comprising over 1 trillion visual tokens. We explore different architectural, training, and inference design choices. We evaluate the learned visual representations on a range of downstream tasks including image recognition, video classification, object tracking, and robotics. Our results demonstrate that, despite minimal inductive biases, autoregressive pre-training leads to competitive performance across all benchmarks. Finally, we find that scaling our video models results in similar scaling curves to those seen in language models, albeit with a different rate. More details at https://brjathu.github.io/toto/', 'score': 28, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '3846ea8507d046be', 'authors': ['Jathushan Rajasegaran', 'Ilija Radosavovic', 'Rahul Ravishankar', 'Yossi Gandelsman', 'Christoph Feichtenhofer', 'Jitendra Malik'], 'affiliations': ['Meta FAIR', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.05453.jpg', 'data': {'categories': ['#training', '#dataset', '#benchmark', '#architecture', '#robotics', '#video', '#cv'], 'emoji': '🎬', 'ru': {'title': 'Авторегрессионное предобучение видео: путь к универсальному компьютерному зрению', 'desc': 'В статье исследуется авторегрессионное предобучение на видеоданных с использованием модели Toto. Авторы рассматривают видео как последовательности визуальных токенов и обучают трансформеры предсказывать будущие токены. Модели предобучаются на разнообразном наборе данных из более чем триллиона визуальных токенов. Результаты показывают, что такой подход дает конкурентоспособную производительность на различных задачах компьютерного зрения.'}, 'en': {'title': 'Unlocking Video Understanding with Autoregressive Models', 'desc': 'This paper investigates the use of autoregressive pre-training for video data through a series of models named Toto. The authors treat videos as sequences of visual tokens and employ transformer architectures to predict future tokens in these sequences. They pre-train their models on a massive dataset containing over 1 trillion visual tokens, exploring various design choices in architecture and training. The results show that these autoregressive models achieve strong performance on tasks like image recognition and video classification, indicating that scaling video models can yield similar benefits as seen in language models.'}, 'zh': {'title': '自回归预训练:视频模型的新突破', 'desc': '本文研究了视频的自回归预训练。我们构建了一系列名为Toto的自回归视频模型,将视频视为视觉标记的序列,并训练变换器模型以自回归方式预测未来的标记。我们的模型在一个包含超过1万亿视觉标记的多样化视频和图像数据集上进行预训练,并在多个下游任务上评估学习到的视觉表示。结果表明,尽管诱导偏差较小,自回归预训练在所有基准测试中表现出竞争力的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04003', 'title': 'Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives', 'url': 'https://huggingface.co/papers/2501.04003', 'abstract': "Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. 
To address this gap, we introduce DriveBench, a benchmark dataset designed to evaluate VLM reliability across 17 settings (clean, corrupted, and text-only inputs), encompassing 19,200 frames, 20,498 question-answer pairs, three question types, four mainstream driving tasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often generate plausible responses derived from general knowledge or textual cues rather than true visual grounding, especially under degraded or missing visual inputs. This behavior, concealed by dataset imbalances and insufficient evaluation metrics, poses significant risks in safety-critical scenarios like autonomous driving. We further observe that VLMs struggle with multi-modal reasoning and display heightened sensitivity to input corruptions, leading to inconsistencies in performance. To address these challenges, we propose refined evaluation metrics that prioritize robust visual grounding and multi-modal understanding. Additionally, we highlight the potential of leveraging VLMs' awareness of corruptions to enhance their reliability, offering a roadmap for developing more trustworthy and interpretable decision-making systems in real-world autonomous driving contexts. The benchmark toolkit is publicly accessible.", 'score': 20, 'issue_id': 1599, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '720b493a608f478a', 'authors': ['Shaoyuan Xie', 'Lingdong Kong', 'Yuhao Dong', 'Chonghao Sima', 'Wenwei Zhang', 'Qi Alfred Chen', 'Ziwei Liu', 'Liang Pan'], 'affiliations': ['National University of Singapore', 'S-Lab, Nanyang Technological University', 'Shanghai AI Laboratory', 'The University of Hong Kong', 'University of California, Irvine'], 'pdf_title_img': 'assets/pdf/title_img/2501.04003.jpg', 'data': {'categories': ['#security', '#interpretability', '#dataset', '#multimodal', '#reasoning', '#benchmark', '#cv'], 'emoji': '🚗', 'ru': {'title': 'Проверка надёжности VLM для безопасного автономного вождения', 'desc': 'Статья представляет DriveBench - набор данных для оценки надёжности мультимодальных языковых моделей (VLM) в контексте автономного вождения. Исследование выявило, что VLM часто генерируют правдоподобные ответы на основе общих знаний, а не визуальной информации, что опасно в критически важных сценариях. Авторы предлагают усовершенствованные метрики оценки, ориентированные на надёжную визуальную привязку и мультимодальное понимание. Также отмечается потенциал использования осведомленности VLM о искажениях для повышения их надёжности.'}, 'en': {'title': 'Enhancing Trust in Vision-Language Models for Safer Autonomous Driving', 'desc': 'This paper discusses the limitations of Vision-Language Models (VLMs) in the context of autonomous driving, particularly their ability to provide reliable and interpretable driving decisions. The authors introduce DriveBench, a comprehensive benchmark dataset that tests VLM performance across various conditions, including clean and corrupted inputs. Their research shows that VLMs often rely on general knowledge rather than true visual understanding, especially when visual data is compromised. 
To improve VLM reliability, the paper suggests new evaluation metrics focused on visual grounding and multi-modal reasoning, aiming to enhance the safety of autonomous driving systems.'}, 'zh': {'title': '提升自动驾驶决策的可靠性与可解释性', 'desc': '本文介绍了DriveBench,一个用于评估视觉语言模型(VLMs)在自动驾驶中可靠性的基准数据集。该数据集包含19200帧图像和20498个问答对,涵盖了多种驾驶任务和输入类型。研究发现,VLMs在处理受损或缺失的视觉输入时,往往依赖于一般知识而非真实的视觉信息,导致安全隐患。为了解决这些问题,本文提出了改进的评估指标,强调视觉基础和多模态理解的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.05122', 'title': 'Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.05122', 'abstract': 'Most Large Vision-Language Models (LVLMs) to date are trained predominantly on English data, which makes them struggle to understand non-English input and fail to generate output in the desired target language. Existing efforts mitigate these issues by adding multilingual training data, but do so in a largely ad-hoc manner, lacking insight into how different training mixes tip the scale for different groups of languages. In this work, we present a comprehensive investigation into the training strategies for massively multilingual LVLMs. First, we conduct a series of multi-stage experiments spanning 13 downstream vision-language tasks and 43 languages, systematically examining: (1) the number of training languages that can be included without degrading English performance and (2) optimal language distributions of pre-training as well as (3) instruction-tuning data. Further, we (4) investigate how to improve multilingual text-in-image understanding, and introduce a new benchmark for the task. Surprisingly, our analysis reveals that one can (i) include as many as 100 training languages simultaneously (ii) with as little as 25-50% of non-English data, to greatly improve multilingual performance while retaining strong English performance. We further find that (iii) including non-English OCR data in pre-training and instruction-tuning is paramount for improving multilingual text-in-image understanding. Finally, we put all our findings together and train Centurio, a 100-language LVLM, offering state-of-the-art performance in an evaluation covering 14 tasks and 56 languages.', 'score': 13, 'issue_id': 1604, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '92d74f3bbeb4a400', 'authors': ['Gregor Geigle', 'Florian Schneider', 'Carolin Holtermann', 'Chris Biemann', 'Radu Timofte', 'Anne Lauscher', 'Goran Glavaš'], 'affiliations': ['Data Science Group, University of Hamburg', 'Language Technology Group', 'WüNLP, Computer Vision Lab, CAIDAS, University of Würzburg'], 'pdf_title_img': 'assets/pdf/title_img/2501.05122.jpg', 'data': {'categories': ['#machine_translation', '#multilingual', '#benchmark', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Centurio: Прорыв в многоязычном визуально-языковом ИИ', 'desc': 'В статье описывается исследование стратегий обучения многоязычных крупномасштабных визуально-языковых моделей (LVLMs). Авторы проводят эксперименты на 13 задачах и 43 языках, изучая оптимальное распределение языков в данных для предобучения и инструктивной настройки. Они обнаруживают, что можно включить до 100 языков обучения, используя всего 25-50% неанглийских данных, значительно улучшив многоязычную производительность при сохранении высокой эффективности на английском. 
На основе полученных результатов авторы обучают Centurio - 100-язычную LVLM, демонстрирующую передовые результаты на 14 задачах и 56 языках.'}, 'en': {'title': 'Unlocking Multilingual Mastery in Vision-Language Models', 'desc': 'This paper investigates how to effectively train Large Vision-Language Models (LVLMs) on multiple languages, particularly focusing on improving their performance in non-English languages. The authors conduct experiments across various tasks and languages to determine the best strategies for including multilingual data without harming English performance. They discover that including up to 100 languages and using a smaller proportion of non-English data can enhance multilingual capabilities while maintaining strong English results. Additionally, they emphasize the importance of incorporating non-English OCR data to boost understanding of text within images, culminating in the development of Centurio, a 100-language LVLM with state-of-the-art performance.'}, 'zh': {'title': '提升多语言理解,Centurio引领新潮流', 'desc': '本文研究了大规模多语言视觉-语言模型(LVLM)的训练策略,特别关注如何提高模型对非英语输入的理解和输出能力。我们通过多阶段实验,分析了包含多种语言的训练数据对英语性能的影响,并探索了最佳的语言分布策略。研究发现,最多可以同时包含100种语言的训练数据,并且只需25-50%的非英语数据即可显著提升多语言性能。最后,我们结合所有发现,训练了Centurio,一个支持100种语言的LVLM,在14个任务和56种语言的评估中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.03489', 'title': 'Entropy-Guided Attention for Private LLMs', 'url': 'https://huggingface.co/papers/2501.03489', 'abstract': "The pervasiveness of proprietary language models has raised critical privacy concerns, necessitating advancements in private inference (PI), where computations are performed directly on encrypted data without revealing users' sensitive information. While PI offers a promising solution, its practical deployment is hindered by substantial communication and latency overheads, primarily stemming from nonlinear operations. To address this, we introduce an information-theoretic framework to characterize the role of nonlinearities in decoder-only language models, laying a principled foundation for optimizing transformer-architectures tailored to the demands of PI. By leveraging Shannon's entropy as a quantitative measure, we uncover the previously unexplored dual significance of nonlinearities: beyond ensuring training stability, they are crucial for maintaining attention head diversity. Specifically, we find that their removal triggers two critical failure modes: entropy collapse in deeper layers that destabilizes training, and entropic overload in earlier layers that leads to under-utilization of Multi-Head Attention's (MHA) representational capacity. We propose an entropy-guided attention mechanism paired with a novel entropy regularization technique to mitigate entropic overload. Additionally, we explore PI-friendly alternatives to layer normalization for preventing entropy collapse and stabilizing the training of LLMs with reduced-nonlinearities. Our study bridges the gap between information theory and architectural design, establishing entropy dynamics as a principled guide for developing efficient PI architectures. 
The code and implementation are available at https://github.com/Nandan91/entropy-guided-attention-llm.", 'score': 11, 'issue_id': 1597, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '18abcfb3fe1b209b', 'authors': ['Nandan Kumar Jha', 'Brandon Reagen'], 'affiliations': ['New York University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03489.jpg', 'data': {'categories': ['#security', '#inference', '#optimization', '#architecture', '#training', '#open_source'], 'emoji': '🔐', 'ru': {'title': 'Энтропия как ключ к конфиденциальным языковым моделям', 'desc': 'Статья рассматривает проблему конфиденциальности при использовании языковых моделей и предлагает решение через частное вычисление (PI). Авторы представляют информационно-теоретическую основу для оптимизации архитектур трансформеров под задачи PI, используя энтропию Шеннона как количественную меру. Исследование выявляет двойную роль нелинейностей в моделях: обеспечение стабильности обучения и поддержание разнообразия в механизме внимания. Предложен энтропийно-управляемый механизм внимания и новая техника регуляризации энтропии для улучшения эффективности PI-архитектур.'}, 'en': {'title': 'Optimizing Language Models for Privacy with Entropy Dynamics', 'desc': 'This paper addresses privacy concerns related to proprietary language models by focusing on private inference (PI), which allows computations on encrypted data. The authors introduce an information-theoretic framework to analyze the impact of nonlinearities in decoder-only language models, which are essential for optimizing transformer architectures for PI. They identify two critical issues caused by the removal of nonlinearities: entropy collapse in deeper layers and entropic overload in earlier layers, both of which affect training stability and attention mechanisms. To resolve these issues, the paper proposes an entropy-guided attention mechanism and explores alternatives to layer normalization, aiming to enhance the efficiency of PI architectures while maintaining model performance.'}, 'zh': {'title': '优化私密推理架构的熵动态', 'desc': '本论文探讨了在加密数据上进行私密推理(PI)时,非线性操作对解码器语言模型的影响。我们提出了一种信息论框架,帮助优化适合PI需求的变换器架构。研究发现,非线性不仅确保了训练的稳定性,还对注意力头的多样性至关重要。为了解决熵崩溃和熵过载问题,我们提出了一种基于熵的注意力机制和新的熵正则化技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05040', 'title': 'SWE-Fixer: Training Open-Source LLMs for Effective and Efficient GitHub Issue Resolution', 'url': 'https://huggingface.co/papers/2501.05040', 'abstract': 'Large Language Models (LLMs) have demonstrated remarkable proficiency across a variety of complex tasks. One significant application of LLMs is in tackling software engineering challenges, particularly in resolving real-world tasks on GitHub by fixing code based on the issues reported by the users. However, many current approaches rely on proprietary LLMs, which limits reproducibility, accessibility, and transparency. The critical components of LLMs for addressing software engineering issues and how their capabilities can be effectively enhanced remain unclear. To address these challenges, we introduce SWE-Fixer, a novel open-source LLM designed to effectively and efficiently resolve GitHub issues. SWE-Fixer comprises two essential modules: a code file retrieval module and a code editing module. The retrieval module employs BM25 along with a lightweight LLM model to achieve coarse-to-fine file retrieval. Subsequently, the code editing module utilizes the other LLM model to generate patches for the identified files. 
Then, to mitigate the lack of publicly available datasets, we compile an extensive dataset that includes 110K GitHub issues along with their corresponding patches, and train the two modules of SWE-Fixer separately. We assess our approach on the SWE-Bench Lite and Verified benchmarks, achieving state-of-the-art performance among open-source models with scores of 23.3% and 30.2%, respectively. These outcomes highlight the efficacy of our approach. We will make our model, dataset, and code publicly available at https://github.com/InternLM/SWE-Fixer.', 'score': 8, 'issue_id': 1608, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '54d8f8a0fe5436c6', 'authors': ['Chengxing Xie', 'Bowen Li', 'Chang Gao', 'He Du', 'Wai Lam', 'Difan Zou', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong', 'The University of Hong Kong', 'Xidian University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05040.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#architecture', '#benchmark', '#training', '#science'], 'emoji': '🛠️', 'ru': {'title': 'Открытая языковая модель для эффективного решения проблем на GitHub', 'desc': 'SWE-Fixer - это новая модель с открытым исходным кодом для решения проблем на GitHub. Она состоит из модуля поиска файлов кода и модуля редактирования кода, использующих легковесные языковые модели. Авторы создали обширный датасет из 110 тысяч GitHub-issues с патчами для обучения модели. SWE-Fixer достигла лучших результатов среди моделей с открытым кодом на бенчмарках SWE-Bench Lite и Verified.'}, 'en': {'title': 'SWE-Fixer: Open-Source Solutions for GitHub Issues', 'desc': 'This paper presents SWE-Fixer, an open-source Large Language Model (LLM) specifically designed to address software engineering challenges on GitHub. It features two main components: a code file retrieval module that uses BM25 and a lightweight LLM for efficient file identification, and a code editing module that generates code patches using another LLM. The authors also created a comprehensive dataset of 110,000 GitHub issues and their corresponding patches to train the model effectively. SWE-Fixer achieves state-of-the-art performance on benchmark tests, demonstrating its potential to enhance accessibility and transparency in software engineering solutions.'}, 'zh': {'title': '开源LLM助力软件工程问题解决', 'desc': '大型语言模型(LLMs)在处理复杂任务方面表现出色,尤其是在软件工程领域。本文介绍了一种新颖的开源LLM,名为SWE-Fixer,旨在有效解决GitHub上的问题。SWE-Fixer包含两个主要模块:代码文件检索模块和代码编辑模块,前者使用BM25和轻量级LLM进行文件检索,后者生成代码补丁。通过构建包含11万个GitHub问题及其补丁的数据集,SWE-Fixer在开源模型中实现了领先的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04377', 'title': 'On Computational Limits and Provably Efficient Criteria of Visual Autoregressive Models: A Fine-Grained Complexity Analysis', 'url': 'https://huggingface.co/papers/2501.04377', 'abstract': 'Recently, Visual Autoregressive (VAR) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine "next-scale prediction" paradigm. However, the state-of-the-art algorithm of VAR models in [Tian, Jiang, Yuan, Peng and Wang, NeurIPS 2024] takes O(n^4) time, which is computationally inefficient. In this work, we analyze the computational limits and efficiency criteria of VAR Models through a fine-grained complexity lens. Our key contribution is identifying the conditions under which VAR computations can achieve sub-quadratic time complexity. 
Specifically, we establish a critical threshold for the norm of input matrices used in VAR attention mechanisms. Above this threshold, assuming the Strong Exponential Time Hypothesis (SETH) from fine-grained complexity theory, a sub-quartic time algorithm for VAR models is impossible. To substantiate our theoretical findings, we present efficient constructions leveraging low-rank approximations that align with the derived criteria. This work initiates the study of the computational efficiency of the VAR model from a theoretical perspective. Our technique will shed light on advancing scalable and efficient image generation in VAR frameworks.', 'score': 8, 'issue_id': 1597, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'be8a0f20db676680', 'authors': ['Yekun Ke', 'Xiaoyu Li', 'Yingyu Liang', 'Zhizhou Sha', 'Zhenmei Shi', 'Zhao Song'], 'affiliations': ['The Simons Institute for the Theory of Computing at UC Berkeley', 'The University of Hong Kong', 'Tsinghua University', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.04377.jpg', 'data': {'categories': ['#math', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Преодоление вычислительных барьеров в VAR моделях', 'desc': 'Статья исследует вычислительные ограничения и критерии эффективности Визуальных Авторегрессионных (VAR) моделей с точки зрения тонкой теории сложности. Авторы определяют условия, при которых вычисления VAR могут достичь субквадратичной временной сложности. Они устанавливают критический порог для нормы входных матриц, используемых в механизмах внимания VAR, выше которого невозможен субкварцевый алгоритм времени для моделей VAR. Представлены эффективные конструкции, использующие аппроксимации низкого ранга, которые соответствуют выведенным критериям.'}, 'en': {'title': 'Unlocking Efficiency in Image Generation with VAR Models', 'desc': 'This paper explores the computational efficiency of Visual Autoregressive (VAR) Models, which are used for generating images. The authors identify that the current state-of-the-art VAR algorithm is computationally expensive, operating in O(n^4) time complexity. They establish conditions under which VAR computations can be optimized to achieve sub-quadratic time complexity, particularly focusing on the input matrix norms in the attention mechanisms. By applying low-rank approximations, the authors provide practical constructions that meet their theoretical criteria, paving the way for more efficient image generation techniques in VAR frameworks.'}, 'zh': {'title': '提升VAR模型的计算效率', 'desc': '最近,视觉自回归(VAR)模型在图像生成领域取得了突破性进展,采用粗到细的“下一个尺度预测”范式。然而,VAR模型的最新算法在计算上效率低下,时间复杂度为O(n^4)。本文通过细粒度复杂性分析,探讨了VAR模型的计算限制和效率标准。我们确定了VAR计算可以实现亚二次时间复杂度的条件,并提出了利用低秩近似的高效构造,以支持我们的理论发现。'}}}, {'id': 'https://huggingface.co/papers/2501.04828', 'title': 'Building Foundations for Natural Language Processing of Historical Turkish: Resources and Models', 'url': 'https://huggingface.co/papers/2501.04828', 'abstract': 'This paper introduces foundational resources and models for natural language processing (NLP) of historical Turkish, a domain that has remained underexplored in computational linguistics. We present the first named entity recognition (NER) dataset, HisTR and the first Universal Dependencies treebank, OTA-BOUN for a historical form of the Turkish language along with transformer-based models trained using these datasets for named entity recognition, dependency parsing, and part-of-speech tagging tasks. 
Additionally, we introduce Ottoman Text Corpus (OTC), a clean corpus of transliterated historical Turkish texts that spans a wide range of historical periods. Our experimental results show significant improvements in the computational analysis of historical Turkish, achieving promising results in tasks that require understanding of historical linguistic structures. They also highlight existing challenges, such as domain adaptation and language variations across time periods. All of the presented resources and models are made available at https://huggingface.co/bucolin to serve as a benchmark for future progress in historical Turkish NLP.', 'score': 6, 'issue_id': 1603, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '40fe69c40d907fc4', 'authors': ['Şaziye Betül Özateş', 'Tarık Emre Tıraş', 'Ece Elif Adak', 'Berat Doğan', 'Fatih Burak Karagöz', 'Efe Eren Genç', 'Esma F. Bilgin Taşdemir'], 'affiliations': ['Bogaziçi University', 'Medeniyet University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04828.jpg', 'data': {'categories': ['#dataset', '#data', '#low_resource', '#science', '#multilingual', '#benchmark'], 'emoji': '🏛️', 'ru': {'title': 'Прорыв в NLP для исторического турецкого языка', 'desc': 'Статья представляет первые ресурсы и модели для обработки естественного языка (NLP) исторического турецкого языка. Авторы создали первый датасет для распознавания именованных сущностей (NER) HisTR и первый Universal Dependencies тривбанк OTA-BOUN для исторической формы турецкого языка. Также были разработаны трансформерные модели для задач NER, синтаксического анализа и морфологической разметки. Дополнительно представлен Османский текстовый корпус (OTC) - очищенный корпус транслитерированных исторических турецких текстов разных периодов.'}, 'en': {'title': 'Unlocking Historical Turkish: New Resources for NLP', 'desc': 'This paper provides essential resources and models for processing historical Turkish language using natural language processing (NLP) techniques. It introduces the first named entity recognition (NER) dataset, HisTR, and the first Universal Dependencies treebank, OTA-BOUN, specifically for historical Turkish. The authors also present the Ottoman Text Corpus (OTC), a comprehensive collection of transliterated texts from various historical periods. The results demonstrate advancements in analyzing historical Turkish, while also addressing challenges like domain adaptation and linguistic variations over time.'}, 'zh': {'title': '推动历史土耳其语NLP的进步', 'desc': '本文介绍了历史土耳其语自然语言处理(NLP)的基础资源和模型,这是一个在计算语言学中尚未深入研究的领域。我们首次发布了命名实体识别(NER)数据集HisTR和历史土耳其语的Universal Dependencies树库OTA-BOUN,并基于这些数据集训练了用于命名实体识别、依存句法分析和词性标注任务的变换器模型。此外,我们还推出了奥斯曼文本语料库(OTC),这是一个涵盖多个历史时期的清晰转写历史土耳其语文本的语料库。实验结果显示,在历史土耳其语的计算分析中取得了显著进展,但也突显了领域适应和语言随时间变化等挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.11425', 'title': 'Agent-R: Training Language Model Agents to Reflect via Iterative Self-Training', 'url': 'https://huggingface.co/papers/2501.11425', 'abstract': "Large Language Models (LLMs) agents are increasingly pivotal for addressing complex tasks in interactive environments. Existing work mainly focuses on enhancing performance through behavior cloning from stronger experts, yet such approaches often falter in real-world applications, mainly due to the inability to recover from errors. However, step-level critique data is difficult and expensive to collect. 
Automating and dynamically constructing self-critique datasets is thus crucial to empowering models with intelligent agent capabilities. In this work, we propose an iterative self-training framework, Agent-R, that enables language Agent to Reflect on the fly. Unlike traditional methods that reward or penalize actions based on correctness, Agent-R leverages MCTS to construct training data that recover correct trajectories from erroneous ones. A key challenge of agent reflection lies in the necessity for timely revision rather than waiting until the end of a rollout. To address this, we introduce a model-guided critique construction mechanism: the actor model identifies the first error step (within its current capability) in a failed trajectory. Starting from it, we splice it with the adjacent correct path, which shares the same parent node in the tree. This strategy enables the model to learn reflection based on its current policy, therefore yielding better learning efficiency. To further explore the scalability of this self-improvement paradigm, we investigate iterative refinement of both error correction capabilities and dataset construction. Our findings demonstrate that Agent-R continuously improves the model's ability to recover from errors and enables timely error correction. Experiments on three interactive environments show that Agent-R effectively equips agents to correct erroneous actions while avoiding loops, achieving superior performance compared to baseline methods (+5.59%).", 'score': 54, 'issue_id': 1798, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': '96d073b4606b0493', 'authors': ['Siyu Yuan', 'Zehui Chen', 'Zhiheng Xi', 'Junjie Ye', 'Zhengyin Du', 'Jiecao Chen'], 'affiliations': ['ByteDance', 'Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11425.jpg', 'data': {'categories': ['#reasoning', '#optimization', '#agents', '#training', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Самообучающиеся ИИ-агенты: исправление ошибок на лету', 'desc': 'Статья представляет новый метод обучения языковых агентов на основе искусственного интеллекта под названием Agent-R. Этот подход использует самообучение и самокритику для улучшения способности модели исправлять ошибки в процессе выполнения задач. Agent-R применяет метод Монте-Карло для построения дерева поиска (MCTS) для создания обучающих данных, которые помогают агенту восстанавливаться после ошибочных действий. Эксперименты показывают, что Agent-R значительно повышает производительность агентов в интерактивных средах по сравнению с базовыми методами.'}, 'en': {'title': 'Empowering Language Agents with Real-Time Self-Critique', 'desc': "This paper introduces Agent-R, an iterative self-training framework designed to enhance the performance of Large Language Models (LLMs) in interactive environments. Unlike traditional methods that rely on static feedback, Agent-R utilizes Monte Carlo Tree Search (MCTS) to dynamically create training data that helps models recover from mistakes in real-time. The framework focuses on timely error correction by identifying the first error in a trajectory and splicing it with a correct path, allowing the model to learn from its current policy. 
Experimental results show that Agent-R significantly improves the model's error recovery capabilities and overall performance, outperforming baseline methods by 5.59%."}, 'zh': {'title': 'Agent-R:实时反思,提升学习效率', 'desc': '大型语言模型(LLMs)在复杂任务的交互环境中变得越来越重要。现有研究主要通过模仿更强专家的行为来提升性能,但这种方法在实际应用中常常失败,主要是因为无法从错误中恢复。为了解决这个问题,我们提出了一种迭代自我训练框架Agent-R,使语言代理能够实时反思。Agent-R通过构建训练数据来纠正错误轨迹,从而提高模型的学习效率和错误恢复能力。'}}}, {'id': 'https://huggingface.co/papers/2501.11873', 'title': 'Demons in the Detail: On Implementing Load Balancing Loss for Training Specialized Mixture-of-Expert Models', 'url': 'https://huggingface.co/papers/2501.11873', 'abstract': 'This paper revisits the implementation of Load-balancing Loss (LBL) when training Mixture-of-Experts (MoEs) models. Specifically, LBL for MoEs is defined as N_E sum_{i=1}^{N_E} f_i p_i, where N_E is the total number of experts, f_i represents the frequency of expert i being selected, and p_i denotes the average gating score of the expert i. Existing MoE training frameworks usually employ the parallel training strategy so that f_i and the LBL are calculated within a micro-batch and then averaged across parallel groups. In essence, a micro-batch for training billion-scale LLMs normally contains very few sequences. So, the micro-batch LBL is almost at the sequence level, and the router is pushed to distribute the token evenly within each sequence. Under this strict constraint, even tokens from a domain-specific sequence (e.g., code) are uniformly routed to all experts, thereby inhibiting expert specialization. In this work, we propose calculating LBL using a global-batch to loose this constraint. Because a global-batch contains much more diverse sequences than a micro-batch, which will encourage load balance at the corpus level. Specifically, we introduce an extra communication step to synchronize f_i across micro-batches and then use it to calculate the LBL. Through experiments on training MoEs-based LLMs (up to 42.8B total parameters and 400B tokens), we surprisingly find that the global-batch LBL strategy yields excellent performance gains in both pre-training perplexity and downstream tasks. Our analysis reveals that the global-batch LBL also greatly improves the domain specialization of MoE experts.', 'score': 48, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '370d057fec504963', 'authors': ['Zihan Qiu', 'Zeyu Huang', 'Bo Zheng', 'Kaiyue Wen', 'Zekun Wang', 'Rui Men', 'Ivan Titov', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group', 'Stanford University', 'University of Edinburgh'], 'pdf_title_img': 'assets/pdf/title_img/2501.11873.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training'], 'emoji': '⚖️', 'ru': {'title': 'Глобальный подход к балансировке нагрузки экспертов в MoE моделях', 'desc': 'Статья предлагает новый подход к реализации функции потерь балансировки нагрузки (LBL) при обучении моделей Mixture-of-Experts (MoE). Авторы предлагают вычислять LBL на уровне глобального батча, а не микро-батча, что позволяет ослабить ограничения на распределение токенов между экспертами. Эксперименты на крупномасштабных языковых моделях показывают, что этот метод улучшает перплексию при предобучении и результаты на задачах downstream. 
Анализ также демонстрирует улучшение специализации экспертов по доменам.'}, 'en': {'title': 'Enhancing Expert Specialization with Global-Batch Load-Balancing', 'desc': 'This paper focuses on improving the Load-balancing Loss (LBL) in training Mixture-of-Experts (MoEs) models. The authors highlight that traditional methods use micro-batches, which limit the diversity of sequences and hinder expert specialization. They propose a new approach that utilizes global-batches, allowing for a broader range of sequences and better load balancing across the entire dataset. Experimental results show that this global-batch LBL method significantly enhances model performance and expert specialization in large language models.'}, 'zh': {'title': '全局批次提升混合专家模型的负载均衡与专业化', 'desc': '本文重新审视了在训练混合专家模型(MoEs)时的负载均衡损失(LBL)实现。我们提出使用全局批次来计算LBL,以打破微批次的严格约束,从而在语料库层面上促进负载均衡。通过在训练中引入额外的通信步骤来同步专家选择频率,实验结果显示全局批次LBL策略在预训练困惑度和下游任务中均显著提升了性能。我们的分析表明,全局批次LBL还大大改善了MoE专家的领域专业化。'}}}, {'id': 'https://huggingface.co/papers/2501.12380', 'title': 'MMVU: Measuring Expert-Level Multi-Discipline Video Understanding', 'url': 'https://huggingface.co/papers/2501.12380', 'abstract': 'We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark for evaluating foundation models in video understanding. MMVU includes 3,000 expert-annotated questions spanning 27 subjects across four core disciplines: Science, Healthcare, Humanities & Social Sciences, and Engineering. Compared to prior benchmarks, MMVU features three key advancements. First, it challenges models to apply domain-specific knowledge and perform expert-level reasoning to analyze specialized-domain videos, moving beyond the basic visual perception typically assessed in current video benchmarks. Second, each example is annotated by human experts from scratch. We implement strict data quality controls to ensure the high quality of the dataset. Finally, each example is enriched with expert-annotated reasoning rationals and relevant domain knowledge, facilitating in-depth analysis. We conduct an extensive evaluation of 32 frontier multimodal foundation models on MMVU. The latest System-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest performance among the tested models. However, they still fall short of matching human expertise. Through in-depth error analyses and case studies, we offer actionable insights for future advancements in expert-level, knowledge-intensive video understanding for specialized domains.', 'score': 48, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'dcb04aaca349cc32', 'authors': ['Yilun Zhao', 'Lujing Xie', 'Haowei Zhang', 'Guo Gan', 'Yitao Long', 'Zhiyuan Hu', 'Tongyan Hu', 'Weiyuan Chen', 'Chuhan Li', 'Junyang Song', 'Zhijian Xu', 'Chengye Wang', 'Weifeng Pan', 'Ziyao Shangguan', 'Xiangru Tang', 'Zhenwen Liang', 'Yixin Liu', 'Chen Zhao', 'Arman Cohan'], 'affiliations': ['Yale NLP'], 'pdf_title_img': 'assets/pdf/title_img/2501.12380.jpg', 'data': {'categories': ['#multimodal', '#science', '#benchmark', '#video', '#healthcare', '#reasoning'], 'emoji': '🎓', 'ru': {'title': 'Новый рубеж в понимании видео: от базового восприятия к экспертному анализу', 'desc': 'Статья представляет MMVU - многодисциплинарный экспертный бенчмарк для оценки фундаментальных моделей в понимании видео. MMVU включает 3000 вопросов по 27 предметам в четырех основных дисциплинах, требующих применения специализированных знаний и экспертного анализа. 
Бенчмарк отличается высоким качеством данных, аннотированных экспертами, и включает обоснования и релевантные знания для каждого примера. Оценка 32 мультимодальных моделей на MMVU показала, что даже лучшие модели пока не достигают уровня человека-эксперта в этой задаче.'}, 'en': {'title': 'MMVU: Elevating Video Understanding to Expert Levels', 'desc': 'The paper presents MMVU, a new benchmark designed to evaluate foundation models specifically in video understanding across various expert domains. It includes 3,000 questions that require advanced reasoning and domain-specific knowledge, moving beyond simple visual recognition tasks. Each question is meticulously annotated by human experts, ensuring high data quality and providing reasoning rationales to enhance analysis. The evaluation of 32 advanced multimodal models reveals that while some perform well, they still do not reach the level of human expertise, highlighting areas for future improvement in this field.'}, 'zh': {'title': 'MMVU:视频理解的新标准', 'desc': '我们介绍了MMVU,这是一个全面的专家级多学科基准,用于评估基础模型在视频理解方面的表现。MMVU包含3000个专家注释的问题,涵盖科学、医疗、人文学科与社会科学和工程四个核心学科。与之前的基准相比,MMVU在三个关键方面有所改进,包括要求模型应用领域特定知识进行专家级推理,确保数据集的高质量,以及为每个示例提供专家注释的推理依据和相关领域知识。我们对32个前沿多模态基础模型在MMVU上的表现进行了广泛评估,发现最新的系统2能力模型o1和Gemini 2.0 Flash Thinking在测试模型中表现最佳,但仍未能达到人类专家的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.12224', 'title': 'TokenVerse: Versatile Multi-concept Personalization in Token Modulation Space', 'url': 'https://huggingface.co/papers/2501.12224', 'abstract': "We present TokenVerse -- a method for multi-concept personalization, leveraging a pre-trained text-to-image diffusion model. Our framework can disentangle complex visual elements and attributes from as little as a single image, while enabling seamless plug-and-play generation of combinations of concepts extracted from multiple images. As opposed to existing works, TokenVerse can handle multiple images with multiple concepts each, and supports a wide-range of concepts, including objects, accessories, materials, pose, and lighting. Our work exploits a DiT-based text-to-image model, in which the input text affects the generation through both attention and modulation (shift and scale). We observe that the modulation space is semantic and enables localized control over complex concepts. Building on this insight, we devise an optimization-based framework that takes as input an image and a text description, and finds for each word a distinct direction in the modulation space. These directions can then be used to generate new images that combine the learned concepts in a desired configuration. We demonstrate the effectiveness of TokenVerse in challenging personalization settings, and showcase its advantages over existing methods. 
project's webpage in https://token-verse.github.io/", 'score': 31, 'issue_id': 1804, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '20dcd865e2d7bc5c', 'authors': ['Daniel Garibi', 'Shahar Yadin', 'Roni Paiss', 'Omer Tov', 'Shiran Zada', 'Ariel Ephrat', 'Tomer Michaeli', 'Inbar Mosseri', 'Tali Dekel'], 'affiliations': ['Google DeepMind', 'Technion', 'Tel Aviv University', 'Weizmann Institute'], 'pdf_title_img': 'assets/pdf/title_img/2501.12224.jpg', 'data': {'categories': ['#multimodal', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'Персонализация изображений с помощью семантического пространства модуляции', 'desc': 'TokenVerse - это метод многоконцептуальной персонализации, использующий предобученную модель диффузии текста в изображение. Он позволяет выделять сложные визуальные элементы и атрибуты даже из одного изображения, обеспечивая при этом возможность комбинировать концепты из нескольких изображений. TokenVerse использует модель DiT, где входной текст влияет на генерацию через внимание и модуляцию. Метод оптимизирует направления в пространстве модуляции для каждого слова, что позволяет генерировать новые изображения с желаемой комбинацией выученных концептов.'}, 'en': {'title': 'TokenVerse: Mastering Multi-Concept Image Personalization', 'desc': 'TokenVerse is a novel approach for personalizing images by using a pre-trained text-to-image diffusion model. It can separate and manipulate various visual elements from just one image, allowing for the creation of new images that combine concepts from multiple sources. Unlike previous methods, TokenVerse effectively manages multiple images with different concepts, covering a wide array of attributes such as objects, poses, and lighting. The framework utilizes a DiT-based model that enables precise control over image generation through semantic modulation, making it a powerful tool for complex personalization tasks.'}, 'zh': {'title': 'TokenVerse:多概念个性化的新方法', 'desc': 'TokenVerse是一种多概念个性化的方法,利用预训练的文本到图像扩散模型。该框架能够从单张图像中解耦复杂的视觉元素和属性,并支持从多张图像中提取概念的无缝组合生成。与现有方法不同,TokenVerse可以处理每张图像中包含多个概念的情况,并支持广泛的概念类型,包括物体、配件、材料、姿势和光照。我们的研究利用基于DiT的文本到图像模型,通过注意力和调制(偏移和缩放)来影响生成过程,从而实现对复杂概念的局部控制。'}}}, {'id': 'https://huggingface.co/papers/2501.12326', 'title': 'UI-TARS: Pioneering Automated GUI Interaction with Native Agents', 'url': 'https://huggingface.co/papers/2501.12326', 'abstract': 'This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks. Experiments demonstrate its superior performance: UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating perception, grounding, and GUI task execution. Notably, in the OSWorld benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, UI-TARS achieves 46.6, surpassing GPT-4o (34.5). 
UI-TARS incorporates several key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of GUI screenshots for context-aware understanding of UI elements and precise captioning; (2) Unified Action Modeling, which standardizes actions into a unified space across platforms and achieves precise grounding and interaction through large-scale action traces; (3) System-2 Reasoning, which incorporates deliberate reasoning into multi-step decision making, involving multiple reasoning patterns such as task decomposition, reflection thinking, milestone recognition, etc. (4) Iterative Training with Reflective Online Traces, which addresses the data bottleneck by automatically collecting, filtering, and reflectively refining new interaction traces on hundreds of virtual machines. Through iterative training and reflection tuning, UI-TARS continuously learns from its mistakes and adapts to unforeseen situations with minimal human intervention. We also analyze the evolution path of GUI agents to guide the further development of this domain.', 'score': 27, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '1f98d8f49b073983', 'authors': ['Yujia Qin', 'Yining Ye', 'Junjie Fang', 'Haoming Wang', 'Shihao Liang', 'Shizuo Tian', 'Junda Zhang', 'Jiahao Li', 'Yunxin Li', 'Shijue Huang', 'Wanjun Zhong', 'Kuanye Li', 'Jiale Yang', 'Yu Miao', 'Woyu Lin', 'Longxiang Liu', 'Xu Jiang', 'Qianli Ma', 'Jingyu Li', 'Xiaojun Xiao', 'Kai Cai', 'Chuang Li', 'Yaowei Zheng', 'Chaolin Jin', 'Chen Li', 'Xiao Zhou', 'Minchao Wang', 'Haoli Chen', 'Zhaojian Li', 'Haihua Yang', 'Haifeng Liu', 'Feng Lin', 'Tao Peng', 'Xin Liu', 'Guang Shi'], 'affiliations': ['ByteDance Seed', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12326.jpg', 'data': {'categories': ['#optimization', '#dataset', '#agents', '#training', '#reasoning'], 'emoji': '🖥️', 'ru': {'title': 'UI-TARS: Революция в мире GUI-агентов', 'desc': 'Статья представляет UI-TARS - модель графического агента, которая воспринимает только скриншоты и выполняет операции, подобные человеческим. UI-TARS превосходит существующие фреймворки агентов, достигая лучших результатов в более чем 10 бенчмарках для GUI-агентов. Модель включает в себя несколько ключевых инноваций: улучшенное восприятие, унифицированное моделирование действий, рассуждение по системе-2 и итеративное обучение с рефлексивными онлайн-трассами. UI-TARS постоянно учится на своих ошибках и адаптируется к непредвиденным ситуациям с минимальным вмешательством человека.'}, 'en': {'title': 'Revolutionizing GUI Interaction with UI-TARS: The End-to-End Agent Model', 'desc': 'UI-TARS is a novel GUI agent model that processes screenshots to perform tasks like a human would, using keyboard and mouse actions. Unlike existing models that rely on complex commercial frameworks and pre-defined prompts, UI-TARS operates end-to-end and shows superior performance in various benchmarks. It achieves state-of-the-art results in GUI task execution by utilizing enhanced perception, unified action modeling, and system-2 reasoning for better decision-making. 
Additionally, its iterative training approach allows it to learn from past interactions, improving its adaptability with minimal human input.'}, 'zh': {'title': 'UI-TARS:革新图形用户界面代理的全新模型', 'desc': '本文介绍了UI-TARS,这是一种原生的图形用户界面(GUI)代理模型,能够仅通过屏幕截图进行人类般的交互。与依赖复杂商业模型的现有代理框架不同,UI-TARS是一个端到端的模型,在多个GUI代理基准测试中表现优异,尤其在感知、定位和任务执行方面。UI-TARS通过增强感知、统一动作建模、系统-2推理和反思在线追踪等创新,显著提高了其性能。通过迭代训练和反思调优,UI-TARS能够不断学习并适应新的情况,减少对人类干预的需求。'}}}, {'id': 'https://huggingface.co/papers/2501.12368', 'title': 'InternLM-XComposer2.5-Reward: A Simple Yet Effective Multi-Modal Reward Model', 'url': 'https://huggingface.co/papers/2501.12368', 'abstract': 'Despite the promising performance of Large Vision Language Models (LVLMs) in visual understanding, they occasionally generate incorrect outputs. While reward models (RMs) with reinforcement learning or test-time scaling offer the potential for improving generation quality, a critical gap remains: publicly available multi-modal RMs for LVLMs are scarce, and the implementation details of proprietary models are often unclear. We bridge this gap with InternLM-XComposer2.5-Reward (IXC-2.5-Reward), a simple yet effective multi-modal reward model that aligns LVLMs with human preferences. To ensure the robustness and versatility of IXC-2.5-Reward, we set up a high-quality multi-modal preference corpus spanning text, image, and video inputs across diverse domains, such as instruction following, general understanding, text-rich documents, mathematical reasoning, and video understanding. IXC-2.5-Reward achieves excellent results on the latest multi-modal reward model benchmark and shows competitive performance on text-only reward model benchmarks. We further demonstrate three key applications of IXC-2.5-Reward: (1) Providing a supervisory signal for RL training. We integrate IXC-2.5-Reward with Proximal Policy Optimization (PPO) yields IXC-2.5-Chat, which shows consistent improvements in instruction following and multi-modal open-ended dialogue; (2) Selecting the best response from candidate responses for test-time scaling; and (3) Filtering outlier or noisy samples from existing image and video instruction tuning training data. To ensure reproducibility and facilitate further research, we have open-sourced all model weights and training recipes at https://github.com/InternLM/InternLM-XComposer', 'score': 20, 'issue_id': 1804, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'd51d195276c2215d', 'authors': ['Yuhang Zang', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Cao', 'Ziyu Liu', 'Shengyuan Ding', 'Shenxi Wu', 'Yubo Ma', 'Haodong Duan', 'Wenwei Zhang', 'Kai Chen', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Fudan University', 'Nanjing University', 'Nanyang Technological University', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.12368.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#open_source', '#benchmark', '#training', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'Улучшение LVLM с помощью мультимодальной модели вознаграждения', 'desc': 'В статье представлена мультимодальная модель вознаграждения InternLM-XComposer2.5-Reward (IXC-2.5-Reward) для улучшения качества генерации больших визуально-языковых моделей (LVLM). Модель обучена на высококачественном наборе данных, охватывающем различные домены и типы входных данных. 
IXC-2.5-Reward показывает отличные результаты на бенчмарках мультимодальных и текстовых моделей вознаграждения. Авторы демонстрируют три ключевых применения модели: обучение с подкреплением, выбор лучшего ответа из кандидатов и фильтрация шумных данных.'}, 'en': {'title': 'Bridging the Gap in Multi-Modal Reward Models for LVLMs', 'desc': 'This paper introduces InternLM-XComposer2.5-Reward (IXC-2.5-Reward), a multi-modal reward model designed to enhance the performance of Large Vision Language Models (LVLMs) by aligning them with human preferences. The authors address the lack of publicly available multi-modal reward models by creating a comprehensive preference corpus that includes text, images, and videos across various domains. IXC-2.5-Reward demonstrates strong performance on multi-modal benchmarks and effectively supports reinforcement learning training, response selection, and data filtering. The model and its training methods are open-sourced to promote reproducibility and further research in the field.'}, 'zh': {'title': '提升视觉语言模型生成质量的多模态奖励模型', 'desc': '本文介绍了一种新的多模态奖励模型,名为InternLM-XComposer2.5-Reward(IXC-2.5-Reward),旨在提高大型视觉语言模型(LVLMs)的生成质量。该模型通过对文本、图像和视频等多种输入形式进行高质量的偏好学习,来对齐LVLMs与人类的偏好。IXC-2.5-Reward在最新的多模态奖励模型基准测试中表现优异,并在文本奖励模型基准测试中也展现了竞争力。我们还展示了IXC-2.5-Reward的三种关键应用,包括强化学习训练的监督信号、候选响应的最佳选择以及过滤噪声样本。'}}}, {'id': 'https://huggingface.co/papers/2501.11733', 'title': 'Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks', 'url': 'https://huggingface.co/papers/2501.11733', 'abstract': 'Smartphones have become indispensable in modern life, yet navigating complex tasks on mobile devices often remains frustrating. Recent advancements in large multimodal model (LMM)-based mobile agents have demonstrated the ability to perceive and act in mobile environments. However, current approaches face significant limitations: they fall short in addressing real-world human needs, struggle with reasoning-intensive and long-horizon tasks, and lack mechanisms to learn and improve from prior experiences. To overcome these challenges, we introduce Mobile-Agent-E, a hierarchical multi-agent framework capable of self-evolution through past experience. By hierarchical, we mean an explicit separation of high-level planning and low-level action execution. The framework comprises a Manager, responsible for devising overall plans by breaking down complex tasks into subgoals, and four subordinate agents--Perceptor, Operator, Action Reflector, and Notetaker--which handle fine-grained visual perception, immediate action execution, error verification, and information aggregation, respectively. Mobile-Agent-E also features a novel self-evolution module which maintains a persistent long-term memory comprising Tips and Shortcuts. Tips are general guidance and lessons learned from prior tasks on how to effectively interact with the environment. Shortcuts are reusable, executable sequences of atomic operations tailored for specific subroutines. The inclusion of Tips and Shortcuts facilitates continuous refinement in performance and efficiency. Alongside this framework, we introduce Mobile-Eval-E, a new benchmark featuring complex mobile tasks requiring long-horizon, multi-app interactions. Empirical results show that Mobile-Agent-E achieves a 22% absolute improvement over previous state-of-the-art approaches across three foundation model backbones. 
Project page: https://x-plug.github.io/MobileAgent.', 'score': 17, 'issue_id': 1798, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'a9cddb8786536def', 'authors': ['Zhenhailong Wang', 'Haiyang Xu', 'Junyang Wang', 'Xi Zhang', 'Ming Yan', 'Ji Zhang', 'Fei Huang', 'Heng Ji'], 'affiliations': ['Alibaba Group', 'University of Illinois Urbana-Champaign'], 'pdf_title_img': 'assets/pdf/title_img/2501.11733.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#optimization', '#agents', '#multimodal', '#long_context'], 'emoji': '📱', 'ru': {'title': 'Мобильный ИИ-ассистент с самообучением для сложных задач', 'desc': 'Статья представляет Mobile-Agent-E - иерархическую мультиагентную систему для выполнения сложных задач на мобильных устройствах. Система включает Менеджера для планирования и четыре подчиненных агента для восприятия, выполнения действий, проверки ошибок и агрегации информации. Ключевой особенностью является модуль самоэволюции с долговременной памятью, содержащей Подсказки и Ярлыки для улучшения производительности. Эмпирические результаты показывают значительное улучшение по сравнению с предыдущими подходами на новом бенчмарке Mobile-Eval-E.'}, 'en': {'title': 'Empowering Mobile Agents with Self-Evolution for Enhanced Task Performance', 'desc': 'This paper presents Mobile-Agent-E, a hierarchical multi-agent framework designed to enhance mobile task performance by learning from past experiences. The framework separates high-level planning from low-level execution, utilizing a Manager for task decomposition and four specialized agents for perception, action, error checking, and information management. A key feature is the self-evolution module, which incorporates a long-term memory of Tips and Shortcuts to improve task efficiency and effectiveness. Experimental results demonstrate that Mobile-Agent-E significantly outperforms existing methods, achieving a 22% improvement in complex mobile tasks.'}, 'zh': {'title': '智能手机任务执行的新突破', 'desc': '本论文介绍了一种名为Mobile-Agent-E的层次化多智能体框架,旨在提升智能手机上的任务执行能力。该框架通过将高层规划与低层执行明确分离,包含一个管理者和四个子代理,分别负责视觉感知、动作执行、错误验证和信息聚合。Mobile-Agent-E还引入了自我进化模块,利用长期记忆中的提示和捷径来不断优化性能。实验结果表明,该框架在复杂移动任务中相较于现有方法有22%的绝对提升。'}}}, {'id': 'https://huggingface.co/papers/2501.11223', 'title': 'Reasoning Language Models: A Blueprint', 'url': 'https://huggingface.co/papers/2501.11223', 'abstract': 'Reasoning language models (RLMs), also known as Large Reasoning Models (LRMs), such as OpenAI\'s o1 and o3, DeepSeek-V3, and Alibaba\'s QwQ, have redefined AI\'s problem-solving capabilities by extending large language models (LLMs) with advanced reasoning mechanisms. Yet, their high costs, proprietary nature, and complex architectures - uniquely combining Reinforcement Learning (RL), search heuristics, and LLMs - present accessibility and scalability challenges. To address these, we propose a comprehensive blueprint that organizes RLM components into a modular framework, based on a survey and analysis of all RLM works. This blueprint incorporates diverse reasoning structures (chains, trees, graphs, and nested forms), reasoning strategies (e.g., Monte Carlo Tree Search, Beam Search), RL concepts (policy, value models and others), and supervision schemes (Output-Based and Process-Based Supervision). We also provide detailed mathematical formulations and algorithmic specifications to simplify RLM implementation. 
By showing how schemes like LLaMA-Berry, QwQ, Journey Learning, and Graph of Thoughts fit as special cases, we demonstrate the blueprint\'s versatility and unifying potential. To illustrate its utility, we introduce x1, a modular implementation for rapid RLM prototyping and experimentation. Using x1 and a literature review, we provide key insights, such as multi-phase training for policy and value models, and the importance of familiar training distributions. Finally, we outline how RLMs can integrate with a broader LLM ecosystem, including tools and databases. Our work demystifies RLM construction, democratizes advanced reasoning capabilities, and fosters innovation, aiming to mitigate the gap between "rich AI" and "poor AI" by lowering barriers to RLM development and experimentation.', 'score': 16, 'issue_id': 1797, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'f554416ad9af3344', 'authors': ['Maciej Besta', 'Julia Barth', 'Eric Schreiber', 'Ales Kubicek', 'Afonso Catarino', 'Robert Gerstenberger', 'Piotr Nyczyk', 'Patrick Iff', 'Yueling Li', 'Sam Houliston', 'Tomasz Sternal', 'Marcin Copik', 'Grzegorz Kwaśniewski', 'Jürgen Müller', 'Łukasz Flis', 'Hannes Eberhard', 'Hubert Niewiadomski', 'Torsten Hoefler'], 'affiliations': ['BASF SE', 'Cledar', 'Cyfronet AGH', 'ETH Zurich'], 'pdf_title_img': 'assets/pdf/title_img/2501.11223.jpg', 'data': {'categories': ['#rl', '#math', '#training', '#survey', '#reasoning', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Демократизация искусственного интеллекта: модульный подход к созданию моделей рассуждений', 'desc': 'Статья представляет комплексный подход к созданию моделей рассуждений (RLM), объединяющих языковые модели с механизмами продвинутых рассуждений. Авторы предлагают модульную структуру, включающую различные стратегии рассуждений, концепции обучения с подкреплением и схемы обучения. Они демонстрируют применимость этой структуры на примере существующих моделей и представляют x1 - модульную реализацию для быстрого прототипирования RLM. Исследование направлено на демократизацию возможностей продвинутых рассуждений в ИИ и снижение барьеров для разработки RLM.'}, 'en': {'title': 'Democratizing Advanced Reasoning in AI', 'desc': 'This paper introduces a modular framework for Reasoning Language Models (RLMs), which enhance traditional Large Language Models (LLMs) with advanced reasoning capabilities. The authors address the challenges of high costs and complex architectures by organizing RLM components into a comprehensive blueprint that includes various reasoning structures and strategies. They provide mathematical formulations and algorithmic specifications to facilitate easier implementation of RLMs. Additionally, the paper presents x1, a tool for rapid prototyping, and discusses how RLMs can be integrated into the larger LLM ecosystem to promote accessibility and innovation in AI development.'}, 'zh': {'title': '简化推理语言模型,促进AI创新', 'desc': '推理语言模型(RLMs)通过结合强化学习、搜索启发式和大型语言模型(LLMs),重新定义了人工智能的解决问题能力。尽管它们具有强大的推理机制,但高成本和复杂架构使得其可访问性和可扩展性面临挑战。为了解决这些问题,我们提出了一个模块化框架,组织RLM组件,并提供详细的数学公式和算法规范,以简化RLM的实现。我们的工作旨在降低RLM开发和实验的门槛,促进创新,缩小“富有AI”和“贫穷AI”之间的差距。'}}}, {'id': 'https://huggingface.co/papers/2501.12202', 'title': 'Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation', 'url': 'https://huggingface.co/papers/2501.12202', 'abstract': 'We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets. 
This system includes two foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly aligns with a given condition image, laying a solid foundation for downstream applications. The texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant texture maps for either generated or hand-crafted meshes. Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production platform that simplifies the re-creation process of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models, including the open-source models and closed-source models in geometry details, condition alignment, texture quality, and etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps in the open-source 3D community for large-scale foundation generative models. The code and pre-trained weights of our models are available at: https://github.com/Tencent/Hunyuan3D-2', 'score': 14, 'issue_id': 1798, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'f95f069cba0bd83e', 'authors': ['Zibo Zhao', 'Zeqiang Lai', 'Qingxiang Lin', 'Yunfei Zhao', 'Haolin Liu', 'Shuhui Yang', 'Yifei Feng', 'Mingxin Yang', 'Sheng Zhang', 'Xianghui Yang', 'Huiwen Shi', 'Sicong Liu', 'Junta Wu', 'Yihang Lian', 'Fan Yang', 'Ruining Tang', 'Zebin He', 'Xinzhou Wang', 'Jian Liu', 'Xuhui Zuo', 'Zhuo Chen', 'Biwen Lei', 'Haohan Weng', 'Jing Xu', 'Yiling Zhu', 'Xinhai Liu', 'Lixin Xu', 'Changrong Hu', 'Tianyu Huang', 'Lifu Wang', 'Jihong Zhang', 'Meng Chen', 'Liang Dong', 'Yiwen Jia', 'Yulin Cai', 'Jiaao Yu', 'Yixuan Tang', 'Hao Zhang', 'Zheng Ye', 'Peng He', 'Runzhou Wu', 'Chao Zhang', 'Yonghao Tan', 'Jie Xiao', 'Yangyu Tao', 'Jianchen Zhu', 'Jinbao Xue', 'Kai Liu', 'Chongqing Zhao', 'Xinming Wu', 'Zhichao Hu', 'Lei Qin', 'Jianbing Peng', 'Zhan Li', 'Minghui Chen', 'Xipeng Zhang', 'Lin Niu', 'Paige Wang', 'Yingkai Wang', 'Haozhao Kuang', 'Zhongyi Fan', 'Xu Zheng', 'Weihao Zhuang', 'YingPing He', 'Tian Liu', 'Yong Yang', 'Di Wang', 'Yuhong Liu', 'Jie Jiang', 'Jingwei Huang', 'Chunchao Guo'], 'affiliations': ['Tencent'], 'pdf_title_img': 'assets/pdf/title_img/2501.12202.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🎨', 'ru': {'title': 'Революция в 3D-генерации: от формы к текстуре', 'desc': 'Hunyuan3D 2.0 - это продвинутая система для создания трехмерных текстурированных объектов высокого разрешения. Она состоит из двух основных компонентов: модели генерации форм Hunyuan3D-DiT и модели синтеза текстур Hunyuan3D-Paint. Модель генерации форм основана на масштабируемом диффузионном трансформере и создает геометрию, соответствующую заданному изображению. Модель синтеза текстур, используя геометрические и диффузионные праймы, создает высококачественные текстурные карты для сгенерированных или созданных вручную мешей.'}, 'en': {'title': 'Revolutionizing 3D Asset Creation with Hunyuan3D 2.0', 'desc': 'Hunyuan3D 2.0 is a sophisticated system designed for creating high-quality 3D models with detailed textures. It consists of two main components: Hunyuan3D-DiT for generating 3D shapes and Hunyuan3D-Paint for applying textures. 
The shape model uses a flow-based diffusion transformer to ensure that the generated geometry matches the input conditions, while the texture model leverages geometric and diffusion principles to create vibrant textures. This system not only enhances the quality of 3D assets but also provides an accessible platform for users to create and animate their models easily.'}, 'zh': {'title': 'Hunyuan3D 2.0:高效生成高质量3D资产的系统', 'desc': 'Hunyuan3D 2.0 是一个先进的大规模 3D 合成系统,能够生成高分辨率的纹理 3D 资产。该系统包含两个基础组件:Hunyuan3D-DiT 形状生成模型和 Hunyuan3D-Paint 纹理合成模型。形状生成模型基于可扩展的流式扩散变换器,旨在创建与给定条件图像相匹配的几何形状。纹理合成模型则利用强大的几何和扩散先验,为生成或手工制作的网格生成高分辨率的生动纹理图。'}}}, {'id': 'https://huggingface.co/papers/2501.12375', 'title': 'Video Depth Anything: Consistent Depth Estimation for Super-Long Videos', 'url': 'https://huggingface.co/papers/2501.12375', 'abstract': 'Depth Anything has achieved remarkable success in monocular depth estimation with strong generalization ability. However, it suffers from temporal inconsistency in videos, hindering its practical applications. Various methods have been proposed to alleviate this issue by leveraging video generation models or introducing priors from optical flow and camera poses. Nonetheless, these methods are only applicable to short videos (< 10 seconds) and require a trade-off between quality and computational efficiency. We propose Video Depth Anything for high-quality, consistent depth estimation in super-long videos (over several minutes) without sacrificing efficiency. We base our model on Depth Anything V2 and replace its head with an efficient spatial-temporal head. We design a straightforward yet effective temporal consistency loss by constraining the temporal depth gradient, eliminating the need for additional geometric priors. The model is trained on a joint dataset of video depth and unlabeled images, similar to Depth Anything V2. Moreover, a novel key-frame-based strategy is developed for long video inference. Experiments show that our model can be applied to arbitrarily long videos without compromising quality, consistency, or generalization ability. Comprehensive evaluations on multiple video benchmarks demonstrate that our approach sets a new state-of-the-art in zero-shot video depth estimation. We offer models of different scales to support a range of scenarios, with our smallest model capable of real-time performance at 30 FPS.', 'score': 13, 'issue_id': 1798, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '00640fb6adcf39e3', 'authors': ['Sili Chen', 'Hengkai Guo', 'Shengnan Zhu', 'Feihu Zhang', 'Zilong Huang', 'Jiashi Feng', 'Bingyi Kang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.12375.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#small_models', '#video', '#cv', '#training'], 'emoji': '🎥', 'ru': {'title': 'Согласованная оценка глубины для сверхдлинных видео', 'desc': 'В статье представлен метод Video Depth Anything для оценки глубины в сверхдлинных видео с высоким качеством и временной согласованностью. Модель основана на Depth Anything V2 с новой пространственно-временной головой и использует эффективную функцию потерь для обеспечения временной согласованности. Предложенный подход позволяет обрабатывать видео произвольной длительности без ущерба для качества и обобщающей способности. 
Метод достигает наилучших результатов в задаче zero-shot оценки глубины видео на нескольких бенчмарках.'}, 'en': {'title': 'Achieving Consistent Depth Estimation in Long Videos', 'desc': 'This paper introduces Video Depth Anything, a model designed for accurate depth estimation in long videos, overcoming the limitations of previous methods that struggled with temporal consistency. The model builds on Depth Anything V2, enhancing it with a spatial-temporal head and a novel temporal consistency loss that focuses on the depth gradient over time. By training on a combined dataset of video depth and unlabeled images, the model achieves high-quality depth estimation without the need for complex geometric priors. The results demonstrate that Video Depth Anything can handle videos of any length while maintaining efficiency and setting new benchmarks in zero-shot video depth estimation.'}, 'zh': {'title': '超长视频深度估计的新突破', 'desc': '本文提出了一种名为Video Depth Anything的新模型,旨在解决单目深度估计在视频中的时间一致性问题。该模型能够在超长视频(超过几分钟)中实现高质量和一致性的深度估计,而不牺牲计算效率。我们通过设计一个简单有效的时间一致性损失,来约束时间深度梯度,从而避免了额外几何先验的需求。实验结果表明,该模型在多个视频基准测试中表现出色,设定了零-shot视频深度估计的新状态。'}}}, {'id': 'https://huggingface.co/papers/2501.10893', 'title': 'Learn-by-interact: A Data-Centric Framework for Self-Adaptive Agents in Realistic Environments', 'url': 'https://huggingface.co/papers/2501.10893', 'abstract': 'Autonomous agents powered by large language models (LLMs) have the potential to enhance human capabilities, assisting with digital tasks from sending emails to performing data analysis. The abilities of existing LLMs at such tasks are often hindered by the lack of high-quality agent data from the corresponding environments they interact with. We propose Learn-by-interact, a data-centric framework to adapt LLM agents to any given environments without human annotations. Learn-by-interact synthesizes trajectories of agent-environment interactions based on documentations, and constructs instructions by summarizing or abstracting the interaction histories, a process called backward construction. We assess the quality of our synthetic data by using them in both training-based scenarios and training-free in-context learning (ICL), where we craft innovative retrieval approaches optimized for agents. Extensive experiments on SWE-bench, WebArena, OSWorld and Spider2-V spanning across realistic coding, web, and desktop environments show the effectiveness of Learn-by-interact in various downstream agentic tasks -- baseline results are improved by up to 12.2\\% for ICL with Claude-3.5 and 19.5\\% for training with Codestral-22B. We further demonstrate the critical role of backward construction, which provides up to 14.0\\% improvement for training. Our ablation studies demonstrate the efficiency provided by our synthesized data in ICL and the superiority of our retrieval pipeline over alternative approaches like conventional retrieval-augmented generation (RAG). We expect that Learn-by-interact will serve as a foundation for agent data synthesis as LLMs are increasingly deployed at real-world environments.', 'score': 13, 'issue_id': 1798, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': 'b6ab4c9ac3809941', 'authors': ['Hongjin Su', 'Ruoxi Sun', 'Jinsung Yoon', 'Pengcheng Yin', 'Tao Yu', 'Sercan Ö. 
Arık'], 'affiliations': ['Google', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.10893.jpg', 'data': {'categories': ['#optimization', '#agents', '#synthetic', '#training', '#data', '#rag', '#dataset'], 'emoji': '🤖', 'ru': {'title': 'Обучение ИИ-агентов через синтетическое взаимодействие', 'desc': 'Статья представляет Learn-by-interact - фреймворк для адаптации агентов на основе больших языковых моделей (LLM) к различным средам без аннотаций человека. Метод синтезирует траектории взаимодействия агента со средой на основе документации и создает инструкции путем обобщения истории взаимодействий. Эксперименты показывают эффективность подхода в различных задачах, улучшая базовые результаты до 19.5% при обучении. Авторы демонстрируют критическую роль обратного конструирования и превосходство их метода над альтернативными подходами.'}, 'en': {'title': 'Empowering LLM Agents through Synthetic Interaction Data', 'desc': "This paper introduces Learn-by-interact, a framework designed to enhance the performance of large language model (LLM) agents in various environments without needing human-generated data. The framework generates synthetic data by simulating interactions between agents and their environments, using documentation to guide the process. A key innovation is the backward construction method, which summarizes interaction histories to create effective instructions for the agents. Experimental results show significant improvements in agent performance across multiple tasks, highlighting the framework's potential for real-world applications."}, 'zh': {'title': '通过交互学习,提升智能代理能力', 'desc': '本文提出了一种名为Learn-by-interact的数据中心框架,旨在使大型语言模型(LLMs)能够适应不同的环境,而无需人工标注。该框架通过文档生成代理与环境交互的轨迹,并通过总结或抽象交互历史来构建指令,这一过程称为反向构建。实验结果表明,Learn-by-interact在多种下游任务中显著提高了性能,尤其是在无监督学习和训练场景中。我们还展示了反向构建在训练中的重要性,进一步验证了合成数据的有效性和检索管道的优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.08331', 'title': 'Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise', 'url': 'https://huggingface.co/papers/2501.08331', 'abstract': 'Generative modeling aims to transform random noise into structured outputs. In this work, we enhance video diffusion models by allowing motion control via structured latent noise sampling. This is achieved by just a change in data: we pre-process training videos to yield structured noise. Consequently, our method is agnostic to diffusion model design, requiring no changes to model architectures or training pipelines. Specifically, we propose a novel noise warping algorithm, fast enough to run in real time, that replaces random temporal Gaussianity with correlated warped noise derived from optical flow fields, while preserving the spatial Gaussianity. The efficiency of our algorithm enables us to fine-tune modern video diffusion base models using warped noise with minimal overhead, and provide a one-stop solution for a wide range of user-friendly motion control: local object motion control, global camera movement control, and motion transfer. The harmonization between temporal coherence and spatial Gaussianity in our warped noise leads to effective motion control while maintaining per-frame pixel quality. Extensive experiments and user studies demonstrate the advantages of our method, making it a robust and scalable approach for controlling motion in video diffusion models. Video results are available on our webpage: https://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow. 
Source code and model checkpoints are available on GitHub: https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow.', 'score': 11, 'issue_id': 1798, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'c48e19ef08e8d758', 'authors': ['Ryan Burgert', 'Yuancheng Xu', 'Wenqi Xian', 'Oliver Pilarski', 'Pascal Clausen', 'Mingming He', 'Li Ma', 'Yitong Deng', 'Lingxiao Li', 'Mohsen Mousavi', 'Michael Ryoo', 'Paul Debevec', 'Ning Yu'], 'affiliations': ['Eyeline Studios', 'Netflix', 'Stanford University', 'Stony Brook University', 'University of Maryland'], 'pdf_title_img': 'assets/pdf/title_img/2501.08331.jpg', 'data': {'categories': ['#diffusion', '#video', '#data'], 'emoji': '🎬', 'ru': {'title': 'Контроль движения в видео-диффузии через структурированный шум', 'desc': 'Исследователи предложили метод улучшения видео-диффузионных моделей путем изменения структуры шумовых данных при обучении. Они разработали алгоритм искажения шума в реальном времени, который сохраняет пространственную гауссовость, но вводит временную корреляцию на основе оптического потока. Этот подход позволяет контролировать движение в генерируемых видео без изменения архитектуры модели. Эксперименты показали эффективность метода для управления локальным движением объектов, глобальным движением камеры и переносом движения.'}, 'en': {'title': 'Transforming Noise into Motion: Enhanced Control in Video Diffusion Models', 'desc': 'This paper presents an improvement in video diffusion models by introducing a method for controlling motion through structured latent noise sampling. The authors propose a novel noise warping algorithm that modifies the training data to replace random noise with correlated noise based on optical flow, enhancing temporal coherence while maintaining spatial quality. This approach allows for real-time processing and fine-tuning of existing video diffusion models without altering their architecture or training methods. The results show that this method effectively enables various motion control tasks, making it a versatile tool for video generation applications.'}, 'zh': {'title': '运动控制的新方法:扭曲噪声的力量', 'desc': '生成建模的目标是将随机噪声转化为结构化输出。本文通过结构化潜在噪声采样增强视频扩散模型,实现了运动控制。我们提出了一种新颖的噪声扭曲算法,能够实时运行,并用光流场导出的相关扭曲噪声替代随机时间高斯噪声,同时保持空间高斯性。我们的算法高效性使得在现代视频扩散基础模型中使用扭曲噪声进行微调成为可能,提供了用户友好的运动控制解决方案。'}}}, {'id': 'https://huggingface.co/papers/2501.12273', 'title': 'Condor: Enhance LLM Alignment with Knowledge-Driven Data Synthesis and Refinement', 'url': 'https://huggingface.co/papers/2501.12273', 'abstract': 'The quality of Supervised Fine-Tuning (SFT) data plays a critical role in enhancing the conversational capabilities of Large Language Models (LLMs). However, as LLMs become more advanced, the availability of high-quality human-annotated SFT data has become a significant bottleneck, necessitating a greater reliance on synthetic training data. In this work, we introduce Condor, a novel two-stage synthetic data generation framework that incorporates World Knowledge Tree and Self-Reflection Refinement to produce high-quality SFT data at scale. Our experimental results demonstrate that a base model fine-tuned on only 20K Condor-generated samples achieves superior performance compared to counterparts. The additional refinement stage in Condor further enables iterative self-improvement for LLMs at various scales (up to 72B), validating the effectiveness of our approach. 
Furthermore, our investigation into the scaling for synthetic data in post-training reveals substantial unexplored potential for performance improvements, opening promising avenues for future research.', 'score': 11, 'issue_id': 1796, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '10499c8b820d5368', 'authors': ['Maosong Cao', 'Taolin Zhang', 'Mo Li', 'Chuyu Zhang', 'Yunxin Liu', 'Haodong Duan', 'Songyang Zhang', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12273.jpg', 'data': {'categories': ['#optimization', '#synthetic', '#data', '#dataset', '#training'], 'emoji': '🦅', 'ru': {'title': 'Condor: прорыв в создании синтетических данных для обучения языковых моделей', 'desc': 'В статье представлен Condor - новый фреймворк для генерации синтетических данных для обучения больших языковых моделей (LLM). Он использует дерево мировых знаний и самоанализ для создания высококачественных обучающих данных. Эксперименты показали, что модель, обученная на 20 тысячах сгенерированных Condor примеров, превосходит аналоги. Исследование также выявило потенциал для улучшения производительности LLM при масштабировании синтетических данных.'}, 'en': {'title': 'Unlocking LLM Potential with Synthetic Data Generation', 'desc': 'This paper addresses the challenge of obtaining high-quality Supervised Fine-Tuning (SFT) data for Large Language Models (LLMs). It presents Condor, a two-stage framework that generates synthetic training data using World Knowledge Tree and Self-Reflection Refinement techniques. The results show that models fine-tuned with just 20,000 samples from Condor outperform those trained with traditional methods. Additionally, the framework allows for iterative self-improvement, suggesting significant potential for enhancing LLM performance through synthetic data.'}, 'zh': {'title': '合成数据生成,提升对话能力的关键', 'desc': '本论文探讨了监督微调(SFT)数据的质量对大型语言模型(LLMs)对话能力的重要性。随着LLMs的进步,高质量的人类标注SFT数据变得稀缺,因此需要更多依赖合成训练数据。我们提出了一种名为Condor的两阶段合成数据生成框架,结合了世界知识树和自我反思精炼,以大规模生成高质量的SFT数据。实验结果表明,仅用20K个Condor生成的样本微调的基础模型,其性能优于其他模型,验证了我们方法的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.10687', 'title': 'EMO2: End-Effector Guided Audio-Driven Avatar Video Generation', 'url': 'https://huggingface.co/papers/2501.10687', 'abstract': 'In this paper, we propose a novel audio-driven talking head method capable of simultaneously generating highly expressive facial expressions and hand gestures. Unlike existing methods that focus on generating full-body or half-body poses, we investigate the challenges of co-speech gesture generation and identify the weak correspondence between audio features and full-body gestures as a key limitation. To address this, we redefine the task as a two-stage process. In the first stage, we generate hand poses directly from audio input, leveraging the strong correlation between audio signals and hand movements. In the second stage, we employ a diffusion model to synthesize video frames, incorporating the hand poses generated in the first stage to produce realistic facial expressions and body movements. Our experimental results demonstrate that the proposed method outperforms state-of-the-art approaches, such as CyberHost and Vlogger, in terms of both visual quality and synchronization accuracy. 
This work provides a new perspective on audio-driven gesture generation and a robust framework for creating expressive and natural talking head animations.', 'score': 9, 'issue_id': 1798, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': '13c0931101eb51eb', 'authors': ['Linrui Tian', 'Siqi Hu', 'Qi Wang', 'Bang Zhang', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10687.jpg', 'data': {'categories': ['#multimodal', '#audio', '#video', '#games', '#diffusion'], 'emoji': '🗣️', 'ru': {'title': 'Революция в анимации: от звука к выразительным жестам', 'desc': 'В статье предлагается новый метод создания говорящей головы на основе аудио, способный одновременно генерировать выразительные мимику и жесты рук. Авторы определяют задачу как двухэтапный процесс: сначала генерируются позы рук непосредственно из аудиовхода, затем применяется диффузионная модель для синтеза видеокадров. Экспериментальные результаты показывают, что предложенный метод превосходит современные подходы по качеству изображения и точности синхронизации. Работа предоставляет новый взгляд на генерацию жестов на основе аудио и надежную основу для создания выразительных и естественных анимаций говорящей головы.'}, 'en': {'title': 'Expressive Talking Heads: Bridging Audio and Gesture Generation', 'desc': 'This paper introduces a new method for creating talking head animations that are driven by audio. It focuses on generating both facial expressions and hand gestures, addressing the limitations of previous methods that often overlook the connection between audio and gestures. The approach is divided into two stages: first, it generates hand poses from audio signals, and then it uses a diffusion model to create video frames that combine these hand poses with realistic facial movements. The results show that this method is more effective than existing techniques, providing better visual quality and synchronization with the audio.'}, 'zh': {'title': '音频驱动的生动表情与手势生成新方法', 'desc': '本文提出了一种新颖的音频驱动的说话头方法,能够同时生成高度表现力的面部表情和手势。与现有方法不同,我们关注于共语手势生成的挑战,并识别音频特征与全身手势之间的弱对应关系。为了解决这个问题,我们将任务重新定义为两个阶段:第一阶段直接从音频输入生成手势,第二阶段使用扩散模型合成视频帧,结合第一阶段生成的手势,产生逼真的面部表情和身体动作。实验结果表明,该方法在视觉质量和同步精度方面优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.12390', 'title': 'GPS as a Control Signal for Image Generation', 'url': 'https://huggingface.co/papers/2501.12390', 'abstract': 'We show that the GPS tags contained in photo metadata provide a useful control signal for image generation. We train GPS-to-image models and use them for tasks that require a fine-grained understanding of how images vary within a city. In particular, we train a diffusion model to generate images conditioned on both GPS and text. The learned model generates images that capture the distinctive appearance of different neighborhoods, parks, and landmarks. We also extract 3D models from 2D GPS-to-image models through score distillation sampling, using GPS conditioning to constrain the appearance of the reconstruction from each viewpoint. Our evaluations suggest that our GPS-conditioned models successfully learn to generate images that vary based on location, and that GPS conditioning improves estimated 3D structure.', 'score': 9, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '11d289e8a895bedd', 'authors': ['Chao Feng', 'Ziyang Chen', 'Aleksander Holynski', 'Alexei A. 
Efros', 'Andrew Owens'], 'affiliations': ['UC Berkeley', 'University of Michigan'], 'pdf_title_img': 'assets/pdf/title_img/2501.12390.jpg', 'data': {'categories': ['#synthetic', '#cv', '#multimodal', '#dataset', '#diffusion', '#3d'], 'emoji': '🗺️', 'ru': {'title': 'GPS-метки открывают новые горизонты в генерации изображений и 3D-моделировании', 'desc': 'Исследователи демонстрируют, как GPS-метки в метаданных фотографий могут использоваться для улучшения генерации изображений. Они обучают модели диффузии, генерирующие изображения на основе GPS-координат и текста, что позволяет точно отображать особенности различных районов и достопримечательностей. Авторы также извлекают 3D-модели из 2D GPS-моделей с помощью методики score distillation sampling. Результаты показывают, что GPS-обусловленные модели успешно генерируют изображения, варьирующиеся в зависимости от местоположения, и улучшают оценку 3D-структуры.'}, 'en': {'title': 'Harnessing GPS Data for Location-Aware Image Generation', 'desc': 'This paper explores the use of GPS data embedded in photo metadata as a control signal for generating images. The authors develop GPS-to-image models, particularly a diffusion model, that can create images based on both GPS coordinates and textual descriptions. The model effectively captures the unique characteristics of various urban environments, such as neighborhoods and landmarks. Additionally, they demonstrate the ability to extract 3D models from these images, enhancing the accuracy of 3D reconstructions by using GPS information to guide the process.'}, 'zh': {'title': '利用GPS标签生成城市图像的创新方法', 'desc': '本文展示了照片元数据中的GPS标签可以作为图像生成的有用控制信号。我们训练了GPS到图像的模型,并将其应用于需要细致理解城市中图像变化的任务。特别地,我们训练了一个扩散模型,生成同时依赖于GPS和文本的图像。评估结果表明,我们的GPS条件模型成功学习了基于位置生成变化图像,并且GPS条件改善了估计的3D结构。'}}}, {'id': 'https://huggingface.co/papers/2501.10057', 'title': 'MSTS: A Multimodal Safety Test Suite for Vision-Language Models', 'url': 'https://huggingface.co/papers/2501.10057', 'abstract': 'Vision-language models (VLMs), which process image and text inputs, are increasingly integrated into chat assistants and other consumer AI applications. Without proper safeguards, however, VLMs may give harmful advice (e.g. how to self-harm) or encourage unsafe behaviours (e.g. to consume drugs). Despite these clear hazards, little work so far has evaluated VLM safety and the novel risks created by multimodal inputs. To address this gap, we introduce MSTS, a Multimodal Safety Test Suite for VLMs. MSTS comprises 400 test prompts across 40 fine-grained hazard categories. Each test prompt consists of a text and an image that only in combination reveal their full unsafe meaning. With MSTS, we find clear safety issues in several open VLMs. We also find some VLMs to be safe by accident, meaning that they are safe because they fail to understand even simple test prompts. We translate MSTS into ten languages, showing non-English prompts to increase the rate of unsafe model responses. We also show models to be safer when tested with text only rather than multimodal prompts. 
Finally, we explore the automation of VLM safety assessments, finding even the best safety classifiers to be lacking.', 'score': 7, 'issue_id': 1802, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '05ea9cad57d3e1e6', 'authors': ['Paul Röttger', 'Giuseppe Attanasio', 'Felix Friedrich', 'Janis Goldzycher', 'Alicia Parrish', 'Rishabh Bhardwaj', 'Chiara Di Bonaventura', 'Roman Eng', 'Gaia El Khoury Geagea', 'Sujata Goswami', 'Jieun Han', 'Dirk Hovy', 'Seogyeong Jeong', 'Paloma Jeretič', 'Flor Miriam Plaza-del-Arco', 'Donya Rooein', 'Patrick Schramowski', 'Anastassia Shaitarova', 'Xudong Shen', 'Richard Willats', 'Andrea Zugarini', 'Bertie Vidgen'], 'affiliations': ['Bocconi University', 'CERTAIN', 'Clarkson University', 'Contextual AI', 'DFKI', 'Expert.ai', 'Google DeepMind', 'Hessian.AI', 'Imperial College London', 'Instituto de Telecomunicações', 'KAIST', 'Kings College London', 'Lawrence Berkeley National Laboratory', 'National University of Singapore', 'TU Darmstadt', 'University of Pennsylvania', 'University of Zurich', 'Walled AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.10057.jpg', 'data': {'categories': ['#security', '#dataset', '#benchmark', '#multimodal', '#ethics', '#multilingual'], 'emoji': '🔍', 'ru': {'title': 'Новый подход к оценке безопасности мультимодальных ИИ-моделей', 'desc': 'Статья представляет новый набор тестов MSTS для оценки безопасности мультимодальных моделей, работающих с изображениями и текстом. MSTS содержит 400 тестовых запросов в 40 категориях опасностей, где небезопасный смысл раскрывается только при сочетании текста и изображения. Исследование выявило проблемы безопасности в нескольких открытых мультимодальных моделях, а также показало, что некоторые модели безопасны случайно из-за непонимания даже простых запросов. Авторы также обнаружили, что модели менее безопасны при тестировании на других языках и с мультимодальными запросами по сравнению с только текстовыми.'}, 'en': {'title': 'Ensuring Safety in Vision-Language Models: A New Testing Approach', 'desc': 'This paper discusses the safety concerns associated with Vision-Language Models (VLMs) that combine image and text inputs. It introduces the Multimodal Safety Test Suite (MSTS), which includes 400 test prompts designed to evaluate the safety of VLMs across various hazard categories. The study reveals that many VLMs exhibit safety issues when processing multimodal inputs, while some are inadvertently safe due to their inability to comprehend simple prompts. Additionally, the research highlights the challenges in automating safety assessments for VLMs, indicating that even the most advanced safety classifiers have limitations.'}, 'zh': {'title': '确保视觉语言模型安全的关键测试', 'desc': '本文介绍了一种多模态安全测试套件(MSTS),用于评估视觉语言模型(VLMs)的安全性。MSTS包含400个测试提示,涵盖40个细分的危险类别,每个提示由文本和图像组合而成,以揭示其潜在的危险含义。研究发现,许多开放的VLM在安全性方面存在明显问题,而一些模型由于无法理解简单提示而意外地表现出安全性。此外,测试结果表明,单一文本提示的安全性高于多模态提示,且现有的安全分类器在自动化评估中仍存在不足。'}}}, {'id': 'https://huggingface.co/papers/2501.10573', 'title': 'The Geometry of Tokens in Internal Representations of Large Language Models', 'url': 'https://huggingface.co/papers/2501.10573', 'abstract': 'We investigate the relationship between the geometry of token embeddings and their role in the next token prediction within transformer models. 
An important aspect of this connection uses the notion of empirical measure, which encodes the distribution of token point clouds across transformer layers and drives the evolution of token representations in the mean-field interacting picture. We use metrics such as intrinsic dimension, neighborhood overlap, and cosine similarity to observationally probe these empirical measures across layers. To validate our approach, we compare these metrics to a dataset where the tokens are shuffled, which disrupts the syntactic and semantic structure. Our findings reveal a correlation between the geometric properties of token embeddings and the cross-entropy loss of next token predictions, implying that prompts with higher loss values have tokens represented in higher-dimensional spaces.', 'score': 5, 'issue_id': 1807, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '1b34301e721ccccd', 'authors': ['Karthik Viswanathan', 'Yuri Gardinazzi', 'Giada Panerai', 'Alberto Cazzaniga', 'Matteo Biagetti'], 'affiliations': ['Area Science Park, Trieste, Italy', 'University of Amsterdam, Amsterdam, the Netherlands', 'University of Trieste, Trieste, Italy'], 'pdf_title_img': 'assets/pdf/title_img/2501.10573.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#training', '#dataset', '#interpretability', '#data', '#architecture'], 'emoji': '🧮', 'ru': {'title': 'Геометрия вложений токенов раскрывает тайны предсказания в трансформерах', 'desc': 'Исследование посвящено связи между геометрией вложений токенов и их ролью в предсказании следующего токена в трансформерных моделях. Авторы используют понятие эмпирической меры для анализа распределения облаков точек токенов по слоям трансформера. Они применяют метрики, такие как внутренняя размерность, перекрытие окрестностей и косинусное сходство, для изучения этих эмпирических мер. Результаты показывают корреляцию между геометрическими свойствами вложений токенов и кросс-энтропийной функцией потерь при предсказании следующего токена.'}, 'en': {'title': 'Geometry Matters: Token Embeddings Shape Prediction Success', 'desc': 'This paper explores how the shape and arrangement of token embeddings affect the ability of transformer models to predict the next token in a sequence. It introduces the concept of empirical measure to analyze how token representations change across different layers of the model. By examining metrics like intrinsic dimension and cosine similarity, the authors investigate the geometric properties of these embeddings. The results show that tokens associated with higher prediction errors are represented in more complex, higher-dimensional spaces, highlighting the importance of geometry in language modeling.'}, 'zh': {'title': '标记嵌入几何与预测损失的关系', 'desc': '本文研究了在变换器模型中,标记嵌入的几何形状与下一个标记预测之间的关系。我们使用经验测度的概念来编码标记点云在变换器层中的分布,并驱动标记表示的演变。通过内在维度、邻域重叠和余弦相似度等指标,我们观察了这些经验测度在各层之间的变化。研究结果表明,标记嵌入的几何特性与下一个标记预测的交叉熵损失之间存在相关性,损失值较高的提示对应的标记在更高维空间中表示。'}}}, {'id': 'https://huggingface.co/papers/2501.11900', 'title': 'Panoramic Interests: Stylistic-Content Aware Personalized Headline Generation', 'url': 'https://huggingface.co/papers/2501.11900', 'abstract': "Personalized news headline generation aims to provide users with attention-grabbing headlines that are tailored to their preferences. Prevailing methods focus on user-oriented content preferences, but most of them overlook the fact that diverse stylistic preferences are integral to users' panoramic interests, leading to suboptimal personalization. 
In view of this, we propose a novel Stylistic-Content Aware Personalized Headline Generation (SCAPE) framework. SCAPE extracts both content and stylistic features from headlines with the aid of large language model (LLM) collaboration. It further adaptively integrates users' long- and short-term interests through a contrastive learning-based hierarchical fusion network. By incorporating the panoramic interests into the headline generator, SCAPE reflects users' stylistic-content preferences during the generation process. Extensive experiments on the real-world dataset PENS demonstrate the superiority of SCAPE over baselines.", 'score': 3, 'issue_id': 1805, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'af7a432a54575398', 'authors': ['Junhong Lian', 'Xiang Ao', 'Xinyu Liu', 'Yang Liu', 'Qing He'], 'affiliations': ['Institute of Computing Technology, Chinese Academy of Sciences', 'Key Lab of Intelligent Information Processing of Chinese Academy of Sciences (CAS)'], 'pdf_title_img': 'assets/pdf/title_img/2501.11900.jpg', 'data': {'categories': ['#multimodal', '#training', '#story_generation', '#dataset'], 'emoji': '📰', 'ru': {'title': 'SCAPE: персонализация заголовков с учетом стиля и содержания', 'desc': 'Эта статья представляет новый подход к генерации персонализированных заголовков новостей, называемый SCAPE. Фреймворк SCAPE учитывает как содержательные, так и стилистические предпочтения пользователей с помощью большой языковой модели. Он адаптивно интегрирует долгосрочные и краткосрочные интересы пользователей через иерархическую сеть слияния на основе контрастного обучения. Эксперименты на реальном датасете PENS демонстрируют превосходство SCAPE над базовыми методами.'}, 'en': {'title': 'Tailored Headlines: Merging Style and Content for Personalization', 'desc': "This paper introduces a new framework called SCAPE for generating personalized news headlines that cater to both content and stylistic preferences of users. Unlike previous methods that primarily focus on content, SCAPE recognizes the importance of diverse stylistic choices in enhancing personalization. The framework utilizes large language models to extract relevant features and employs a contrastive learning-based hierarchical fusion network to integrate users' interests over time. Experimental results on the PENS dataset show that SCAPE outperforms existing approaches in generating more appealing and tailored headlines."}, 'zh': {'title': '个性化标题生成的新视角:风格与内容的结合', 'desc': '个性化新闻标题生成旨在为用户提供吸引眼球的标题,符合他们的偏好。现有方法主要关注用户的内容偏好,但往往忽视了用户多样化的风格偏好,这导致个性化效果不佳。为此,我们提出了一种新颖的风格内容感知个性化标题生成框架(SCAPE)。SCAPE通过大型语言模型提取标题的内容和风格特征,并通过对比学习的层次融合网络自适应整合用户的长期和短期兴趣,从而在生成过程中反映用户的风格内容偏好。'}}}, {'id': 'https://huggingface.co/papers/2501.12389', 'title': 'Taming Teacher Forcing for Masked Autoregressive Video Generation', 'url': 'https://huggingface.co/papers/2501.12389', 'abstract': 'We introduce MAGI, a hybrid video generation framework that combines masked modeling for intra-frame generation with causal modeling for next-frame generation. Our key innovation, Complete Teacher Forcing (CTF), conditions masked frames on complete observation frames rather than masked ones (namely Masked Teacher Forcing, MTF), enabling a smooth transition from token-level (patch-level) to frame-level autoregressive generation. CTF significantly outperforms MTF, achieving a +23% improvement in FVD scores on first-frame conditioned video prediction. 
To address issues like exposure bias, we employ targeted training strategies, setting a new benchmark in autoregressive video generation. Experiments show that MAGI can generate long, coherent video sequences exceeding 100 frames, even when trained on as few as 16 frames, highlighting its potential for scalable, high-quality video generation.', 'score': 1, 'issue_id': 1813, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '43a9c17394f0d637', 'authors': ['Deyu Zhou', 'Quan Sun', 'Yuang Peng', 'Kun Yan', 'Runpei Dong', 'Duomin Wang', 'Zheng Ge', 'Nan Duan', 'Xiangyu Zhang', 'Lionel M. Ni', 'Heung-Yeung Shum'], 'affiliations': ['HKUST', 'HKUST(GZ)', 'StepFun', 'THU', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.12389.jpg', 'data': {'categories': ['#training', '#video', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'MAGI: Революция в автоматической генерации видео', 'desc': 'MAGI - это гибридная система генерации видео, объединяющая маскированное моделирование для внутрикадровой генерации и каузальное моделирование для генерации следующего кадра. Ключевое нововведение - Complete Teacher Forcing (CTF), которое обусловливает маскированные кадры полными наблюдаемыми кадрами, а не маскированными. CTF значительно превосходит Masked Teacher Forcing (MTF), улучшая показатели FVD на 23% при прогнозировании видео на основе первого кадра. MAGI способна генерировать длинные, связные видеопоследовательности, превышающие 100 кадров, даже при обучении на всего 16 кадрах.'}, 'en': {'title': 'MAGI: Revolutionizing Video Generation with Complete Teacher Forcing', 'desc': 'MAGI is a new framework for generating videos that uses two main techniques: masked modeling for creating individual frames and causal modeling for predicting the next frame. The innovative approach called Complete Teacher Forcing (CTF) improves the process by using fully observed frames to guide the generation, rather than just partially masked frames. This method leads to a significant performance boost, as evidenced by a 23% increase in FVD scores compared to previous methods. Additionally, MAGI can produce long and coherent video sequences, demonstrating its effectiveness even with limited training data.'}, 'zh': {'title': 'MAGI:高效视频生成的新突破', 'desc': '本文介绍了一种名为MAGI的混合视频生成框架,它结合了掩码建模用于帧内生成和因果建模用于下一帧生成。我们提出的关键创新是完整教师强制(CTF),它基于完整观察帧而非掩码帧来条件化掩码帧,从而实现从标记级到帧级自回归生成的平滑过渡。CTF在第一帧条件视频预测中显著优于掩码教师强制(MTF),FVD分数提高了23%。实验表明,MAGI能够生成超过100帧的长时间连贯视频序列,即使在仅用16帧训练的情况下,也展现了其可扩展性和高质量视频生成的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.12206', 'title': 'Fixing Imbalanced Attention to Mitigate In-Context Hallucination of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.12206', 'abstract': 'Large Vision Language Models (LVLMs) have demonstrated remarkable capabilities in understanding and describing visual content, achieving state-of-the-art performance across various vision-language tasks. However, these models frequently exhibit hallucination behavior, where they generate descriptions containing objects or details absent in the input image. Our work investigates this phenomenon by analyzing attention patterns across transformer layers and heads, revealing that hallucinations often stem from progressive degradation of visual grounding in deeper layers. We propose a novel attention modification approach that combines selective token emphasis and head-specific modulation to maintain visual grounding throughout the generation process. 
Our method introduces two key components: (1) a dual-stream token selection mechanism that identifies and prioritizes both locally informative and spatially significant visual tokens, and (2) an attention head-specific modulation strategy that differentially amplifies visual information processing based on measured visual sensitivity of individual attention heads. Through extensive experimentation on the MSCOCO dataset, we demonstrate that our approach reduces hallucination rates by up to 62.3\\% compared to baseline models while maintaining comparable task performance. Our analysis reveals that selectively modulating tokens across attention heads with varying levels of visual sensitivity can significantly improve visual grounding without requiring model retraining.', 'score': 0, 'issue_id': 1812, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'd37fc59e414ab903', 'authors': ['Kazi Hasan Ibn Arif', 'Sajib Acharjee Dip', 'Khizar Hussain', 'Lang Zhang', 'Chris Thomas'], 'affiliations': ['Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.12206.jpg', 'data': {'categories': ['#multimodal', '#dataset', '#interpretability', '#architecture', '#cv', '#hallucinations'], 'emoji': '👁️', 'ru': {'title': 'Улучшение визуальной привязки для снижения галлюцинаций в LVLM', 'desc': 'Данная статья исследует проблему галлюцинаций в крупных визуально-языковых моделях (LVLM) при описании изображений. Авторы анализируют паттерны внимания в слоях трансформера и обнаруживают, что галлюцинации часто возникают из-за ослабления визуальной привязки в глубоких слоях. Предлагается новый подход модификации внимания, сочетающий выборочное усиление токенов и модуляцию головок внимания для сохранения визуальной привязки. Эксперименты показывают, что метод снижает уровень галлюцинаций на 62.3% по сравнению с базовыми моделями.'}, 'en': {'title': 'Enhancing Visual Grounding to Combat Hallucinations in LVLMs', 'desc': "This paper addresses the issue of hallucination in Large Vision Language Models (LVLMs), where the models generate incorrect descriptions that include non-existent objects. The authors analyze attention patterns in transformer layers to understand how visual grounding deteriorates in deeper layers, leading to these hallucinations. They propose a new method that enhances attention by focusing on important visual tokens and adjusting how different attention heads process visual information. Their experiments show that this approach can significantly reduce hallucination rates while keeping the model's performance on tasks intact."}, 'zh': {'title': '减少幻觉,提升视觉理解!', 'desc': '大型视觉语言模型(LVLMs)在理解和描述视觉内容方面表现出色,但它们常常会产生幻觉行为,即生成的描述中包含输入图像中不存在的对象或细节。我们的研究分析了变换器层和头部的注意力模式,发现幻觉通常源于深层次的视觉基础逐渐退化。我们提出了一种新的注意力修改方法,结合选择性标记强调和头部特定调制,以在生成过程中保持视觉基础。通过在MSCOCO数据集上的广泛实验,我们的方法将幻觉率降低了多达62.3%,同时保持了相似的任务性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02976', 'title': 'STAR: Spatial-Temporal Augmentation with Text-to-Video Models for Real-World Video Super-Resolution', 'url': 'https://huggingface.co/papers/2501.02976', 'abstract': 'Image diffusion models have been adapted for real-world video super-resolution to tackle over-smoothing issues in GAN-based methods. However, these models struggle to maintain temporal consistency, as they are trained on static images, limiting their ability to capture temporal dynamics effectively. Integrating text-to-video (T2V) models into video super-resolution for improved temporal modeling is straightforward. 
However, two key challenges remain: artifacts introduced by complex degradations in real-world scenarios, and compromised fidelity due to the strong generative capacity of powerful T2V models (e.g., CogVideoX-5B). To enhance the spatio-temporal quality of restored videos, we introduce STAR (Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution), a novel approach that leverages T2V models for real-world video super-resolution, achieving realistic spatial details and robust temporal consistency. Specifically, we introduce a Local Information Enhancement Module (LIEM) before the global attention block to enrich local details and mitigate degradation artifacts. Moreover, we propose a Dynamic Frequency (DF) Loss to reinforce fidelity, guiding the model to focus on different frequency components across diffusion steps. Extensive experiments demonstrate STAR outperforms state-of-the-art methods on both synthetic and real-world datasets.', 'score': 36, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '13ac412646c508f5', 'authors': ['Rui Xie', 'Yinhong Liu', 'Penghao Zhou', 'Chen Zhao', 'Jun Zhou', 'Kai Zhang', 'Zhenyu Zhang', 'Jian Yang', 'Zhenheng Yang', 'Ying Tai'], 'affiliations': ['ByteDance', 'Nanjing University', 'Southwest University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02976.jpg', 'data': {'categories': ['#cv', '#optimization', '#diffusion', '#multimodal', '#video'], 'emoji': '🎥', 'ru': {'title': 'Качественное суперразрешение видео с помощью T2V моделей', 'desc': 'Представлена новая методика STAR для суперразрешения видео в реальных условиях с использованием моделей text-to-video. Предложен модуль LIEM для улучшения локальных деталей и устранения артефактов деградации. Введена функция потерь Dynamic Frequency для усиления точности восстановления на разных частотах. Эксперименты показывают превосходство STAR над современными методами на синтетических и реальных датасетах.'}, 'en': {'title': 'Enhancing Video Quality with T2V Models for Real-World Super-Resolution', 'desc': 'This paper presents a new method called Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution, which aims to improve video quality by addressing issues of over-smoothing and temporal consistency. Traditional image diffusion models struggle with video because they are designed for static images, leading to challenges in capturing motion dynamics. The proposed approach incorporates a Local Information Enhancement Module to enhance local details and reduce artifacts, along with a Dynamic Frequency Loss to maintain fidelity across different frequency components. 
Experimental results show that this method outperforms existing techniques in both synthetic and real-world scenarios, providing better spatial and temporal quality in restored videos.'}, 'zh': {'title': '提升视频超分辨率的时空一致性', 'desc': '本文提出了一种新方法,名为STAR,用于提高真实世界视频超分辨率的时空质量。该方法结合了文本到视频(T2V)模型,以解决传统生成对抗网络(GAN)方法中的过平滑问题。通过引入局部信息增强模块(LIEM)和动态频率损失(DF Loss),该方法能够有效改善视频的局部细节和时间一致性。实验结果表明,STAR在合成和真实世界数据集上均优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.03226', 'title': 'BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning', 'url': 'https://huggingface.co/papers/2501.03226', 'abstract': "Cutting-edge large language models (LLMs) demonstrate promising performance in solving complex math problems with a divide-and-conquer pipeline and the assistance of in-context learning (ICL) examples. However, their potential for improvement is limited by two critical problems within their ICL examples: granularity-mismatch and the ensuing negative-effect noise problem. Specifically, the LLMs are capable of the dividing process yet mostly failed by inaccurate reasoning within a few conquer steps, while the ICL examples retrieved in question-grained sometimes lack relevant steps for a specific challenging reasoning step. Further, this disconnect may hinder the correct reasoning due to its irrelevance. To this end, we focus on improving the reasoning quality within each step and present BoostStep. BoostStep aligns the granularity between the retrieving and reasoning on step grained, and provides highly related ICL examples for each reasoning step with a novel `first-try' strategy. BoostStep provides more relevant examples than the coarse question-grained strategy, enhancing the model reasoning quality within each step steadily. BoostStep is a general and robust reasoning-enhancing method that not only improves standalone reasoning performance but also integrates seamlessly with Monte Carlo Tree Search methods (MCTS) to refine both candidate generation and decision-making. Quantitatively, it improves GPT-4o and Qwen2.5-Math-72B by 3.6\\% and 2.0\\% respectively on various mathematical benchmarks, and 7.5\\% gain combined with MCTS.", 'score': 21, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '94a01c7d4516c725', 'authors': ['Beichen Zhang', 'Yuhong Liu', 'Xiaoyi Dong', 'Yuhang Zang', 'Pan Zhang', 'Haodong Duan', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03226.jpg', 'data': {'categories': ['#training', '#optimization', '#math', '#reasoning'], 'emoji': '🧮', 'ru': {'title': 'BoostStep: Повышение точности рассуждений ИИ в решении математических задач', 'desc': 'Статья представляет метод BoostStep для улучшения решения сложных математических задач большими языковыми моделями. BoostStep решает проблемы несоответствия детализации и негативного шума в примерах обучения в контексте. Метод выравнивает гранулярность между извлечением и рассуждением на уровне шагов, предоставляя релевантные примеры для каждого шага рассуждения. 
BoostStep повышает качество рассуждений модели и может интегрироваться с методами поиска по дереву Монте-Карло для улучшения генерации кандидатов и принятия решений.'}, 'en': {'title': 'Boosting Reasoning Quality in Large Language Models with BoostStep', 'desc': "This paper introduces BoostStep, a method designed to enhance the reasoning quality of large language models (LLMs) when solving complex math problems. It addresses two main issues: granularity-mismatch and negative-effect noise in in-context learning (ICL) examples, which can lead to inaccurate reasoning. By aligning the granularity of retrieved examples with the specific reasoning steps required, BoostStep provides more relevant ICL examples, improving the model's performance. The method not only boosts standalone reasoning but also integrates effectively with Monte Carlo Tree Search (MCTS) to enhance decision-making processes."}, 'zh': {'title': '提升推理质量的BoostStep方法', 'desc': '这篇论文探讨了大型语言模型(LLMs)在解决复杂数学问题时的表现,特别是通过分而治之的策略和上下文学习(ICL)示例的辅助。研究发现,ICL示例中的粒度不匹配和负面噪声问题限制了模型的改进潜力。为了解决这些问题,论文提出了BoostStep方法,它通过对每个推理步骤的粒度进行对齐,提供更相关的ICL示例,从而提高推理质量。BoostStep不仅提升了独立推理的性能,还能与蒙特卡洛树搜索(MCTS)方法无缝集成,进一步优化候选生成和决策过程。'}}}, {'id': 'https://huggingface.co/papers/2501.03218', 'title': 'Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction', 'url': 'https://huggingface.co/papers/2501.03218', 'abstract': 'Active Real-time interaction with video LLMs introduces a new paradigm for human-computer interaction, where the model not only understands user intent but also responds while continuously processing streaming video on the fly. Unlike offline video LLMs, which analyze the entire video before answering questions, active real-time interaction requires three capabilities: 1) Perception: real-time video monitoring and interaction capturing. 2) Decision: raising proactive interaction in proper situations, 3) Reaction: continuous interaction with users. However, inherent conflicts exist among the desired capabilities. The Decision and Reaction require a contrary Perception scale and grain, and the autoregressive decoding blocks the real-time Perception and Decision during the Reaction. To unify the conflicted capabilities within a harmonious system, we present Dispider, a system that disentangles Perception, Decision, and Reaction. Dispider features a lightweight proactive streaming video processing module that tracks the video stream and identifies optimal moments for interaction. Once the interaction is triggered, an asynchronous interaction module provides detailed responses, while the processing module continues to monitor the video in the meantime. Our disentangled and asynchronous design ensures timely, contextually accurate, and computationally efficient responses, making Dispider ideal for active real-time interaction for long-duration video streams. Experiments show that Dispider not only maintains strong performance in conventional video QA tasks, but also significantly surpasses previous online models in streaming scenario responses, thereby validating the effectiveness of our architecture. 
The code and model are released at https://github.com/Mark12Ding/Dispider.', 'score': 20, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '1e9974be2d206516', 'authors': ['Rui Qian', 'Shuangrui Ding', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03218.jpg', 'data': {'categories': ['#long_context', '#video', '#optimization', '#architecture', '#interpretability'], 'emoji': '🎥', 'ru': {'title': 'Dispider: Интеллектуальное взаимодействие с видео в реальном времени', 'desc': 'Статья представляет систему Dispider для активного взаимодействия с видео в реальном времени с использованием языковых моделей. Система разделяет процессы восприятия, принятия решений и реакции, что позволяет эффективно обрабатывать потоковое видео и взаимодействовать с пользователем. Dispider использует легковесный модуль обработки видео для отслеживания потока и определения оптимальных моментов для взаимодействия. Асинхронная архитектура обеспечивает своевременные и точные ответы при длительной обработке видеопотоков.'}, 'en': {'title': 'Dispider: Real-time Interaction Redefined for Video LLMs', 'desc': 'This paper introduces Dispider, a system designed for active real-time interaction with video using large language models (LLMs). Unlike traditional offline models, Dispider can process video streams continuously while engaging with users, requiring three key capabilities: Perception, Decision, and Reaction. The system addresses conflicts between these capabilities by disentangling them, allowing for efficient monitoring and interaction without lag. Experimental results demonstrate that Dispider outperforms previous models in streaming scenarios, providing timely and contextually relevant responses during long-duration video interactions.'}, 'zh': {'title': '主动实时交互的新范式', 'desc': '本论文介绍了一种名为Dispider的系统,旨在实现视频大语言模型的主动实时交互。该系统通过分离感知、决策和反应三个能力,解决了实时交互中的固有冲突。Dispider具备轻量级的流媒体处理模块,能够实时监控视频流并识别最佳交互时机。实验结果表明,Dispider在传统视频问答任务中表现优异,并在流媒体场景响应上显著超越了之前的在线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.02157', 'title': 'Personalized Graph-Based Retrieval for Large Language Models', 'url': 'https://huggingface.co/papers/2501.02157', 'abstract': 'As large language models (LLMs) evolve, their ability to deliver personalized and context-aware responses offers transformative potential for improving user experiences. Existing personalization approaches, however, often rely solely on user history to augment the prompt, limiting their effectiveness in generating tailored outputs, especially in cold-start scenarios with sparse data. To address these limitations, we propose Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG), a framework that leverages user-centric knowledge graphs to enrich personalization. By directly integrating structured user knowledge into the retrieval process and augmenting prompts with user-relevant context, PGraphRAG enhances contextual understanding and output quality. We also introduce the Personalized Graph-based Benchmark for Text Generation, designed to evaluate personalized text generation tasks in real-world settings where user history is sparse or unavailable. 
Experimental results show that PGraphRAG significantly outperforms state-of-the-art personalization methods across diverse tasks, demonstrating the unique advantages of graph-based retrieval for personalization.', 'score': 16, 'issue_id': 1527, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '65e3736cfc1e3295', 'authors': ['Steven Au', 'Cameron J. Dimacali', 'Ojasmitha Pedirappagari', 'Namyong Park', 'Franck Dernoncourt', 'Yu Wang', 'Nikos Kanakaris', 'Hanieh Deilamsalehy', 'Ryan A. Rossi', 'Nesreen K. Ahmed'], 'affiliations': ['Adobe Research', 'Cisco AI Research', 'Meta AI', 'University of California Santa Cruz', 'University of Oregon', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02157.jpg', 'data': {'categories': ['#rag', '#optimization', '#graphs', '#multimodal', '#benchmark', '#games'], 'emoji': '🕸️', 'ru': {'title': 'Графы знаний на службе персонализации языковых моделей', 'desc': 'Статья представляет новый подход к персонализации ответов больших языковых моделей (LLM) под названием PGraphRAG. В отличие от существующих методов, полагающихся на историю пользователя, PGraphRAG использует ориентированные на пользователя графы знаний для обогащения контекста. Этот метод улучшает понимание контекста и качество генерируемых ответов, особенно в сценариях с ограниченными данными о пользователе. Экспериментальные результаты показывают, что PGraphRAG превосходит современные методы персонализации в различных задачах.'}, 'en': {'title': 'Revolutionizing Personalization with Graph-based Retrieval', 'desc': "This paper introduces a new framework called Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG) that enhances the personalization of large language models (LLMs). Unlike traditional methods that depend only on user history, PGraphRAG utilizes user-centric knowledge graphs to provide richer context for generating responses. By integrating structured user information into the retrieval process, it improves the model's understanding and the quality of its outputs, especially in situations where user data is limited. The authors also present a benchmark for evaluating personalized text generation, showing that PGraphRAG outperforms existing methods in various tasks."}, 'zh': {'title': '个性化图谱提升生成质量', 'desc': '随着大型语言模型的发展,它们在提供个性化和上下文感知的响应方面展现出巨大的潜力。现有的个性化方法通常仅依赖用户历史数据来增强提示,这在数据稀疏的冷启动场景中效果有限。为了解决这些问题,我们提出了个性化图谱检索增强生成(PGraphRAG)框架,利用以用户为中心的知识图谱来丰富个性化。实验结果表明,PGraphRAG在多种任务中显著优于现有的个性化方法,展示了基于图谱的检索在个性化中的独特优势。'}}}, {'id': 'https://huggingface.co/papers/2501.02497', 'title': 'Test-time Computing: from System-1 Thinking to System-2 Thinking', 'url': 'https://huggingface.co/papers/2501.02497', 'abstract': "The remarkable performance of the o1 model in complex reasoning demonstrates that test-time computing scaling can further unlock the model's potential, enabling powerful System-2 thinking. However, there is still a lack of comprehensive surveys for test-time computing scaling. We trace the concept of test-time computing back to System-1 models. In System-1 models, test-time computing addresses distribution shifts and improves robustness and generalization through parameter updating, input modification, representation editing, and output calibration. In System-2 models, it enhances the model's reasoning ability to solve complex problems through repeated sampling, self-correction, and tree search. 
We organize this survey according to the trend of System-1 to System-2 thinking, highlighting the key role of test-time computing in the transition from System-1 models to weak System-2 models, and then to strong System-2 models. We also point out a few possible future directions.", 'score': 15, 'issue_id': 1528, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '7d9414c60fe7701d', 'authors': ['Yixin Ji', 'Juntao Li', 'Hai Ye', 'Kaixin Wu', 'Jia Xu', 'Linjian Mo', 'Min Zhang'], 'affiliations': ['Ant Group', 'Department of Computer Science, National University of Singapore', 'School of Computer Science and Technology, Soochow University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02497.jpg', 'data': {'categories': ['#reasoning', '#math', '#survey', '#training'], 'emoji': '🧠', 'ru': {'title': 'Масштабирование вычислений: путь к мышлению System-2', 'desc': 'Эта статья рассматривает масштабирование вычислений во время тестирования для улучшения производительности моделей машинного обучения. Авторы прослеживают эволюцию этой концепции от моделей System-1 до моделей System-2. В работе описываются различные методы, такие как обновление параметров, модификация входных данных и древовидный поиск. Исследование подчеркивает ключевую роль вычислений во время тестирования в переходе от моделей System-1 к сильным моделям System-2.'}, 'en': {'title': 'Unlocking Model Potential: The Power of Test-Time Computing', 'desc': 'This paper explores the concept of test-time computing scaling and its impact on machine learning models, particularly in enhancing reasoning capabilities. It distinguishes between System-1 models, which focus on improving robustness and generalization through techniques like parameter updating and output calibration, and System-2 models, which utilize methods such as repeated sampling and self-correction for complex problem-solving. The authors trace the evolution from System-1 to System-2 thinking, emphasizing how test-time computing plays a crucial role in this transition. Additionally, the paper identifies potential future research directions in this area.'}, 'zh': {'title': '测试时计算:从系统-1到强系统-2的关键转变', 'desc': '这篇论文探讨了测试时计算扩展对机器学习模型的影响,特别是在复杂推理中的应用。作者指出,测试时计算可以通过参数更新、输入修改、表示编辑和输出校准来提高模型的鲁棒性和泛化能力。对于系统-2模型,测试时计算通过重复采样、自我修正和树搜索来增强模型的推理能力。论文还强调了测试时计算在从系统-1模型向弱系统-2模型再到强系统-2模型转变中的关键作用,并提出了一些未来的研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.02045', 'title': 'METAGENE-1: Metagenomic Foundation Model for Pandemic Monitoring', 'url': 'https://huggingface.co/papers/2501.02045', 'abstract': 'We pretrain METAGENE-1, a 7-billion-parameter autoregressive transformer model, which we refer to as a metagenomic foundation model, on a novel corpus of diverse metagenomic DNA and RNA sequences comprising over 1.5 trillion base pairs. This dataset is sourced from a large collection of human wastewater samples, processed and sequenced using deep metagenomic (next-generation) sequencing methods. Unlike genomic models that focus on individual genomes or curated sets of specific species, the aim of METAGENE-1 is to capture the full distribution of genomic information present within this wastewater, to aid in tasks relevant to pandemic monitoring and pathogen detection. We carry out byte-pair encoding (BPE) tokenization on our dataset, tailored for metagenomic sequences, and then pretrain our model. 
In this paper, we first detail the pretraining dataset, tokenization strategy, and model architecture, highlighting the considerations and design choices that enable the effective modeling of metagenomic data. We then show results of pretraining this model on our metagenomic dataset, providing details about our losses, system metrics, and training stability over the course of pretraining. Finally, we demonstrate the performance of METAGENE-1, which achieves state-of-the-art results on a set of genomic benchmarks and new evaluations focused on human-pathogen detection and genomic sequence embedding, showcasing its potential for public health applications in pandemic monitoring, biosurveillance, and early detection of emerging health threats.', 'score': 12, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '60a3568f555ed60f', 'authors': ['Ollie Liu', 'Sami Jaghouar', 'Johannes Hagemann', 'Shangshang Wang', 'Jason Wiemels', 'Jeff Kaufman', 'Willie Neiswanger'], 'affiliations': ['Nucleic Acid Observatory', 'Prime Intellect', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02045.jpg', 'data': {'categories': ['#benchmark', '#data', '#training', '#architecture', '#science', '#dataset', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'METAGENE-1: Метагеномная модель для мониторинга здоровья населения', 'desc': 'METAGENE-1 - это автореграссивная трансформерная модель с 7 миллиардами параметров, обученная на разнообразных метагеномных последовательностях ДНК и РНК. Модель создана для анализа геномной информации из образцов сточных вод с целью мониторинга пандемий и обнаружения патогенов. Авторы описывают процесс предобучения, включая токенизацию и архитектуру модели, а также демонстрируют результаты на различных геномных задачах. METAGENE-1 показывает высокую эффективность в обнаружении патогенов человека и встраивании геномных последовательностей, что открывает перспективы для применения в общественном здравоохранении.'}, 'en': {'title': 'Unlocking Metagenomics: METAGENE-1 for Pandemic Preparedness', 'desc': 'The paper introduces METAGENE-1, a large autoregressive transformer model designed for metagenomic data analysis. It is pretrained on a vast dataset of metagenomic DNA and RNA sequences derived from human wastewater, totaling over 1.5 trillion base pairs. The model aims to enhance pandemic monitoring and pathogen detection by capturing the diverse genomic information present in wastewater samples. The authors detail their tokenization strategy and model architecture, demonstrating that METAGENE-1 achieves state-of-the-art performance in genomic benchmarks and applications related to public health.'}, 'zh': {'title': 'METAGENE-1:元基因组基础模型助力公共卫生监测', 'desc': '我们预训练了METAGENE-1,这是一个拥有70亿参数的自回归变换器模型,称为元基因组基础模型。该模型在一个包含超过1.5万亿碱基对的多样化元基因组DNA和RNA序列的新数据集上进行训练,这些数据来自大量人类废水样本。METAGENE-1的目标是捕捉废水中存在的基因组信息的完整分布,以帮助进行疫情监测和病原体检测。我们展示了该模型在元基因组数据集上的预训练结果,证明其在公共卫生应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.02690', 'title': 'GS-DiT: Advancing Video Generation with Pseudo 4D Gaussian Fields through Efficient Dense 3D Point Tracking', 'url': 'https://huggingface.co/papers/2501.02690', 'abstract': '4D video control is essential in video generation as it enables the use of sophisticated lens techniques, such as multi-camera shooting and dolly zoom, which are currently unsupported by existing methods. 
Training a video Diffusion Transformer (DiT) directly to control 4D content requires expensive multi-view videos. Inspired by Monocular Dynamic novel View Synthesis (MDVS) that optimizes a 4D representation and renders videos according to different 4D elements, such as camera pose and object motion editing, we bring pseudo 4D Gaussian fields to video generation. Specifically, we propose a novel framework that constructs a pseudo 4D Gaussian field with dense 3D point tracking and renders the Gaussian field for all video frames. Then we finetune a pretrained DiT to generate videos following the guidance of the rendered video, dubbed as GS-DiT. To boost the training of the GS-DiT, we also propose an efficient Dense 3D Point Tracking (D3D-PT) method for the pseudo 4D Gaussian field construction. Our D3D-PT outperforms SpatialTracker, the state-of-the-art sparse 3D point tracking method, in accuracy and accelerates the inference speed by two orders of magnitude. During the inference stage, GS-DiT can generate videos with the same dynamic content while adhering to different camera parameters, addressing a significant limitation of current video generation models. GS-DiT demonstrates strong generalization capabilities and extends the 4D controllability of Gaussian splatting to video generation beyond just camera poses. It supports advanced cinematic effects through the manipulation of the Gaussian field and camera intrinsics, making it a powerful tool for creative video production. Demos are available at https://wkbian.github.io/Projects/GS-DiT/.', 'score': 11, 'issue_id': 1530, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'b4c147a2637166a8', 'authors': ['Weikang Bian', 'Zhaoyang Huang', 'Xiaoyu Shi', 'Yijin Li', 'Fu-Yun Wang', 'Hongsheng Li'], 'affiliations': ['Avolution AI', 'Centre for Perceptual and Interactive Intelligence', 'Multimedia Laboratory, The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.02690.jpg', 'data': {'categories': ['#video', '#games', '#diffusion', '#3d'], 'emoji': '🎥', 'ru': {'title': 'Революция в генерации видео: 4D-контроль с помощью гауссовых полей', 'desc': 'Эта статья представляет инновационный подход к генерации видео с 4D-контролем, используя псевдо-4D гауссовы поля и модель Diffusion Transformer (DiT). Авторы предлагают метод Dense 3D Point Tracking (D3D-PT) для эффективного построения гауссовых полей, превосходящий существующие решения по точности и скорости. Разработанная система GS-DiT позволяет генерировать видео с одинаковым динамическим содержанием, но с разными параметрами камеры, что открывает новые возможности для создания кинематографических эффектов. Метод демонстрирует сильные обобщающие способности и расширяет возможности 4D-контроля в генерации видео.'}, 'en': {'title': 'Revolutionizing Video Generation with 4D Control', 'desc': 'This paper introduces a new method for generating videos that can be controlled in four dimensions (4D), which includes both camera movement and object motion. The authors propose a framework called GS-DiT that utilizes pseudo 4D Gaussian fields to enhance video generation, allowing for advanced cinematic effects. They also present a Dense 3D Point Tracking (D3D-PT) technique that improves the accuracy and speed of tracking 3D points compared to existing methods. 
Overall, GS-DiT enables the creation of dynamic videos with flexible camera parameters, significantly advancing the capabilities of video generation models.'}, 'zh': {'title': '伪4D高斯场:视频生成的新突破', 'desc': '本论文提出了一种新颖的框架,利用伪4D高斯场进行视频生成,以支持复杂的镜头技术。我们通过密集的3D点跟踪构建伪4D高斯场,并为所有视频帧渲染该高斯场。为了提升GS-DiT的训练效果,我们还提出了一种高效的密集3D点跟踪方法,显著提高了准确性和推理速度。GS-DiT能够在不同的相机参数下生成具有相同动态内容的视频,扩展了视频生成的4D可控性,成为创意视频制作的强大工具。'}}}, {'id': 'https://huggingface.co/papers/2501.03059', 'title': 'Through-The-Mask: Mask-based Motion Trajectories for Image-to-Video Generation', 'url': 'https://huggingface.co/papers/2501.03059', 'abstract': "We consider the task of Image-to-Video (I2V) generation, which involves transforming static images into realistic video sequences based on a textual description. While recent advancements produce photorealistic outputs, they frequently struggle to create videos with accurate and consistent object motion, especially in multi-object scenarios. To address these limitations, we propose a two-stage compositional framework that decomposes I2V generation into: (i) An explicit intermediate representation generation stage, followed by (ii) A video generation stage that is conditioned on this representation. Our key innovation is the introduction of a mask-based motion trajectory as an intermediate representation, that captures both semantic object information and motion, enabling an expressive but compact representation of motion and semantics. To incorporate the learned representation in the second stage, we utilize object-level attention objectives. Specifically, we consider a spatial, per-object, masked-cross attention objective, integrating object-specific prompts into corresponding latent space regions and a masked spatio-temporal self-attention objective, ensuring frame-to-frame consistency for each object. We evaluate our method on challenging benchmarks with multi-object and high-motion scenarios and empirically demonstrate that the proposed method achieves state-of-the-art results in temporal coherence, motion realism, and text-prompt faithfulness. Additionally, we introduce \\benchmark, a new challenging benchmark for single-object and multi-object I2V generation, and demonstrate our method's superiority on this benchmark. Project page is available at https://guyyariv.github.io/TTM/.", 'score': 10, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '4f24667b663efb7d', 'authors': ['Guy Yariv', 'Yuval Kirstain', 'Amit Zohar', 'Shelly Sheynin', 'Yaniv Taigman', 'Yossi Adi', 'Sagie Benaim', 'Adam Polyak'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'The Hebrew University of Jerusalem'], 'pdf_title_img': 'assets/pdf/title_img/2501.03059.jpg', 'data': {'categories': ['#video', '#multimodal', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'Генерация реалистичных видео из статичных изображений с помощью масок траекторий движения', 'desc': 'Статья представляет новый подход к генерации видео из изображений (I2V) на основе текстового описания. Авторы предлагают двухэтапную композиционную модель, которая сначала генерирует промежуточное представление в виде маски траектории движения объектов. Затем это представление используется для генерации видео с применением объектно-ориентированных целевых функций внимания. 
Эксперименты показывают, что предложенный метод достигает лучших результатов по временной согласованности, реалистичности движения и соответствию текстовому описанию.'}, 'en': {'title': 'Transforming Images into Realistic Videos with Motion Precision', 'desc': 'This paper addresses the challenge of generating videos from static images using textual descriptions, known as Image-to-Video (I2V) generation. The authors propose a two-stage framework that first creates an intermediate representation to capture object semantics and motion, followed by a video generation stage that utilizes this representation. A key innovation is the use of a mask-based motion trajectory, which helps maintain accurate object motion and consistency across frames. The method is evaluated against challenging benchmarks and shows superior performance in terms of motion realism and coherence, while also introducing a new benchmark for I2V generation.'}, 'zh': {'title': '图像到视频生成的新突破', 'desc': '本文探讨了图像到视频(I2V)生成的任务,即根据文本描述将静态图像转换为逼真的视频序列。尽管近期的进展能够生成照片级真实感的输出,但在多物体场景中,视频的物体运动准确性和一致性仍然存在挑战。为了解决这些问题,我们提出了一种两阶段的组合框架,首先生成明确的中间表示,然后基于该表示生成视频。我们的创新在于引入了一种基于掩码的运动轨迹作为中间表示,能够捕捉语义物体信息和运动,从而实现运动和语义的紧凑而富有表现力的表示。'}}}, {'id': 'https://huggingface.co/papers/2501.03006', 'title': 'TransPixar: Advancing Text-to-Video Generation with Transparency', 'url': 'https://huggingface.co/papers/2501.03006', 'abstract': 'Text-to-video generative models have made significant strides, enabling diverse applications in entertainment, advertising, and education. However, generating RGBA video, which includes alpha channels for transparency, remains a challenge due to limited datasets and the difficulty of adapting existing models. Alpha channels are crucial for visual effects (VFX), allowing transparent elements like smoke and reflections to blend seamlessly into scenes. We introduce TransPixar, a method to extend pretrained video models for RGBA generation while retaining the original RGB capabilities. TransPixar leverages a diffusion transformer (DiT) architecture, incorporating alpha-specific tokens and using LoRA-based fine-tuning to jointly generate RGB and alpha channels with high consistency. By optimizing attention mechanisms, TransPixar preserves the strengths of the original RGB model and achieves strong alignment between RGB and alpha channels despite limited training data. Our approach effectively generates diverse and consistent RGBA videos, advancing the possibilities for VFX and interactive content creation.', 'score': 8, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'e85e5fa9a03d5d04', 'authors': ['Luozhou Wang', 'Yijun Li', 'Zhifei Chen', 'Jui-Hsien Wang', 'Zhifei Zhang', 'He Zhang', 'Zhe Lin', 'Yingcong Chen'], 'affiliations': ['Adobe Research', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.03006.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training', '#diffusion', '#video'], 'emoji': '🎬', 'ru': {'title': 'TransPixar: Прорыв в генерации RGBA-видео для визуальных эффектов', 'desc': 'TransPixar - это новый метод генерации RGBA-видео, расширяющий возможности предобученных видеомоделей. Он использует архитектуру диффузионного трансформера (DiT) и токены, специфичные для альфа-канала, для совместной генерации RGB и альфа-каналов с высокой согласованностью. Метод применяет тонкую настройку на основе LoRA и оптимизирует механизмы внимания для сохранения сильных сторон исходной RGB-модели. 
TransPixar эффективно генерирует разнообразные и согласованные RGBA-видео, открывая новые возможности для создания визуальных эффектов и интерактивного контента.'}, 'en': {'title': 'TransPixar: Bridging RGB and Alpha for Enhanced Video Generation', 'desc': 'This paper presents TransPixar, a novel method for generating RGBA videos, which include transparency information crucial for visual effects. The challenge lies in the limited datasets and the need to adapt existing models to handle alpha channels effectively. TransPixar utilizes a diffusion transformer architecture and incorporates alpha-specific tokens, allowing it to generate both RGB and alpha channels simultaneously. By optimizing attention mechanisms and employing LoRA-based fine-tuning, TransPixar achieves high consistency between RGB and alpha outputs, enhancing the quality of video generation for applications in VFX and interactive media.'}, 'zh': {'title': 'TransPixar:生成高质量RGBA视频的新方法', 'desc': '本文介绍了一种名为TransPixar的方法,旨在生成包含透明通道的RGBA视频。传统的视频生成模型在处理透明效果时面临挑战,TransPixar通过扩展预训练模型来解决这一问题。该方法利用扩散变换器架构,结合特定的透明通道标记,并通过LoRA微调实现RGB和透明通道的高一致性生成。最终,TransPixar在有限的数据集上优化了注意力机制,成功生成多样且一致的RGBA视频,推动了视觉特效和互动内容创作的可能性。'}}}, {'id': 'https://huggingface.co/papers/2501.01790', 'title': 'Ingredients: Blending Custom Photos with Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.01790', 'abstract': 'This paper presents a powerful framework to customize video creations by incorporating multiple specific identity (ID) photos, with video diffusion Transformers, referred to as Ingredients. Generally, our method consists of three primary modules: (i) a facial extractor that captures versatile and precise facial features for each human ID from both global and local perspectives; (ii) a multi-scale projector that maps face embeddings into the contextual space of image query in video diffusion transformers; (iii) an ID router that dynamically combines and allocates multiple ID embedding to the corresponding space-time regions. Leveraging a meticulously curated text-video dataset and a multi-stage training protocol, Ingredients demonstrates superior performance in turning custom photos into dynamic and personalized video content. Qualitative evaluations highlight the advantages of proposed method, positioning it as a significant advancement toward more effective generative video control tools in Transformer-based architecture, compared to existing methods. The data, code, and model weights are publicly available at: https://github.com/feizc/Ingredients.', 'score': 6, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'dd1ccebdd2fcf276', 'authors': ['Zhengcong Fei', 'Debang Li', 'Di Qiu', 'Changqian Yu', 'Mingyuan Fan'], 'affiliations': ['Kunlun Inc. Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01790.jpg', 'data': {'categories': ['#open_source', '#training', '#architecture', '#video', '#dataset', '#diffusion', '#multimodal'], 'emoji': '🎬', 'ru': {'title': 'Персонализированное видео из фотографий: новый уровень контроля в генеративных моделях', 'desc': 'Статья представляет новый метод под названием Ingredients для создания персонализированных видео с использованием нескольких фотографий конкретных людей. Метод состоит из трех основных модулей: экстрактора лицевых признаков, многомасштабного проектора и маршрутизатора идентификаторов. 
Ingredients использует тщательно подобранный набор данных текст-видео и многоэтапный протокол обучения для достижения превосходных результатов. Качественная оценка показывает преимущества предложенного метода по сравнению с существующими подходами в области генеративного контроля видео на основе архитектуры Transformer.'}, 'en': {'title': 'Transforming Photos into Personalized Videos with Ingredients', 'desc': 'This paper introduces a novel framework called Ingredients for creating personalized videos using multiple identity photos. It employs a facial extractor to accurately capture facial features, a multi-scale projector to integrate these features into video diffusion transformers, and an ID router to manage the allocation of identity embeddings across different time and space regions in the video. The framework is trained on a carefully selected text-video dataset, enhancing its ability to generate dynamic video content from custom images. The results show that Ingredients outperforms existing methods, marking a significant step forward in generative video control using Transformer architectures.'}, 'zh': {'title': '个性化视频创作的新突破', 'desc': '本文提出了一种强大的框架,通过结合多个特定身份照片,定制视频创作,称为Ingredients。该方法主要由三个模块组成:面部提取器、多个尺度投影器和身份路由器,分别用于提取面部特征、映射面部嵌入和动态分配身份嵌入。通过精心策划的文本-视频数据集和多阶段训练协议,Ingredients在将自定义照片转化为动态个性化视频内容方面表现出色。定性评估显示,该方法在基于Transformer的架构中,相较于现有方法,显著提升了生成视频控制工具的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.02576', 'title': 'DepthMaster: Taming Diffusion Models for Monocular Depth Estimation', 'url': 'https://huggingface.co/papers/2501.02576', 'abstract': "Monocular depth estimation within the diffusion-denoising paradigm demonstrates impressive generalization ability but suffers from low inference speed. Recent methods adopt a single-step deterministic paradigm to improve inference efficiency while maintaining comparable performance. However, they overlook the gap between generative and discriminative features, leading to suboptimal results. In this work, we propose DepthMaster, a single-step diffusion model designed to adapt generative features for the discriminative depth estimation task. First, to mitigate overfitting to texture details introduced by generative features, we propose a Feature Alignment module, which incorporates high-quality semantic features to enhance the denoising network's representation capability. Second, to address the lack of fine-grained details in the single-step deterministic framework, we propose a Fourier Enhancement module to adaptively balance low-frequency structure and high-frequency details. We adopt a two-stage training strategy to fully leverage the potential of the two modules. In the first stage, we focus on learning the global scene structure with the Feature Alignment module, while in the second stage, we exploit the Fourier Enhancement module to improve the visual quality. Through these efforts, our model achieves state-of-the-art performance in terms of generalization and detail preservation, outperforming other diffusion-based methods across various datasets. 
Our project page can be found at https://indu1ge.github.io/DepthMaster_page.", 'score': 5, 'issue_id': 1536, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'a8429b95ef4eb7b7', 'authors': ['Ziyang Song', 'Zerong Wang', 'Bo Li', 'Hao Zhang', 'Ruijie Zhu', 'Li Liu', 'Peng-Tao Jiang', 'Tianzhu Zhang'], 'affiliations': ['School of Information Science and Technology, University of Science and Technology of China (USTC), Hefei 230026, P.R.China', 'vivo Mobile Communication Co., Ltd., Hangzhou 310030, P.R.China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02576.jpg', 'data': {'categories': ['#optimization', '#training', '#diffusion', '#cv', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'DepthMaster: Однопроходная диффузионная модель для точной оценки глубины с улучшенной генерализацией', 'desc': 'DepthMaster - это однопроходная диффузионная модель для монокулярной оценки глубины. Она использует модуль выравнивания признаков для улучшения представления семантических особенностей и модуль улучшения Фурье для балансировки низкочастотной структуры и высокочастотных деталей. Модель обучается в два этапа: сначала фокусируется на глобальной структуре сцены, затем улучшает визуальное качество. DepthMaster превосходит другие диффузионные методы по обобщающей способности и сохранению деталей на различных наборах данных.'}, 'en': {'title': 'DepthMaster: Bridging Generative and Discriminative Depth Estimation', 'desc': 'This paper introduces DepthMaster, a single-step diffusion model aimed at improving monocular depth estimation. It addresses the inefficiencies of previous methods by integrating a Feature Alignment module to enhance the representation of semantic features and reduce overfitting to textures. Additionally, a Fourier Enhancement module is proposed to balance low-frequency structures with high-frequency details, ensuring finer depth estimation. The two-stage training strategy allows the model to first learn global scene structures and then refine visual quality, resulting in state-of-the-art performance across various datasets.'}, 'zh': {'title': 'DepthMaster:提升深度估计的单步扩散模型', 'desc': '本文提出了一种名为DepthMaster的单步扩散模型,用于单目深度估计。该模型通过特征对齐模块和傅里叶增强模块,优化生成特征以适应判别性深度估计任务。特征对齐模块增强了去噪网络的表示能力,而傅里叶增强模块则平衡了低频结构和高频细节。通过两阶段训练策略,DepthMaster在泛化能力和细节保留方面达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.01830', 'title': 'Auto-RT: Automatic Jailbreak Strategy Exploration for Red-Teaming Large Language Models', 'url': 'https://huggingface.co/papers/2501.01830', 'abstract': 'Automated red-teaming has become a crucial approach for uncovering vulnerabilities in large language models (LLMs). However, most existing methods focus on isolated safety flaws, limiting their ability to adapt to dynamic defenses and uncover complex vulnerabilities efficiently. To address this challenge, we propose Auto-RT, a reinforcement learning framework that automatically explores and optimizes complex attack strategies to effectively uncover security vulnerabilities through malicious queries. Specifically, we introduce two key mechanisms to reduce exploration complexity and improve strategy optimization: 1) Early-terminated Exploration, which accelerates exploration by focusing on high-potential attack strategies; and 2) Progressive Reward Tracking algorithm with intermediate downgrade models, which dynamically refine the search trajectory toward successful vulnerability exploitation. 
Extensive experiments across diverse LLMs demonstrate that, by significantly improving exploration efficiency and automatically optimizing attack strategies, Auto-RT detects a broader range of vulnerabilities, achieving a faster detection speed and 16.63\\% higher success rates compared to existing methods.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '5b08b81c52ec8da8', 'authors': ['Yanjiang Liu', 'Shuhen Zhou', 'Yaojie Lu', 'Huijia Zhu', 'Weiqiang Wang', 'Hongyu Lin', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Ant Group', 'Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences, Beijing, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01830.jpg', 'data': {'categories': ['#security', '#rl', '#rlhf'], 'emoji': '🛡️', 'ru': {'title': 'Auto-RT: Умная защита больших языковых моделей', 'desc': 'Авторы представляют Auto-RT - фреймворк на основе обучения с подкреплением для автоматизированного поиска уязвимостей в больших языковых моделях (LLM). Система использует механизмы раннего прекращения исследования и прогрессивного отслеживания наград для оптимизации стратегий атак. Auto-RT превосходит существующие методы, обнаруживая более широкий спектр уязвимостей с большей скоростью и на 16.63% более высоким уровнем успеха. Этот подход позволяет эффективно выявлять сложные уязвимости в LLM через вредоносные запросы.'}, 'en': {'title': 'Auto-RT: Revolutionizing Vulnerability Detection in LLMs', 'desc': 'This paper presents Auto-RT, a reinforcement learning framework designed to enhance automated red-teaming for large language models (LLMs). Unlike traditional methods that target isolated safety flaws, Auto-RT efficiently uncovers complex vulnerabilities by optimizing attack strategies through malicious queries. It introduces two innovative mechanisms: Early-terminated Exploration to prioritize promising attack strategies, and Progressive Reward Tracking to refine the search process dynamically. Experimental results show that Auto-RT significantly improves exploration efficiency and detection success rates, outperforming existing approaches.'}, 'zh': {'title': '自动化红队:高效发现语言模型漏洞的利器', 'desc': '自动化红队技术在发现大型语言模型(LLMs)中的漏洞方面变得至关重要。现有方法大多集中于孤立的安全缺陷,限制了其适应动态防御和高效发现复杂漏洞的能力。为了解决这个问题,我们提出了Auto-RT,一个强化学习框架,能够自动探索和优化复杂的攻击策略,通过恶意查询有效发现安全漏洞。我们的实验表明,Auto-RT显著提高了探索效率和攻击策略的自动优化,检测到更广泛的漏洞,检测速度更快,成功率提高了16.63%。'}}}, {'id': 'https://huggingface.co/papers/2501.02506', 'title': 'ToolHop: A Query-Driven Benchmark for Evaluating Large Language Models in Multi-Hop Tool Use', 'url': 'https://huggingface.co/papers/2501.02506', 'abstract': 'Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of multi-hop tool use. ToolHop ensures diverse queries, meaningful interdependencies, locally executable tools, detailed feedback, and verifiable answers through a novel query-driven data construction approach that includes tool creation, document refinement, and code generation. 
We evaluate 14 LLMs across five model families (i.e., LLaMA3.1, Qwen2.5, Gemini1.5, Claude3.5, and GPT), uncovering significant challenges in handling multi-hop tool-use scenarios. The leading model, GPT-4o, achieves an accuracy of 49.04%, underscoring substantial room for improvement. Further analysis reveals variations in tool-use strategies for various families, offering actionable insights to guide the development of more effective approaches. Code and data can be found in https://huggingface.co/bytedance-research/ToolHop.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'f785173226e5f9fc', 'authors': ['Junjie Ye', 'Zhengyin Du', 'Xuesong Yao', 'Weijian Lin', 'Yufei Xu', 'Zehui Chen', 'Zaiyuan Wang', 'Sining Zhu', 'Zhiheng Xi', 'Siyu Yuan', 'Tao Gui', 'Qi Zhang', 'Xuanjing Huang', 'Jiechao Chen'], 'affiliations': ['ByteDance', 'Institute of Modern Languages and Linguistics, Fudan University', 'School of Computer Science, Fudan University', 'School of Data Science, Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02506.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#dataset', '#optimization'], 'emoji': '🛠️', 'ru': {'title': 'ToolHop: новый стандарт для оценки многоэтапного использования инструментов в LLM', 'desc': 'Статья представляет новый набор данных ToolHop для оценки многоэтапного использования инструментов большими языковыми моделями (LLM). ToolHop содержит 995 пользовательских запросов и 3912 связанных инструментов, обеспечивая разнообразие запросов, взаимозависимости и возможность локального выполнения. Авторы оценили 14 LLM из пяти семейств моделей, выявив значительные трудности в обработке сценариев многоэтапного использования инструментов. Лучшая модель, GPT-4o, достигла точности 49.04%, что указывает на большой потенциал для улучшения.'}, 'en': {'title': 'ToolHop: Advancing Multi-Hop Tool Use Evaluation for LLMs', 'desc': 'This paper introduces ToolHop, a new dataset designed to evaluate how well large language models (LLMs) can use multiple tools in a single task. It includes 995 user queries and 3,912 tools, focusing on diverse and interdependent queries that can be executed locally. The authors tested 14 different LLMs, revealing that even the best-performing model, GPT-4o, only achieved 49.04% accuracy, indicating significant challenges in multi-hop tool use. The findings highlight different strategies employed by various model families, providing insights for future improvements in LLM capabilities.'}, 'zh': {'title': 'ToolHop:多跳工具使用的有效评估数据集', 'desc': '本文介绍了ToolHop数据集,该数据集包含995个用户查询和3912个相关工具,旨在有效评估大型语言模型(LLMs)在多跳工具使用中的理解、推理和功能调用能力。通过新颖的查询驱动数据构建方法,ToolHop确保了查询的多样性、工具的局部可执行性和可验证的答案。我们对14个不同模型(如LLaMA3.1、Qwen2.5等)进行了评估,发现它们在处理多跳工具使用场景时面临显著挑战。尽管GPT-4o模型的准确率为49.04%,但仍有很大的改进空间,分析还揭示了不同模型家族在工具使用策略上的差异,为未来的研究提供了有价值的见解。'}}}, {'id': 'https://huggingface.co/papers/2501.02423', 'title': 'Scaling Laws for Floating Point Quantization Training', 'url': 'https://huggingface.co/papers/2501.02423', 'abstract': 'Low-precision training is considered an effective strategy for reducing both training and downstream inference costs. Previous scaling laws for precision mainly focus on integer quantization, which pay less attention to the constituents in floating-point quantization and thus cannot well fit the LLM losses in this scenario. 
In contrast, while floating-point quantization training is more commonly implemented in production, the research on it has been relatively superficial. In this paper, we thoroughly explore the effects of floating-point quantization targets, exponent bits, mantissa bits, and the calculation granularity of the scaling factor in floating-point quantization training performance of LLM models. While presenting an accurate floating-point quantization unified scaling law, we also provide valuable suggestions for the community: (1) Exponent bits contribute slightly more to the model performance than mantissa bits. We provide the optimal exponent-mantissa bit ratio for different bit numbers, which is available for future reference by hardware manufacturers; (2) We discover the formation of the critical data size in low-precision LLM training. Too much training data exceeding the critical data size will inversely bring in degradation of LLM performance; (3) The optimal floating-point quantization precision is directly proportional to the computational power, but within a wide computational power range, we estimate that the best cost-performance precision lies between 4-8 bits.', 'score': 4, 'issue_id': 1537, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'be6872257cb9a129', 'authors': ['Xingwu Sun', 'Shuaipeng Li', 'Ruobing Xie', 'Weidong Han', 'Kan Wu', 'Zhen Yang', 'Yixing Li', 'An Wang', 'Shuai Li', 'Jinbao Xue', 'Yu Cheng', 'Yangyu Tao', 'Zhanhui Kang', 'Chengzhong Xu', 'Di Wang', 'Jie Jiang'], 'affiliations': ['Tencent Hunyuan', 'The Chinese University of Hong Kong', 'Tokyo Institute of Technology', 'University of Macau'], 'pdf_title_img': 'assets/pdf/title_img/2501.02423.jpg', 'data': {'categories': ['#training', '#optimization', '#inference'], 'emoji': '🧮', 'ru': {'title': 'Оптимизация точности вычислений в обучении языковых моделей', 'desc': 'Статья исследует влияние квантования с плавающей запятой на обучение больших языковых моделей (LLM). Авторы анализируют роль экспоненциальных и мантиссных битов, а также размера обучающих данных в производительности моделей. Они представляют унифицированный закон масштабирования для квантования с плавающей запятой и дают рекомендации по оптимальному соотношению битов и размеру данных. Исследование показывает, что оптимальная точность квантования находится в диапазоне 4-8 бит для широкого спектра вычислительных мощностей.'}, 'en': {'title': 'Optimizing Floating-Point Quantization for Better LLM Performance', 'desc': 'This paper investigates the impact of floating-point quantization on the training performance of large language models (LLMs). It highlights that previous research primarily focused on integer quantization, neglecting the nuances of floating-point quantization. The authors establish a unified scaling law for floating-point quantization and provide insights on the optimal ratio of exponent to mantissa bits, emphasizing that exponent bits have a greater influence on model performance. 
Additionally, they identify a critical data size threshold, beyond which performance may degrade, and suggest that the best precision for cost-performance lies between 4-8 bits, depending on computational power.'}, 'zh': {'title': '低精度训练:优化浮点量化的关键', 'desc': '低精度训练被认为是降低训练和推理成本的有效策略。以往的研究主要集中在整数量化上,而对浮点量化的研究相对较少,导致无法很好地适应大语言模型的损失情况。本文深入探讨了浮点量化训练中目标、指数位、尾数位和缩放因子的计算粒度对大语言模型性能的影响,并提出了统一的浮点量化缩放法则。研究结果表明,指数位对模型性能的贡献略高于尾数位,并发现了低精度训练中的关键数据大小。'}}}, {'id': 'https://huggingface.co/papers/2501.02832', 'title': 'Samba-asr state-of-the-art speech recognition leveraging structured state-space models', 'url': 'https://huggingface.co/papers/2501.02832', 'abstract': 'We propose Samba ASR, the first state-of-the-art Automatic Speech Recognition (ASR) model leveraging the novel Mamba architecture as both encoder and decoder, built on the foundation of state-space models (SSMs). Unlike transformer-based ASR models, which rely on self-attention mechanisms to capture dependencies, Samba ASR effectively models both local and global temporal dependencies using efficient state-space dynamics, achieving remarkable performance gains. By addressing the limitations of transformers, such as quadratic scaling with input length and difficulty in handling long-range dependencies, Samba ASR achieves superior accuracy and efficiency. Experimental results demonstrate that Samba ASR surpasses existing open-source transformer-based ASR models across various standard benchmarks, establishing it as the new state of the art in ASR. Extensive evaluations on benchmark datasets show significant improvements in Word Error Rate (WER), with competitive performance even in low-resource scenarios. Furthermore, the computational efficiency and parameter optimization of the Mamba architecture make Samba ASR a scalable and robust solution for diverse ASR tasks. Our contributions include: A new Samba ASR architecture demonstrating the superiority of SSMs over transformer-based models for speech sequence processing. A comprehensive evaluation on public benchmarks showcasing state-of-the-art performance. An analysis of computational efficiency, robustness to noise, and sequence generalization. This work highlights the viability of Mamba SSMs as a transformer-free alternative for efficient and accurate ASR. By leveraging state-space modeling advancements, Samba ASR sets a new benchmark for ASR performance and future research.', 'score': 4, 'issue_id': 1530, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'ed3c4a6192d0c5f9', 'authors': ['Syed Abdul Gaffar Shakhadri', 'Kruthika KR', 'Kartik Basavaraj Angadi'], 'affiliations': ['SandLogic Technologies Pvt Ltd'], 'pdf_title_img': 'assets/pdf/title_img/2501.02832.jpg', 'data': {'categories': ['#audio', '#architecture', '#benchmark', '#low_resource', '#open_source'], 'emoji': '🎙️', 'ru': {'title': 'Samba ASR: революция в распознавании речи с помощью моделей пространства состояний', 'desc': 'Представлена модель Samba ASR - первая современная система автоматического распознавания речи, использующая архитектуру Mamba в качестве энкодера и декодера на основе моделей пространства состояний (SSM). В отличие от трансформерных моделей, Samba ASR эффективно моделирует локальные и глобальные временные зависимости, достигая значительных улучшений производительности. Экспериментальные результаты показывают, что Samba ASR превосходит существующие модели с открытым исходным кодом на основе трансформеров по различным стандартным показателям. 
Модель демонстрирует значительное снижение показателя Word Error Rate (WER) и высокую эффективность даже при ограниченных ресурсах.'}, 'en': {'title': 'Samba ASR: Redefining Speech Recognition with State-Space Models', 'desc': 'Samba ASR is a groundbreaking Automatic Speech Recognition model that utilizes the innovative Mamba architecture, which functions as both the encoder and decoder. This model departs from traditional transformer-based approaches by employing state-space models (SSMs) to effectively capture both local and global temporal dependencies, leading to enhanced performance. By overcoming the challenges associated with transformers, such as their inefficiency with long input sequences, Samba ASR achieves superior accuracy and efficiency in recognizing speech. Extensive testing shows that Samba ASR not only outperforms existing transformer-based models but also excels in low-resource environments, making it a robust solution for various ASR applications.'}, 'zh': {'title': 'Samba ASR:超越变换器的语音识别新标杆', 'desc': '我们提出了Samba ASR,这是第一个利用新型Mamba架构作为编码器和解码器的最先进自动语音识别(ASR)模型。与基于变换器的ASR模型不同,Samba ASR通过高效的状态空间动态建模局部和全局时间依赖关系,从而实现显著的性能提升。该模型克服了变换器在处理长距离依赖和输入长度的平方扩展等方面的局限性,展现出更高的准确性和效率。实验结果表明,Samba ASR在多个标准基准测试中超越了现有的开源变换器ASR模型,确立了其在ASR领域的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2501.00912', 'title': 'AutoPresent: Designing Structured Visuals from Scratch', 'url': 'https://huggingface.co/papers/2501.00912', 'abstract': "Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation with 7k training and 585 testing examples derived from 310 slide decks across 10 domains. SlidesBench supports evaluations that are (i)reference-based to measure similarity to a target slide, and (ii)reference-free to measure the design quality of generated slides alone. We benchmark end-to-end image generation and program generation methods with a variety of models, and find that programmatic methods produce higher-quality slides in user-interactable formats. Built on the success of program generation, we create AutoPresent, an 8B Llama-based model trained on 7k pairs of instructions paired with code for slide generation, and achieve results comparable to the closed-source model GPT-4o. We further explore iterative design refinement where the model is tasked to self-refine its own output, and we found that this process improves the slide's quality. 
We hope that our work will provide a basis for future work on generating structured visuals.", 'score': 3, 'issue_id': 1539, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'ea7b88fcc0a2025b', 'authors': ['Jiaxin Ge', 'Zora Zhiruo Wang', 'Xuhui Zhou', 'Yi-Hao Peng', 'Sanjay Subramanian', 'Qinyue Tan', 'Maarten Sap', 'Alane Suhr', 'Daniel Fried', 'Graham Neubig', 'Trevor Darrell'], 'affiliations': ['Carnegie Mellon University', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.00912.jpg', 'data': {'categories': ['#dataset', '#story_generation', '#training', '#benchmark', '#multimodal'], 'emoji': '🎯', 'ru': {'title': 'Автоматизация создания презентаций: от текста к структурированным визуальным материалам', 'desc': 'Эта статья представляет новый бенчмарк SlidesBench для автоматической генерации слайдов презентаций на основе текстовых инструкций. Авторы сравнивают методы генерации изображений и программного кода, обнаружив преимущество последнего подхода. Они создают модель AutoPresent на базе Llama для генерации кода слайдов, достигающую результатов, сопоставимых с GPT-4. Исследователи также изучают итеративное улучшение дизайна слайдов с помощью самооптимизации модели.'}, 'en': {'title': 'Automating Slide Generation with Advanced Models', 'desc': 'This paper addresses the challenge of creating automated slide presentations from natural language instructions. It introduces the SlidesBench benchmark, which includes a large dataset for training and testing slide generation models. The authors evaluate various methods, finding that programmatic approaches yield higher-quality slides. They also present AutoPresent, a model that competes with advanced models like GPT-4o, and demonstrate that iterative design refinement enhances the quality of generated slides.'}, 'zh': {'title': '自动生成高质量演示幻灯片的未来', 'desc': '本研究旨在自动生成演示幻灯片,解决内容创作和视觉规划的挑战。我们首次引入SlidesBench基准,包含7000个训练样本和585个测试样本,涵盖10个领域的310个幻灯片集。通过对比不同模型的图像生成和程序生成方法,我们发现程序生成方法在用户交互格式中生成的幻灯片质量更高。基于程序生成的成功,我们开发了AutoPresent模型,并通过自我优化过程进一步提升幻灯片的质量。'}}}, {'id': 'https://huggingface.co/papers/2501.03225', 'title': 'Automated Generation of Challenging Multiple-Choice Questions for Vision Language Model Evaluation', 'url': 'https://huggingface.co/papers/2501.03225', 'abstract': 'The rapid development of vision language models (VLMs) demands rigorous and reliable evaluation. However, current visual question answering (VQA) benchmarks often depend on open-ended questions, making accurate evaluation difficult due to the variability in natural language responses. To address this, we introduce AutoConverter, an agentic framework that automatically converts these open-ended questions into multiple-choice format, enabling objective evaluation while reducing the costly question creation process. Our experiments demonstrate that AutoConverter can generate correct and challenging multiple-choice questions, with VLMs demonstrating consistently similar or lower accuracy on these questions compared to human-created ones. Using AutoConverter, we construct VMCBench, a benchmark created by transforming 20 existing VQA datasets into a unified multiple-choice format, totaling 9,018 questions. 
We comprehensively evaluate 33 state-of-the-art VLMs on VMCBench, setting a new standard for scalable, consistent, and reproducible VLM evaluation.', 'score': 1, 'issue_id': 1542, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'aa212f5e596ed0e6', 'authors': ['Yuhui Zhang', 'Yuchang Su', 'Yiming Liu', 'Xiaohan Wang', 'James Burgess', 'Elaine Sui', 'Chenyu Wang', 'Josiah Aklilu', 'Alejandro Lozano', 'Anjiang Wei', 'Ludwig Schmidt', 'Serena Yeung-Levy'], 'affiliations': ['MIT', 'Stanford University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03225.jpg', 'data': {'categories': ['#interpretability', '#agents', '#benchmark', '#cv', '#survey', '#games', '#optimization'], 'emoji': '🔄', 'ru': {'title': 'Автоматизация оценки моделей машинного зрения и языка', 'desc': 'Исследователи представили AutoConverter - агентную систему для автоматического преобразования открытых вопросов в вопросы с множественным выбором для оценки моделей машинного зрения и языка (VLM). Эта система позволяет объективно оценивать VLM, избегая сложностей, связанных с вариативностью естественно-языковых ответов. На основе AutoConverter был создан бенчмарк VMCBench, включающий 9018 вопросов из 20 существующих наборов данных для визуальных вопросов и ответов (VQA). VMCBench был использован для всесторонней оценки 33 современных VLM, устанавливая новый стандарт масштабируемой и воспроизводимой оценки таких моделей.'}, 'en': {'title': 'Transforming VQA for Objective Evaluation with AutoConverter', 'desc': 'This paper presents AutoConverter, a framework designed to improve the evaluation of vision language models (VLMs) by converting open-ended visual question answering (VQA) questions into a multiple-choice format. This transformation allows for more objective assessments of VLM performance, addressing the challenges posed by the variability of natural language responses. The authors demonstrate that VLMs perform similarly or worse on these newly generated questions compared to those created by humans, indicating the rigor of the new benchmark. Additionally, they introduce VMCBench, a comprehensive dataset that standardizes 20 existing VQA datasets into a unified multiple-choice format, facilitating scalable and reproducible evaluations of VLMs.'}, 'zh': {'title': '自动化评估视觉语言模型的新标准', 'desc': '随着视觉语言模型(VLMs)的快速发展,评估这些模型的准确性变得尤为重要。现有的视觉问答(VQA)基准往往依赖开放式问题,这使得评估变得困难,因为自然语言回答的多样性很大。为了解决这个问题,我们提出了AutoConverter,这是一种自动将开放式问题转换为多项选择格式的框架,从而实现客观评估并减少问题创建的成本。通过使用AutoConverter,我们构建了VMCBench,这是一个将20个现有VQA数据集转化为统一多项选择格式的基准,包含9,018个问题,全面评估了33个最先进的VLMs,设定了可扩展、一致和可重复的VLM评估新标准。'}}}, {'id': 'https://huggingface.co/papers/2501.05874', 'title': 'VideoRAG: Retrieval-Augmented Generation over Video Corpus', 'url': 'https://huggingface.co/papers/2501.05874', 'abstract': 'Retrieval-Augmented Generation (RAG) is a powerful strategy to address the issue of generating factually incorrect outputs in foundation models by retrieving external knowledge relevant to queries and incorporating it into their generation process. However, existing RAG approaches have primarily focused on textual information, with some recent advancements beginning to consider images, and they largely overlook videos, a rich source of multimodal knowledge capable of representing events, processes, and contextual details more effectively than any other modality. 
While a few recent studies explore the integration of videos in the response generation process, they either predefine query-associated videos without retrieving them according to queries, or convert videos into the textual descriptions without harnessing their multimodal richness. To tackle these, we introduce VideoRAG, a novel framework that not only dynamically retrieves relevant videos based on their relevance with queries but also utilizes both visual and textual information of videos in the output generation. Further, to operationalize this, our method revolves around the recent advance of Large Video Language Models (LVLMs), which enable the direct processing of video content to represent it for retrieval and seamless integration of the retrieved videos jointly with queries. We experimentally validate the effectiveness of VideoRAG, showcasing that it is superior to relevant baselines.', 'score': 39, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'a6a86d4d49a42b4d', 'authors': ['Soyeong Jeong', 'Kangsan Kim', 'Jinheon Baek', 'Sung Ju Hwang'], 'affiliations': ['DeepAuto.ai', 'KAIST'], 'pdf_title_img': 'assets/pdf/title_img/2501.05874.jpg', 'data': {'categories': ['#multimodal', '#rag', '#interpretability', '#hallucinations', '#video'], 'emoji': '🎥', 'ru': {'title': 'VideoRAG: Обогащение генерации ответов с помощью видеоконтента', 'desc': 'VideoRAG - это новая система для улучшения генерации ответов с использованием видеоконтента. В отличие от существующих подходов, она динамически извлекает релевантные видео и использует как визуальную, так и текстовую информацию из них. VideoRAG основан на Больших Видеоязыковых Моделях (LVLM), которые позволяют напрямую обрабатывать видеоконтент. Экспериментальные результаты показывают превосходство VideoRAG над существующими методами.'}, 'en': {'title': 'Enhancing Generation with Dynamic Video Retrieval', 'desc': "This paper presents VideoRAG, a new framework that enhances the Retrieval-Augmented Generation (RAG) approach by incorporating video content into the generation process. Unlike previous methods that primarily focused on text or predefined videos, VideoRAG dynamically retrieves relevant videos based on the user's query. It leverages both visual and textual information from the videos, allowing for a richer and more accurate output generation. The framework utilizes Large Video Language Models (LVLMs) to effectively process and integrate video content, demonstrating superior performance compared to existing methods."}, 'zh': {'title': '视频检索增强生成:提升多模态知识的利用', 'desc': '检索增强生成(RAG)是一种强大的策略,用于解决基础模型生成事实不准确输出的问题。现有的RAG方法主要集中在文本信息上,最近的一些进展开始考虑图像,但大多数忽视了视频这一丰富的多模态知识源。我们提出了VideoRAG框架,它不仅根据查询动态检索相关视频,还利用视频的视觉和文本信息进行输出生成。实验结果验证了VideoRAG的有效性,显示其优于相关基线。'}}}, {'id': 'https://huggingface.co/papers/2501.03841', 'title': 'OmniManip: Towards General Robotic Manipulation via Object-Centric Interaction Primitives as Spatial Constraints', 'url': 'https://huggingface.co/papers/2501.03841', 'abstract': "The development of general robotic systems capable of manipulating in unstructured environments is a significant challenge. While Vision-Language Models(VLM) excel in high-level commonsense reasoning, they lack the fine-grained 3D spatial understanding required for precise manipulation tasks. Fine-tuning VLM on robotic datasets to create Vision-Language-Action Models(VLA) is a potential solution, but it is hindered by high data collection costs and generalization issues. 
To address these challenges, we propose a novel object-centric representation that bridges the gap between VLM's high-level reasoning and the low-level precision required for manipulation. Our key insight is that an object's canonical space, defined by its functional affordances, provides a structured and semantically meaningful way to describe interaction primitives, such as points and directions. These primitives act as a bridge, translating VLM's commonsense reasoning into actionable 3D spatial constraints. In this context, we introduce a dual closed-loop, open-vocabulary robotic manipulation system: one loop for high-level planning through primitive resampling, interaction rendering and VLM checking, and another for low-level execution via 6D pose tracking. This design ensures robust, real-time control without requiring VLM fine-tuning. Extensive experiments demonstrate strong zero-shot generalization across diverse robotic manipulation tasks, highlighting the potential of this approach for automating large-scale simulation data generation.", 'score': 37, 'issue_id': 1628, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c2dc8cc20b9b990a', 'authors': ['Mingjie Pan', 'Jiyao Zhang', 'Tianshu Wu', 'Yinghao Zhao', 'Wenlong Gao', 'Hao Dong'], 'affiliations': ['AgiBot', 'CFCS, School of CS, Peking University', 'PKU-AgiBot Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.03841.jpg', 'data': {'categories': ['#agents', '#reasoning', '#robotics', '#3d', '#transfer_learning', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Объектно-ориентированный подход к роботизированной манипуляции с использованием VLM', 'desc': 'Статья представляет новый подход к робототехнике, объединяющий возможности моделей визуального языка (VLM) с точным 3D-пониманием, необходимым для манипуляций. Авторы предлагают объектно-ориентированное представление, использующее каноническое пространство объекта для описания примитивов взаимодействия. Система включает два цикла: планирование высокого уровня с использованием VLM и низкоуровневое выполнение с отслеживанием 6D-позы. Эксперименты показывают сильную обобщающую способность в различных задачах робототехнической манипуляции.'}, 'en': {'title': 'Bridging High-Level Reasoning and Low-Level Manipulation in Robotics', 'desc': "This paper addresses the challenge of enabling robots to manipulate objects in unpredictable environments by enhancing Vision-Language Models (VLM) with a new approach. The authors propose a Vision-Language-Action Model (VLA) that utilizes an object-centric representation, focusing on an object's canonical space defined by its functional affordances. This representation helps translate high-level reasoning from VLM into specific 3D spatial actions needed for manipulation tasks. 
The proposed dual closed-loop system allows for effective planning and execution without the need for extensive fine-tuning, demonstrating strong performance in various robotic tasks."}, 'zh': {'title': '打破高层推理与低层操作的壁垒', 'desc': '本论文探讨了在非结构化环境中操作的通用机器人系统的开发挑战。虽然视觉-语言模型(VLM)在高层次的常识推理方面表现出色,但缺乏精细的三维空间理解能力。我们提出了一种新颖的以对象为中心的表示方法,旨在弥合VLM的高层推理与操作所需的低层精度之间的差距。通过引入双闭环、开放词汇的机器人操作系统,我们实现了高效的实时控制,且无需对VLM进行微调。'}}}, {'id': 'https://huggingface.co/papers/2501.06186', 'title': 'LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs', 'url': 'https://huggingface.co/papers/2501.06186', 'abstract': "Reasoning is a fundamental capability for solving complex multi-step problems, particularly in visual contexts where sequential step-wise understanding is essential. Existing approaches lack a comprehensive framework for evaluating visual reasoning and do not emphasize step-wise problem-solving. To this end, we propose a comprehensive framework for advancing step-by-step visual reasoning in large language models (LMMs) through three key contributions. First, we introduce a visual reasoning benchmark specifically designed to evaluate multi-step reasoning tasks. The benchmark presents a diverse set of challenges with eight different categories ranging from complex visual perception to scientific reasoning with over 4k reasoning steps in total, enabling robust evaluation of LLMs' abilities to perform accurate and interpretable visual reasoning across multiple steps. Second, we propose a novel metric that assesses visual reasoning quality at the granularity of individual steps, emphasizing both correctness and logical coherence. The proposed metric offers deeper insights into reasoning performance compared to traditional end-task accuracy metrics. Third, we present a new multimodal visual reasoning model, named LlamaV-o1, trained using a multi-step curriculum learning approach, where tasks are progressively organized to facilitate incremental skill acquisition and problem-solving. The proposed LlamaV-o1 is designed for multi-step reasoning and learns step-by-step through a structured training paradigm. Extensive experiments show that our LlamaV-o1 outperforms existing open-source models and performs favorably against close-source proprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an average score of 67.3 with an absolute gain of 3.8\\% across six benchmarks while being 5 times faster during inference scaling. Our benchmark, model, and code are publicly available.", 'score': 31, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '40e1a0d2c562cda5', 'authors': ['Omkar Thawakar', 'Dinura Dissanayake', 'Ketan More', 'Ritesh Thawkar', 'Ahmed Heakl', 'Noor Ahsan', 'Yuhao Li', 'Mohammed Zumri', 'Jean Lahoud', 'Rao Muhammad Anwer', 'Hisham Cholakkal', 'Ivan Laptev', 'Mubarak Shah', 'Fahad Shahbaz Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linköping University', 'Mohamed bin Zayed University of AI', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2501.06186.jpg', 'data': {'categories': ['#cv', '#benchmark', '#training', '#multimodal', '#open_source', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к совершенному визуальному рассуждению', 'desc': 'Статья представляет комплексный подход к улучшению пошагового визуального рассуждения в больших языковых моделях (LLM). 
Авторы вводят новый бенчмарк для оценки многошаговых задач визуального рассуждения и метрику для оценки качества рассуждения на уровне отдельных шагов. Они также предлагают новую мультимодальную модель визуального рассуждения LlamaV-o1, обученную с использованием подхода многоступенчатого куррикулярного обучения. Эксперименты показывают, что LlamaV-o1 превосходит существующие модели с открытым исходным кодом и демонстрирует хорошие результаты по сравнению с проприетарными моделями.'}, 'en': {'title': 'Advancing Step-by-Step Visual Reasoning in LLMs', 'desc': "This paper introduces a new framework to enhance visual reasoning in large language models (LLMs) by focusing on step-by-step problem-solving. It presents a visual reasoning benchmark with over 4,000 reasoning steps across eight categories, allowing for thorough evaluation of LLMs' multi-step reasoning capabilities. Additionally, a novel metric is proposed to assess the quality of visual reasoning at each step, providing insights beyond traditional accuracy measures. The authors also introduce LlamaV-o1, a multimodal model trained with a curriculum learning approach, which shows significant performance improvements over existing models."}, 'zh': {'title': '提升视觉推理能力的全新框架', 'desc': '本论文提出了一种新的框架,旨在提升大型语言模型(LLMs)在视觉推理中的逐步推理能力。我们设计了一个视觉推理基准,包含多达4000个推理步骤,涵盖复杂的视觉感知和科学推理等八个类别,以便全面评估模型的推理能力。我们还提出了一种新颖的度量标准,专注于逐步推理的正确性和逻辑一致性,提供比传统的任务准确率更深入的洞察。最后,我们介绍了名为LlamaV-o1的多模态视觉推理模型,通过逐步课程学习的方法进行训练,显著提升了推理性能。'}}}, {'id': 'https://huggingface.co/papers/2501.05510', 'title': 'OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding?', 'url': 'https://huggingface.co/papers/2501.05510', 'abstract': 'Temporal Awareness, the ability to reason dynamically based on the timestamp when a question is raised, is the key distinction between offline and online video LLMs. Unlike offline models, which rely on complete videos for static, post hoc analysis, online models process video streams incrementally and dynamically adapt their responses based on the timestamp at which the question is posed. Despite its significance, temporal awareness has not been adequately evaluated in existing benchmarks. To fill this gap, we present OVO-Bench (Online-VideO-Benchmark), a novel video benchmark that emphasizes the importance of timestamps for advanced online video understanding capability benchmarking. OVO-Bench evaluates the ability of video LLMs to reason and respond to events occurring at specific timestamps under three distinct scenarios: (1) Backward tracing: trace back to past events to answer the question. (2) Real-time understanding: understand and respond to events as they unfold at the current timestamp. (3) Forward active responding: delay the response until sufficient future information becomes available to answer the question accurately. OVO-Bench comprises 12 tasks, featuring 644 unique videos and approximately human-curated 2,800 fine-grained meta-annotations with precise timestamps. We combine automated generation pipelines with human curation. With these high-quality samples, we further developed an evaluation pipeline to systematically query video LLMs along the video timeline. Evaluations of nine Video-LLMs reveal that, despite advancements on traditional benchmarks, current models struggle with online video understanding, showing a significant gap compared to human agents. We hope OVO-Bench will drive progress in video LLMs and inspire future research in online video reasoning. 
Our benchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.', 'score': 26, 'issue_id': 1631, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '6f833a01519603d5', 'authors': ['Yifei Li', 'Junbo Niu', 'Ziyang Miao', 'Chunjiang Ge', 'Yuanhang Zhou', 'Qihao He', 'Xiaoyi Dong', 'Haodong Duan', 'Shuangrui Ding', 'Rui Qian', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Conghui He', 'Jiaqi Wang'], 'affiliations': ['Beihang University', 'Communication University of China', 'SenseTime Group', 'Shanghai Artificial Intelligence Laboratory', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05510.jpg', 'data': {'categories': ['#benchmark', '#survey', '#video', '#reasoning'], 'emoji': '⏱️', 'ru': {'title': 'Временная осведомленность как ключ к онлайн-анализу видео для LLM', 'desc': 'Статья представляет новый бенчмарк OVO-Bench для оценки способности видео-LLM моделей к онлайн-анализу видео с учетом временных меток. Бенчмарк включает 12 задач, 644 уникальных видео и около 2800 мета-аннотаций с точными временными метками. OVO-Bench оценивает три сценария: обратное отслеживание, понимание в реальном времени и активное реагирование на будущие события. Результаты тестирования девяти видео-LLM моделей показывают значительное отставание от человеческих возможностей в онлайн-анализе видео.'}, 'en': {'title': 'Enhancing Online Video Understanding with Temporal Awareness', 'desc': 'This paper introduces OVO-Bench, a new benchmark designed to evaluate the temporal awareness of online video language models (LLMs). Unlike offline models that analyze complete videos, online models must dynamically respond to questions based on the specific timestamp of the inquiry. OVO-Bench assesses video LLMs through three scenarios: backward tracing, real-time understanding, and forward active responding, using a dataset of 644 videos and 2,800 meta-annotations. The findings indicate that current video LLMs still lag behind human performance in understanding and reasoning about events in real-time video streams.'}, 'zh': {'title': '提升视频理解能力的时间意识基准', 'desc': '本文提出了OVO-Bench,这是一个新的视频基准,旨在评估视频大语言模型(LLMs)在时间意识方面的能力。时间意识是指模型根据提问时的时间戳动态推理的能力,这与传统的离线模型不同,后者依赖于完整视频进行静态分析。OVO-Bench包含12个任务,使用644个独特视频和约2800个精细的元注释,强调了时间戳在在线视频理解中的重要性。通过对九个视频LLMs的评估,结果显示当前模型在在线视频理解方面仍存在显著差距,远不及人类代理。'}}}, {'id': 'https://huggingface.co/papers/2501.05727', 'title': 'Enabling Scalable Oversight via Self-Evolving Critic', 'url': 'https://huggingface.co/papers/2501.05727', 'abstract': "Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. 
Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3\\% improvement on critique-correction and error identification benchmarks. Our analysis reveals that SCRIT's performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component.", 'score': 17, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '5a9e3b95b6aa1312', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc., Beijing, China', 'Shenzhen Research Institute of Big Data, Shenzhen, China', 'The Chinese University of Hong Kong, Shenzhen, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.05727.jpg', 'data': {'categories': ['#training', '#benchmark', '#optimization', '#rlhf', '#synthetic'], 'emoji': '🔬', 'ru': {'title': 'SCRIT: Самосовершенствующийся критик для LLM', 'desc': 'SCRIT - это новая система для улучшения способностей больших языковых моделей (LLM) к самокритике без внешнего надзора. Она использует синтетические данные, созданные с помощью самокритика на основе контрастного обучения и механизма самопроверки. Реализованная на базе Qwen2.5-72B-Instruct, SCRIT демонстрирует значительное улучшение в задачах критики-коррекции и идентификации ошибок. Анализ показывает, что производительность SCRIT растет с увеличением объема данных и размера модели.'}, 'en': {'title': 'Empowering LLMs with Self-Evolving Critique', 'desc': 'This paper addresses the challenge of providing effective feedback for Large Language Models (LLMs) in tasks where human evaluation is difficult. It introduces SCRIT (Self-evolving CRITic), a framework that enhances the critique capabilities of LLMs without relying on external supervision. SCRIT utilizes synthetic data generated by a contrastive-based self-critic and incorporates a self-validation mechanism to ensure the quality of critiques. The results show that SCRIT significantly improves critique-correction and error identification benchmarks, demonstrating its effectiveness as LLMs scale in size and data.'}, 'zh': {'title': '自我进化,提升批评能力!', 'desc': '尽管大型语言模型(LLMs)表现出色,但在可扩展监督方面面临挑战,特别是在难以进行人类评估的任务中。本文提出了SCRIT(自我进化批评者)框架,旨在提升模型的自我批评能力。SCRIT通过对比自我批评生成合成数据,并利用自我验证机制确保批评质量,从而实现自我改进。实验结果表明,SCRIT在批评纠正和错误识别基准上提高了10.3%的性能,且其表现随着数据和模型规模的增加而提升。'}}}, {'id': 'https://huggingface.co/papers/2501.05452', 'title': 'ReFocus: Visual Editing as a Chain of Thought for Structured Image Understanding', 'url': 'https://huggingface.co/papers/2501.05452', 'abstract': 'Structured image understanding, such as interpreting tables and charts, requires strategically refocusing across various structures and texts within an image, forming a reasoning sequence to arrive at the final answer. However, current multimodal large language models (LLMs) lack this multihop selective attention capability. In this work, we introduce ReFocus, a simple yet effective framework that equips multimodal LLMs with the ability to generate "visual thoughts" by performing visual editing on the input image through code, shifting and refining their visual focuses. Specifically, ReFocus enables multimodal LLMs to generate Python codes to call tools and modify the input image, sequentially drawing boxes, highlighting sections, and masking out areas, thereby enhancing the visual reasoning process. 
We experiment upon a wide range of structured image understanding tasks involving tables and charts. ReFocus largely improves performance on all tasks over GPT-4o without visual editing, yielding an average gain of 11.0% on table tasks and 6.8% on chart tasks. We present an in-depth analysis of the effects of different visual edits, and reasons why ReFocus can improve the performance without introducing additional information. Further, we collect a 14k training set using ReFocus, and prove that such visual chain-of-thought with intermediate information offers a better supervision than standard VQA data, reaching a 8.0% average gain over the same model trained with QA pairs and 2.6% over CoT.', 'score': 7, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '28a63b60414f99da', 'authors': ['Xingyu Fu', 'Minqian Liu', 'Zhengyuan Yang', 'John Corring', 'Yijuan Lu', 'Jianwei Yang', 'Dan Roth', 'Dinei Florencio', 'Cha Zhang'], 'affiliations': ['Microsoft', 'University of Pennsylvania', 'Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.05452.jpg', 'data': {'categories': ['#multimodal', '#interpretability', '#dataset', '#reasoning', '#training', '#cv'], 'emoji': '🔍', 'ru': {'title': 'ReFocus: Улучшение визуального понимания LLM через управляемое редактирование изображений', 'desc': "Статья представляет ReFocus - фреймворк, который наделяет мультимодальные большие языковые модели (LLM) способностью генерировать 'визуальные мысли' путем редактирования входного изображения с помощью кода. ReFocus позволяет LLM последовательно рисовать рамки, выделять секции и маскировать области, улучшая процесс визуального рассуждения. Эксперименты показывают значительное улучшение производительности на задачах понимания структурированных изображений, таких как таблицы и диаграммы. Авторы также доказывают, что визуальная цепочка рассуждений с промежуточной информацией обеспечивает лучшее обучение, чем стандартные данные VQA."}, 'en': {'title': 'Enhancing Visual Reasoning with ReFocus', 'desc': "This paper presents ReFocus, a framework designed to enhance the capabilities of multimodal large language models (LLMs) in structured image understanding tasks, such as interpreting tables and charts. ReFocus allows these models to generate 'visual thoughts' by performing visual edits on input images, which helps them focus on relevant areas and improve their reasoning processes. The framework enables the generation of Python code to manipulate images, such as drawing boxes and highlighting sections, which significantly boosts performance on various tasks. Experimental results show that ReFocus achieves notable improvements over existing models, demonstrating the effectiveness of visual editing in enhancing visual reasoning without adding new information."}, 'zh': {'title': 'ReFocus:提升多模态模型的视觉推理能力', 'desc': '本论文提出了一种名为ReFocus的框架,旨在提升多模态大语言模型在结构化图像理解任务中的表现。ReFocus通过生成Python代码对输入图像进行视觉编辑,使模型能够逐步调整视觉焦点,从而形成更有效的推理过程。实验结果表明,ReFocus在表格和图表任务上显著提高了性能,平均提升分别为11.0%和6.8%。此外,研究还表明,使用ReFocus生成的视觉链式思维提供了比标准问答数据更好的监督效果。'}}}, {'id': 'https://huggingface.co/papers/2501.04698', 'title': 'ConceptMaster: Multi-Concept Video Customization on Diffusion Transformer Models Without Test-Time Tuning', 'url': 'https://huggingface.co/papers/2501.04698', 'abstract': 'Text-to-video generation has made remarkable advancements through diffusion models. However, Multi-Concept Video Customization (MCVC) remains a significant challenge. 
We identify two key challenges in this task: 1) the identity decoupling problem, where directly adopting existing customization methods inevitably mix attributes when handling multiple concepts simultaneously, and 2) the scarcity of high-quality video-entity pairs, which is crucial for training such a model that represents and decouples various concepts well. To address these challenges, we introduce ConceptMaster, an innovative framework that effectively tackles the critical issues of identity decoupling while maintaining concept fidelity in customized videos. Specifically, we introduce a novel strategy of learning decoupled multi-concept embeddings that are injected into the diffusion models in a standalone manner, which effectively guarantees the quality of customized videos with multiple identities, even for highly similar visual concepts. To further overcome the scarcity of high-quality MCVC data, we carefully establish a data construction pipeline, which enables systematic collection of precise multi-concept video-entity data across diverse concepts. A comprehensive benchmark is designed to validate the effectiveness of our model from three critical dimensions: concept fidelity, identity decoupling ability, and video generation quality across six different concept composition scenarios. Extensive experiments demonstrate that our ConceptMaster significantly outperforms previous approaches for this task, paving the way for generating personalized and semantically accurate videos across multiple concepts.', 'score': 6, 'issue_id': 1631, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '6e82dc0b883c447a', 'authors': ['Yuzhou Huang', 'Ziyang Yuan', 'Quande Liu', 'Qiulin Wang', 'Xintao Wang', 'Ruimao Zhang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai'], 'affiliations': ['Kuaishou Technology', 'Sun Yat-sen University', 'The Chinese University of Hong Kong, Shenzhen', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04698.jpg', 'data': {'categories': ['#diffusion', '#benchmark', '#data', '#video', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'ConceptMaster: новый уровень персонализации в генерации видео', 'desc': 'Статья представляет ConceptMaster - новую систему для генерации видео с множественными персонализированными концептами. Авторы решают проблему смешивания атрибутов при одновременной работе с несколькими концептами, предлагая метод обучения раздельных мультиконцептуальных эмбеддингов. Для преодоления нехватки качественных данных разработан специальный конвейер сбора видео-сущностных пар. Эксперименты показывают превосходство ConceptMaster над существующими подходами в точности концептов, способности разделения идентичностей и качестве генерации видео.'}, 'en': {'title': 'Mastering Multi-Concept Video Customization with ConceptMaster', 'desc': "This paper presents ConceptMaster, a new framework for Multi-Concept Video Customization (MCVC) that addresses two main challenges: identity decoupling and the lack of high-quality video-entity pairs. The identity decoupling problem arises when existing methods mix attributes from different concepts, leading to poor customization results. ConceptMaster introduces a novel approach to learn decoupled multi-concept embeddings, which are integrated into diffusion models to ensure high-quality video outputs with distinct identities. 
Additionally, the authors establish a data construction pipeline to systematically gather diverse multi-concept video-entity data, and they validate their model's effectiveness through comprehensive benchmarks across various scenarios."}, 'zh': {'title': 'ConceptMaster:多概念视频定制的新突破', 'desc': '本文介绍了一种名为ConceptMaster的创新框架,旨在解决多概念视频定制中的身份解耦问题和高质量视频实体对的稀缺性。我们提出了一种新的学习策略,通过独立注入解耦的多概念嵌入到扩散模型中,从而保证定制视频的质量。为了克服高质量MCVC数据的不足,我们建立了一个数据构建管道,系统性地收集多概念视频实体数据。实验结果表明,ConceptMaster在概念保真度、身份解耦能力和视频生成质量等方面显著优于之前的方法。'}}}, {'id': 'https://huggingface.co/papers/2501.05707', 'title': 'Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains', 'url': 'https://huggingface.co/papers/2501.05707', 'abstract': 'Large language models (LLMs) have achieved remarkable performance in recent years but are fundamentally limited by the underlying training data. To improve models beyond the training data, recent works have explored how LLMs can be used to generate synthetic data for autonomous self-improvement. However, successive steps of self-improvement can reach a point of diminishing returns. In this work, we propose a complementary approach towards self-improvement where finetuning is applied to a multiagent society of language models. A group of language models, all starting from the same base model, are independently specialized by updating each one using data generated through multiagent interactions among the models. By training each model on independent sets of data, we illustrate how this approach enables specialization across models and diversification over the set of models. As a result, our overall system is able to preserve diverse reasoning chains and autonomously improve over many more rounds of fine-tuning than single-agent self-improvement methods. We quantitatively illustrate the efficacy of the approach across a wide suite of reasoning tasks.', 'score': 5, 'issue_id': 1629, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '3d75785114d08414', 'authors': ['Vighnesh Subramaniam', 'Yilun Du', 'Joshua B. Tenenbaum', 'Antonio Torralba', 'Shuang Li', 'Igor Mordatch'], 'affiliations': ['Google Deepmind', 'Harvard University', 'MIT CSAIL', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05707.jpg', 'data': {'categories': ['#synthetic', '#reasoning', '#training', '#agents'], 'emoji': '🤖', 'ru': {'title': 'Мультиагентное обучение: новый путь к улучшению языковых моделей', 'desc': 'Эта статья представляет новый подход к улучшению больших языковых моделей (LLM) с помощью мультиагентного обучения. Авторы предлагают создать группу моделей, которые взаимодействуют друг с другом для генерации синтетических данных. Каждая модель специализируется на своем наборе данных, что позволяет сохранить разнообразие рассуждений. Этот метод показывает лучшие результаты по сравнению с одноагентными подходами к самоулучшению на различных задачах рассуждения.'}, 'en': {'title': 'Empowering Language Models through Multiagent Self-Improvement', 'desc': 'This paper discusses a new method for improving large language models (LLMs) by using a multiagent system. Instead of relying solely on the original training data, the authors propose that multiple LLMs can interact and generate their own synthetic data, which they then use to fine-tune themselves. This approach allows each model to specialize in different areas, leading to a more diverse set of reasoning capabilities. 
The results show that this multiagent fine-tuning method can enhance performance over many iterations, surpassing traditional single-agent self-improvement techniques.'}, 'zh': {'title': '多智能体模型的自我改进新方法', 'desc': '大型语言模型(LLMs)在最近几年取得了显著的性能,但其根本上受到训练数据的限制。为了超越训练数据,最近的研究探索了如何利用LLMs生成合成数据以实现自主自我改进。本文提出了一种补充的方法,通过在多智能体语言模型的社会中进行微调,来实现自我改进。通过独立训练每个模型,利用模型之间的多智能体交互生成的数据,我们展示了这种方法如何实现模型的专业化和多样化,从而在多个微调轮次中保持多样的推理链。'}}}, {'id': 'https://huggingface.co/papers/2501.04961', 'title': 'Demystifying Domain-adaptive Post-training for Financial LLMs', 'url': 'https://huggingface.co/papers/2501.04961', 'abstract': 'Domain-adaptive post-training of large language models (LLMs) has emerged as a promising approach for specialized domains such as medicine and finance. However, significant challenges remain in identifying optimal adaptation criteria and training strategies across varying data and model configurations. To address these challenges, we introduce FINDAP, a systematic and fine-grained investigation into domain-adaptive post-training of LLMs for the finance domain. Our approach begins by identifying the core capabilities required for the target domain and designing a comprehensive evaluation suite aligned with these needs. We then analyze the effectiveness of key post-training stages, including continual pretraining, instruction tuning, and preference alignment. Building on these insights, we propose an effective training recipe centered on a novel preference data distillation method, which leverages process signals from a generative reward model. The resulting model, Llama-Fin, achieves state-of-the-art performance across a wide range of financial tasks. Our analysis also highlights how each post-training stage contributes to distinct capabilities, uncovering specific challenges and effective solutions, providing valuable insights for domain adaptation of LLMs. Project page: https://github.com/SalesforceAIResearch/FinDap', 'score': 4, 'issue_id': 1642, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ade3590d1cc29d47', 'authors': ['Zixuan Ke', 'Yifei Ming', 'Xuan-Phi Nguyen', 'Caiming Xiong', 'Shafiq Joty'], 'affiliations': ['Salesforce AI Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.04961.jpg', 'data': {'categories': ['#optimization', '#rlhf', '#healthcare', '#transfer_learning', '#training'], 'emoji': '💹', 'ru': {'title': 'Оптимизация LLM для финансов: от анализа до совершенства', 'desc': 'Статья представляет FINDAP - систематический подход к доменно-адаптивному постобучению больших языковых моделей (LLM) для финансовой сферы. Авторы разработали комплексный набор оценок, анализирующий эффективность ключевых этапов постобучения, включая продолжающееся предобучение, инструктивную настройку и выравнивание предпочтений. Предложен эффективный рецепт обучения, основанный на новом методе дистилляции данных предпочтений. Результирующая модель Llama-Fin достигает передовых результатов в широком спектре финансовых задач.'}, 'en': {'title': 'FINDAP: Tailoring LLMs for Finance Excellence', 'desc': 'This paper presents FINDAP, a method for improving large language models (LLMs) specifically for the finance sector through domain-adaptive post-training. It identifies essential capabilities needed for financial tasks and creates a tailored evaluation suite to measure these capabilities. The study examines various post-training techniques, such as continual pretraining and instruction tuning, to determine their effectiveness. 
Ultimately, the authors introduce Llama-Fin, a model that utilizes a novel preference data distillation method, achieving top performance in financial applications while providing insights into the adaptation process.'}, 'zh': {'title': '金融领域的智能适应训练', 'desc': '本文介绍了一种针对金融领域的大型语言模型(LLM)进行领域自适应后训练的方法,称为FINDAP。我们首先识别目标领域所需的核心能力,并设计了与这些需求相一致的综合评估套件。接着,我们分析了关键后训练阶段的有效性,包括持续预训练、指令调优和偏好对齐。最终,我们提出了一种基于新颖偏好数据蒸馏方法的有效训练方案,所得到的模型Llama-Fin在多种金融任务中达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.06187', 'title': 'Multi-subject Open-set Personalization in Video Generation', 'url': 'https://huggingface.co/papers/2501.06187', 'abstract': 'Video personalization methods allow us to synthesize videos with specific concepts such as people, pets, and places. However, existing methods often focus on limited domains, require time-consuming optimization per subject, or support only a single subject. We present Video Alchemist - a video model with built-in multi-subject, open-set personalization capabilities for both foreground objects and background, eliminating the need for time-consuming test-time optimization. Our model is built on a new Diffusion Transformer module that fuses each conditional reference image and its corresponding subject-level text prompt with cross-attention layers. Developing such a large model presents two main challenges: dataset and evaluation. First, as paired datasets of reference images and videos are extremely hard to collect, we sample selected video frames as reference images and synthesize a clip of the target video. However, while models can easily denoise training videos given reference frames, they fail to generalize to new contexts. To mitigate this issue, we design a new automatic data construction pipeline with extensive image augmentations. Second, evaluating open-set video personalization is a challenge in itself. To address this, we introduce a personalization benchmark that focuses on accurate subject fidelity and supports diverse personalization scenarios. Finally, our extensive experiments show that our method significantly outperforms existing personalization methods in both quantitative and qualitative evaluations.', 'score': 4, 'issue_id': 1631, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'fcf16f5f8fe9047a', 'authors': ['Tsai-Shien Chen', 'Aliaksandr Siarohin', 'Willi Menapace', 'Yuwei Fang', 'Kwot Sin Lee', 'Ivan Skorokhodov', 'Kfir Aberman', 'Jun-Yan Zhu', 'Ming-Hsuan Yang', 'Sergey Tulyakov'], 'affiliations': ['CMU', 'Snap Inc.', 'UC Merced'], 'pdf_title_img': 'assets/pdf/title_img/2501.06187.jpg', 'data': {'categories': ['#diffusion', '#synthetic', '#benchmark', '#data', '#optimization', '#video', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'Универсальная персонализация видео без длительной оптимизации', 'desc': 'Статья представляет Video Alchemist - новую модель для персонализации видео с возможностью работы с несколькими объектами. Модель использует новый модуль Diffusion Transformer, который объединяет условные референсные изображения и текстовые промпты. Авторы разработали автоматический конвейер для создания данных с обширными аугментациями изображений. 
Также был создан новый бенчмарк для оценки персонализации видео в открытом наборе.'}, 'en': {'title': 'Revolutionizing Video Personalization with Video Alchemist', 'desc': "The paper introduces Video Alchemist, a novel video personalization model that allows for the synthesis of videos featuring multiple subjects without the need for extensive optimization. It utilizes a Diffusion Transformer module that integrates reference images and text prompts through cross-attention layers, enabling effective personalization for both foreground and background elements. The authors tackle challenges related to dataset creation by employing a new automatic data construction pipeline with image augmentations, which helps improve generalization to new contexts. Additionally, they propose a personalization benchmark to evaluate the model's performance in diverse scenarios, demonstrating that Video Alchemist outperforms existing methods in both quantitative and qualitative assessments."}, 'zh': {'title': '视频个性化的新突破', 'desc': '视频个性化方法可以合成特定概念的视频,如人物、宠物和地点。然而,现有方法通常只关注有限的领域,且每个主题需要耗时的优化,或者仅支持单一主题。我们提出了视频炼金术师(Video Alchemist),这是一种具有内置多主题、开放集个性化能力的视频模型,能够处理前景物体和背景,消除了耗时的测试时间优化需求。我们的模型基于新的扩散变换器模块,结合条件参考图像和相应的主题级文本提示,通过交叉注意力层进行融合。'}}}, {'id': 'https://huggingface.co/papers/2501.05542', 'title': 'Infecting Generative AI With Viruses', 'url': 'https://huggingface.co/papers/2501.05542', 'abstract': 'This study demonstrates a novel approach to testing the security boundaries of Vision-Large Language Model (VLM/ LLM) using the EICAR test file embedded within JPEG images. We successfully executed four distinct protocols across multiple LLM platforms, including OpenAI GPT-4o, Microsoft Copilot, Google Gemini 1.5 Pro, and Anthropic Claude 3.5 Sonnet. The experiments validated that a modified JPEG containing the EICAR signature could be uploaded, manipulated, and potentially executed within LLM virtual workspaces. Key findings include: 1) consistent ability to mask the EICAR string in image metadata without detection, 2) successful extraction of the test file using Python-based manipulation within LLM environments, and 3) demonstration of multiple obfuscation techniques including base64 encoding and string reversal. This research extends Microsoft Research\'s "Penetration Testing Rules of Engagement" framework to evaluate cloud-based generative AI and LLM security boundaries, particularly focusing on file handling and execution capabilities within containerized environments.', 'score': 4, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ac21f1bae807486e', 'authors': ['David Noever', 'Forrest McKee'], 'affiliations': ['PeopleTec, Inc., Huntsville, AL'], 'pdf_title_img': 'assets/pdf/title_img/2501.05542.jpg', 'data': {'categories': ['#cv', '#benchmark', '#data', '#security'], 'emoji': '🛡️', 'ru': {'title': 'Новые горизонты в тестировании безопасности VLM/LLM с помощью EICAR', 'desc': 'Это исследование демонстрирует новый подход к тестированию границ безопасности моделей типа Vision-Large Language Model (VLM/LLM) с использованием тестового файла EICAR, встроенного в изображения JPEG. Эксперименты проводились на нескольких платформах LLM, включая OpenAI GPT-4, Microsoft Copilot, Google Gemini 1.5 Pro и Anthropic Claude 3.5 Sonnet. Ключевые результаты включают успешную маскировку строки EICAR в метаданных изображения, извлечение тестового файла с помощью Python в среде LLM и демонстрацию различных методов обфускации. 
Исследование расширяет рамки оценки безопасности облачных генеративных ИИ и LLM, особенно в отношении обработки файлов и возможностей выполнения в контейнеризированных средах.'}, 'en': {'title': 'Testing Security Boundaries of LLMs with EICAR in JPEGs', 'desc': 'This paper presents a new method for testing the security limits of Vision-Large Language Models (VLMs/LLMs) by embedding the EICAR test file in JPEG images. The authors conducted experiments on various LLM platforms, revealing that modified JPEGs containing the EICAR signature could be uploaded and manipulated without detection. They demonstrated the ability to extract the EICAR file using Python scripts and employed several obfuscation techniques to hide the EICAR string. This research enhances existing security frameworks by focusing on the file handling and execution capabilities of cloud-based generative AI systems.'}, 'zh': {'title': '测试大型语言模型的安全边界新方法', 'desc': '本研究展示了一种新颖的方法,用于测试视觉大型语言模型(VLM/LLM)的安全边界,使用嵌入在JPEG图像中的EICAR测试文件。我们在多个LLM平台上成功执行了四种不同的协议,包括OpenAI GPT-4o、Microsoft Copilot、Google Gemini 1.5 Pro和Anthropic Claude 3.5 Sonnet。实验验证了修改后的JPEG图像可以在LLM虚拟工作区中上传、操控并可能执行。研究的关键发现包括:在图像元数据中无检测地掩盖EICAR字符串、在LLM环境中成功提取测试文件,以及展示多种混淆技术,如base64编码和字符串反转。'}}}, {'id': 'https://huggingface.co/papers/2501.14249', 'title': "Humanity's Last Exam", 'url': 'https://huggingface.co/papers/2501.14249', 'abstract': "Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai.", 'score': 29, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '4d614974221756d3', 'authors': ['Long Phan', 'Alice Gatti', 'Ziwen Han', 'Nathaniel Li', 'Josephina Hu', 'Hugh Zhang', 'Sean Shi', 'Michael Choi', 'Anish Agrawal', 'Arnav Chopra', 'Adam Khoja', 'Ryan Kim', 'Jason Hausenloy', 'Oliver Zhang', 'Mantas Mazeika', 'Daron Anderson', 'Tung Nguyen', 'Mobeen Mahmood', 'Fiona Feng', 'Steven Y. Feng', 'Haoran Zhao', 'Michael Yu', 'Varun Gangal', 'Chelsea Zou', 'Zihan Wang', 'Jessica P. Wang', 'Pawan Kumar', 'Oleksandr Pokutnyi', 'Robert Gerbicz', 'Serguei Popov', 'John-Clark Levin', 'Mstyslav Kazakov', 'Johannes Schmitt', 'Geoff Galgon', 'Alvaro Sanchez', 'Yongki Lee', 'Will Yeadon', 'Scott Sauers', 'Marc Roth', 'Chidozie Agu', 'Søren Riis', 'Fabian Giska', 'Saiteja Utpala', 'Zachary Giboney', 'Gashaw M. 
Goshu', 'Joan of Arc Xavier', 'Sarah-Jane Crowson', 'Mohinder Maheshbhai Naiya', 'Noah Burns', 'Lennart Finke', 'Zerui Cheng', 'Hyunwoo Park', 'Francesco Fournier-Facio', 'John Wydallis', 'Mark Nandor', 'Ankit Singh', 'Tim Gehrunger', 'Jiaqi Cai', 'Ben McCarty', 'Darling Duclosel', 'Jungbae Nam', 'Jennifer Zampese', 'Ryan G. Hoerr', 'Aras Bacho', 'Gautier Abou Loume', 'Abdallah Galal', 'Hangrui Cao', 'Alexis C Garretson', 'Damien Sileo', 'Qiuyu Ren', 'Doru Cojoc', 'Pavel Arkhipov', 'Usman Qazi', 'Lianghui Li', 'Sumeet Motwani', 'Christian Schroeder de Witt', 'Edwin Taylor', 'Johannes Veith', 'Eric Singer', 'Taylor D. Hartman', 'Paolo Rissone', 'Jaehyeok Jin', 'Jack Wei Lun Shi', 'Chris G. Willcocks', 'Joshua Robinson', 'Aleksandar Mikov', 'Ameya Prabhu', 'Longke Tang', 'Xavier Alapont', 'Justine Leon Uro', 'Kevin Zhou', 'Emily de Oliveira Santos', 'Andrey Pupasov Maksimov', 'Edward Vendrow', 'Kengo Zenitani', 'Julien Guillod', 'Yuqi Li', 'Joshua Vendrow', 'Vladyslav Kuchkin', 'Ng Ze-An', 'Pierre Marion', 'Denis Efremov', 'Jayson Lynch', 'Kaiqu Liang', 'Andrew Gritsevskiy', 'Dakotah Martinez', 'Ben Pageler', 'Nick Crispino', 'Dimitri Zvonkine', 'Natanael Wildner Fraga', 'Saeed Soori', 'Ori Press', 'Henry Tang', 'Julian Salazar', 'Sean R. Green', 'Lina Brüssel', 'Moon Twayana', 'Aymeric Dieuleveut', 'T. Ryan Rogers', 'Wenjin Zhang', 'Bikun Li', 'Jinzhou Yang', 'Arun Rao', 'Gabriel Loiseau', 'Mikhail Kalinin', 'Marco Lukas', 'Ciprian Manolescu', 'Subrata Mishra', 'Ariel Ghislain Kemogne Kamdoum', 'Tobias Kreiman', 'Tad Hogg', 'Alvin Jin', 'Carlo Bosio', 'Gongbo Sun', 'Brian P Coppola', 'Tim Tarver', 'Haline Heidinger', 'Rafael Sayous', 'Stefan Ivanov', 'Joseph M Cavanagh', 'Jiawei Shen', 'Joseph Marvin Imperial', 'Philippe Schwaller', 'Shaipranesh Senthilkuma', 'Andres M Bran', 'Ali Dehghan', 'Andres Algaba', 'Brecht Verbeken', 'David Noever', 'Ragavendran P V', 'Lisa Schut', 'Ilia Sucholutsky', 'Evgenii Zheltonozhskii', 'Derek Lim', 'Richard Stanley', 'Shankar Sivarajan', 'Tong Yang', 'John Maar', 'Julian Wykowski', 'Martí Oller', 'Jennifer Sandlin', 'Anmol Sahu', 'Yuzheng Hu', 'Sara Fish', 'Nasser Heydari', 'Archimedes Apronti', 'Kaivalya Rawal', 'Tobias Garcia Vilchis', 'Yuexuan Zu', 'Martin Lackner', 'James Koppel', 'Jeremy Nguyen', 'Daniil S. 
Antonenko', 'Steffi Chern', 'Bingchen Zhao', 'Pierrot Arsene', 'Alan Goldfarb', 'Sergey Ivanov', 'Rafał Poświata', 'Chenguang Wang', 'Daofeng Li', 'Donato Crisostomi', 'Andrea Achilleos', 'Benjamin Myklebust', 'Archan Sen', 'David Perrella', 'Nurdin Kaparov', 'Mark H Inlow', 'Allen Zang', 'Elliott Thornley', 'Daniil Orel', 'Vladislav Poritski', 'Shalev Ben-David', 'Zachary Berger', 'Parker Whitfill', 'Michael Foster', 'Daniel Munro', 'Linh Ho', 'Dan Bar Hava', 'Aleksey Kuchkin', 'Robert Lauff', 'David Holmes', 'Frank Sommerhage', 'Keith Schneider', 'Zakayo Kazibwe', 'Nate Stambaugh', 'Mukhwinder Singh', 'Ilias Magoulas', 'Don Clarke', 'Dae Hyun Kim', 'Felipe Meneguitti Dias', 'Veit Elser', 'Kanu Priya Agarwal', 'Victor Efren Guadarrama Vilchis', 'Immo Klose', 'Christoph Demian', 'Ujjwala Anantheswaran', 'Adam Zweiger', 'Guglielmo Albani', 'Jeffery Li', 'Nicolas Daans', 'Maksim Radionov', 'Václav Rozhoň', 'Ziqiao Ma', 'Christian Stump', 'Mohammed Berkani', 'Jacob Platnick', 'Volodymyr Nevirkovets', 'Luke Basler', 'Marco Piccardo', 'Ferenc Jeanplong', 'Niv Cohen', 'Josef Tkadlec', 'Paul Rosu', 'Piotr Padlewski', 'Stanislaw Barzowski', 'Kyle Montgomery', 'Aline Menezes', 'Arkil Patel', 'Zixuan Wang', 'Jamie Tucker-Foltz', 'Jack Stade', 'Tom Goertzen', 'Fereshteh Kazemi', 'Jeremiah Milbauer', 'John Arnold Ambay', 'Abhishek Shukla', 'Yan Carlos Leyva Labrador', 'Alan Givré', 'Hew Wolff', 'Vivien Rossbach', 'Muhammad Fayez Aziz', 'Younesse Kaddar', 'Yanxu Chen', 'Robin Zhang', 'Jiayi Pan', 'Antonio Terpin', 'Niklas Muennighoff', 'Hailey Schoelkopf', 'Eric Zheng', 'Avishy Carmi', 'Adam Jones', 'Jainam Shah', 'Ethan D. L. Brown', 'Kelin Zhu', 'Max Bartolo', 'Richard Wheeler', 'Andrew Ho', 'Shaul Barkan', 'Jiaqi Wang', 'Martin Stehberger', 'Egor Kretov', 'Kaustubh Sridhar', 'Zienab EL-Wasif', 'Anji Zhang', 'Daniel Pyda', 'Joanna Tam', 'David M. Cunningham', 'Vladimir Goryachev', 'Demosthenes Patramanis', 'Michael Krause', 'Andrew Redenti', 'Daniel Bugas', 'David Aldous', 'Jesyin Lai', 'Shannon Coleman', 'Mohsen Bahaloo', 'Jiangnan Xu', 'Sangwon Lee', 'Sandy Zhao', 'Ning Tang', 'Michael K. Cohen', 'Micah Carroll', 'Orr Paradise', 'Jan Hendrik Kirchner', 'Stefan Steinerberger', 'Maksym Ovchynnikov', 'Jason O. Matos', 'Adithya Shenoy', 'Benedito Alves de Oliveira Junior', 'Michael Wang', 'Yuzhou Nie', 'Paolo Giordano', 'Philipp Petersen', 'Anna Sztyber-Betley', 'Priti Shukla', 'Jonathan Crozier', 'Antonella Pinto', 'Shreyas Verma', 'Prashant Joshi', 'Zheng-Xin Yong', 'Allison Tee', 'Jérémy Andréoletti', 'Orion Weller', 'Raghav Singhal', 'Gang Zhang', 'Alexander Ivanov', 'Seri Khoury', 'Hamid Mostaghimi', 'Kunvar Thaman', 'Qijia Chen', 'Tran Quoc Khánh', 'Jacob Loader', 'Stefano Cavalleri', 'Hannah Szlyk', 'Zachary Brown', 'Jonathan Roberts', 'William Alley', 'Kunyang Sun', 'Ryan Stendall', 'Max Lamparth', 'Anka Reuel', 'Ting Wang', 'Hanmeng Xu', 'Sreenivas Goud Raparthi', 'Pablo Hernández-Cámara', 'Freddie Martin', 'Dmitry Malishev', 'Thomas Preu', 'Tomek Korbak', 'Marcus Abramovitch', 'Dominic Williamson', 'Ziye Chen', 'Biró Bálint', 'M Saiful Bari', 'Peyman Kassani', 'Zihao Wang', 'Behzad Ansarinejad', 'Laxman Prasad Goswami', 'Yewen Sun', 'Hossam Elgnainy', 'Daniel Tordera', 'George Balabanian', 'Earth Anderson', 'Lynna Kvistad', 'Alejandro José Moyano', 'Rajat Maheshwari', 'Ahmad Sakor', 'Murat Eron', 'Isaac C. McAlister', 'Javier Gimenez', 'Innocent Enyekwe', 'Andrew Favre D. 
O.', 'Shailesh Shah', 'Xiaoxiang Zhou', 'Firuz Kamalov', 'Ronald Clark', 'Sherwin Abdoli', 'Tim Santens', 'Khalida Meer', 'Harrison K Wang', 'Kalyan Ramakrishnan', 'Evan Chen', 'Alessandro Tomasiello', 'G. Bruno De Luca', 'Shi-Zhuo Looi', 'Vinh-Kha Le', 'Noam Kolt', 'Niels Mündler', 'Avi Semler', 'Emma Rodman', 'Jacob Drori', 'Carl J Fossum', 'Milind Jagota', 'Ronak Pradeep', 'Honglu Fan', 'Tej Shah', 'Jonathan Eicher', 'Michael Chen', 'Kushal Thaman', 'William Merrill', 'Carter Harris', 'Jason Gross', 'Ilya Gusev', 'Asankhaya Sharma', 'Shashank Agnihotri', 'Pavel Zhelnov', 'Siranut Usawasutsakorn', 'Mohammadreza Mofayezi', 'Sergei Bogdanov', 'Alexander Piperski', 'Marc Carauleanu', 'David K. Zhang', 'Dylan Ler', 'Roman Leventov', 'Ignat Soroko', 'Thorben Jansen', 'Pascal Lauer', 'Joshua Duersch', 'Vage Taamazyan', 'Wiktor Morak', 'Wenjie Ma', 'William Held', 'Tran Đuc Huy', 'Ruicheng Xian', 'Armel Randy Zebaze', 'Mohanad Mohamed', 'Julian Noah Leser', 'Michelle X Yuan', 'Laila Yacar', 'Johannes Lengler', 'Hossein Shahrtash', 'Edson Oliveira', 'Joseph W. Jackson', 'Daniel Espinosa Gonzalez', 'Andy Zou', 'Muthu Chidambaram', 'Timothy Manik', 'Hector Haffenden', 'Dashiell Stander', 'Ali Dasouqi', 'Alexander Shen', 'Emilien Duc', 'Bita Golshani', 'David Stap', 'Mikalai Uzhou', 'Alina Borisovna Zhidkovskaya', 'Lukas Lewark', 'Mátyás Vincze', 'Dustin Wehr', 'Colin Tang', 'Zaki Hossain', 'Shaun Phillips', 'Jiang Muzhen', 'Fredrik Ekström', 'Angela Hammon', 'Oam Patel', 'Nicolas Remy', 'Faraz Farhidi', 'George Medley', 'Forough Mohammadzadeh', 'Madellene Peñaflor', 'Haile Kassahun', 'Alena Friedrich', 'Claire Sparrow', 'Taom Sakal', 'Omkar Dhamane', 'Ali Khajegili Mirabadi', 'Eric Hallman', 'Mike Battaglia', 'Mohammad Maghsoudimehrabani', 'Hieu Hoang', 'Alon Amit', 'Dave Hulbert', 'Roberto Pereira', 'Simon Weber', 'Stephen Mensah', 'Nathan Andre', 'Anton Peristyy', 'Chris Harjadi', 'Himanshu Gupta', 'Stephen Malina', 'Samuel Albanie', 'Will Cai', 'Mustafa Mehkary', 'Frank Reidegeld', 'Anna-Katharina Dick', 'Cary Friday', 'Jasdeep Sidhu', 'Wanyoung Kim', 'Mariana Costa', 'Hubeyb Gurdogan', 'Brian Weber', 'Harsh Kumar', 'Tong Jiang', 'Arunim Agarwal', 'Chiara Ceconello', 'Warren S. Vaz', 'Chao Zhuang', 'Haon Park', 'Andrew R. Tawfeek', 'Daattavya Aggarwal', 'Michael Kirchhof', 'Linjie Dai', 'Evan Kim', 'Johan Ferret', 'Yuzhou Wang', 'Minghao Yan', 'Krzysztof Burdzy', 'Lixin Zhang', 'Antonio Franca', 'Diana T. Pham', 'Kang Yong Loh', 'Joshua Robinson', 'Shreen Gul', 'Gunjan Chhablani', 'Zhehang Du', 'Adrian Cosma', 'Colin White', 'Robin Riblet', 'Prajvi Saxena', 'Jacob Votava', 'Vladimir Vinnikov', 'Ethan Delaney', 'Shiv Halasyamani', 'Syed M. Shahid', 'Jean-Christophe Mourrat', 'Lavr Vetoshkin', 'Renas Bacho', 'Vincent Ginis', 'Aleksandr Maksapetyan', 'Florencia de la Rosa', 'Xiuyu Li', 'Guillaume Malod', 'Leon Lang', 'Julien Laurendeau', 'Fatimah Adesanya', 'Julien Portier', 'Lawrence Hollom', 'Victor Souza', 'Yuchen Anna Zhou', 'Yiğit Yalın', 'Gbenga Daniel Obikoya', 'Luca Arnaboldi', 'Rai', 'Filippo Bigi', 'Kaniuar Bacho', 'Pierre Clavier', 'Gabriel Recchia', 'Mara Popescu', 'Nikita Shulga', 'Ngefor Mildred Tanwie', 'Thomas C. H. 
Lux', 'Ben Rank', 'Colin Ni', 'Alesia Yakimchyk', 'Huanxu', 'Liu', 'Olle Häggström', 'Emil Verkama', 'Himanshu Narayan', 'Hans Gundlach', 'Leonor Brito-Santana', 'Brian Amaro', 'Vivek Vajipey', 'Rynaa Grover', 'Yiyang Fan', 'Gabriel Poesia Reis e Silva', 'Linwei Xin', 'Yosi Kratish', 'Jakub Łucki', 'Wen-Ding Li', 'Justin Xu', 'Kevin Joseph Scaria', 'Freddie Vargus', 'Farzad Habibi', 'Long', 'Lian', 'Emanuele Rodolà', 'Jules Robins', 'Vincent Cheng', 'Declan Grabb', 'Ida Bosio', 'Tony Fruhauff', 'Ido Akov', 'Eve J. Y. Lo', 'Hao Qi', 'Xi Jiang', 'Ben Segev', 'Jingxuan Fan', 'Sarah Martinson', 'Erik Y. Wang', 'Kaylie Hausknecht', 'Michael P. Brenner', 'Mao Mao', 'Yibo Jiang', 'Xinyu Zhang', 'David Avagian', 'Eshawn Jessica Scipio', 'Muhammad Rehan Siddiqi', 'Alon Ragoler', 'Justin Tan', 'Deepakkumar Patil', 'Rebeka Plecnik', 'Aaron Kirtland', 'Roselynn Grace Montecillo', 'Stephane Durand', 'Omer Faruk Bodur', 'Zahra Adoul', 'Mohamed Zekry', 'Guillaume Douville', 'Ali Karakoc', 'Tania C. B. Santos', 'Samir Shamseldeen', 'Loukmane Karim', 'Anna Liakhovitskaia', 'Nate Resman', 'Nicholas Farina', 'Juan Carlos Gonzalez', 'Gabe Maayan', 'Sarah Hoback', 'Rodrigo De Oliveira Pena', 'Glen Sherman', 'Hodjat Mariji', 'Rasoul Pouriamanesh', 'Wentao Wu', 'Gözdenur Demir', 'Sandra Mendoza', 'Ismail Alarab', 'Joshua Cole', 'Danyelle Ferreira', 'Bryan Johnson', 'Hsiaoyun Milliron', 'Mohammad Safdari', 'Liangti Dai', 'Siriphan Arthornthurasuk', 'Alexey Pronin', 'Jing Fan', 'Angel Ramirez-Trinidad', 'Ashley Cartwright', 'Daphiny Pottmaier', 'Omid Taheri', 'David Outevsky', 'Stanley Stepanic', 'Samuel Perry', 'Luke Askew', 'Raúl Adrián Huerta Rodríguez', 'Abdelkader Dendane', 'Sam Ali', 'Ricardo Lorena', 'Krishnamurthy Iyer', 'Sk Md Salauddin', 'Murat Islam', 'Juan Gonzalez', 'Josh Ducey', 'Russell Campbell', 'Maja Somrak', 'Vasilios Mavroudis', 'Eric Vergo', 'Juehang Qin', 'Benjámin Borbás', 'Eric Chu', 'Jack Lindsey', 'Anil Radhakrishnan', 'Antoine Jallon', 'I. M. J. McInnis', 'Alex Hoover', 'Sören Möller', 'Song Bian', 'John Lai', 'Tejal Patwardhan', 'Summer Yue', 'Alexandr Wang', 'Dan Hendrycks'], 'affiliations': ['Center for AI Safety', 'Scale AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.14249.jpg', 'data': {'categories': ['#benchmark', '#science', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'Новый рубеж для искусственного интеллекта: тест на пределе человеческих знаний', 'desc': "Статья представляет новый многомодальный бенчмарк для оценки возможностей больших языковых моделей (LLM) под названием 'Последний экзамен человечества' (HLE). HLE состоит из 3000 вопросов по различным предметам, разработанных экспертами со всего мира. Бенчмарк создан для преодоления ограничений существующих тестов, на которых современные LLM достигают точности более 90%. Результаты показывают, что современные LLM демонстрируют низкую точность на HLE, что указывает на значительный разрыв между их возможностями и экспертными знаниями человека."}, 'en': {'title': "Raising the Bar: Humanity's Last Exam for LLMs", 'desc': "This paper introduces a new benchmark called Humanity's Last Exam (HLE) to evaluate the capabilities of large language models (LLMs). HLE consists of 3,000 questions across various subjects, including mathematics and humanities, designed to be challenging for LLMs. Unlike existing benchmarks, HLE questions cannot be easily answered through internet searches, making them a better measure of true understanding. 
The results show that current state-of-the-art LLMs struggle with HLE, indicating a significant gap between their performance and that of expert humans."}, 'zh': {'title': '人类的最后考试:挑战LLM的极限', 'desc': '基准测试是跟踪大型语言模型(LLM)能力快速发展的重要工具。然而,现有的基准测试难度未能与LLM的进步相匹配,导致LLM在流行基准测试(如MMLU)上达到90%以上的准确率。为此,我们推出了人类的最后考试(HLE),这是一个涵盖广泛学科的多模态基准,旨在成为此类学术基准的最终版本。HLE包含3000个问题,涉及数学、人文学科和自然科学,旨在揭示当前LLM能力与专家人类水平之间的显著差距。'}}}, {'id': 'https://huggingface.co/papers/2501.13953', 'title': 'Redundancy Principles for MLLMs Benchmarks', 'url': 'https://huggingface.co/papers/2501.13953', 'abstract': "With the rapid iteration of Multi-modality Large Language Models (MLLMs) and the evolving demands of the field, the number of benchmarks produced annually has surged into the hundreds. The rapid growth has inevitably led to significant redundancy among benchmarks. Therefore, it is crucial to take a step back and critically assess the current state of redundancy and propose targeted principles for constructing effective MLLM benchmarks. In this paper, we focus on redundancy from three key perspectives: 1) Redundancy of benchmark capability dimensions, 2) Redundancy in the number of test questions, and 3) Cross-benchmark redundancy within specific domains. Through the comprehensive analysis over hundreds of MLLMs' performance across more than 20 benchmarks, we aim to quantitatively measure the level of redundancy lies in existing MLLM evaluations, provide valuable insights to guide the future development of MLLM benchmarks, and offer strategies to refine and address redundancy issues effectively.", 'score': 21, 'issue_id': 1877, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'f504e124f29e4140', 'authors': ['Zicheng Zhang', 'Xiangyu Zhao', 'Xinyu Fang', 'Chunyi Li', 'Xiaohong Liu', 'Xiongkuo Min', 'Haodong Duan', 'Kai Chen', 'Guangtao Zhai'], 'affiliations': ['Shanghai AI Lab', 'Shanghai Jiao Tong University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13953.jpg', 'data': {'categories': ['#benchmark', '#survey'], 'emoji': '🔍', 'ru': {'title': 'Борьба с избыточностью: оптимизация бенчмарков для мультимодальных языковых моделей', 'desc': 'Статья посвящена проблеме избыточности в бенчмарках для мультимодальных больших языковых моделей (MLLM). Авторы анализируют избыточность с трех ключевых аспектов: измерения возможностей, количество тестовых вопросов и пересечение между бенчмарками в конкретных областях. На основе анализа производительности MLLM на более чем 20 бенчмарках, исследователи предлагают количественно оценить уровень избыточности и дать рекомендации по улучшению бенчмарков. Цель работы - предоставить ценные идеи для будущего развития оценки MLLM и стратегии по устранению проблем избыточности.'}, 'en': {'title': 'Streamlining MLLM Benchmarks: Tackling Redundancy for Better Evaluation', 'desc': 'This paper examines the growing issue of redundancy in benchmarks for Multi-modality Large Language Models (MLLMs). It identifies three main types of redundancy: in the capabilities being tested, the number of test questions, and across different benchmarks within the same domain. By analyzing the performance of numerous MLLMs across over 20 benchmarks, the authors quantitatively measure the extent of this redundancy. 
The findings aim to inform the development of more effective benchmarks and provide strategies to reduce redundancy in future evaluations.'}, 'zh': {'title': '优化多模态大型语言模型基准测试,减少冗余', 'desc': '随着多模态大型语言模型(MLLMs)的快速发展,年度基准测试的数量激增,导致基准测试之间的冗余现象显著增加。本文从三个关键角度分析冗余问题:基准能力维度的冗余、测试问题数量的冗余以及特定领域内的跨基准冗余。通过对数百个MLLM在20多个基准测试中的表现进行综合分析,我们定量测量现有MLLM评估中的冗余水平。我们的目标是为未来MLLM基准的开发提供有价值的见解,并提出有效解决冗余问题的策略。'}}}, {'id': 'https://huggingface.co/papers/2501.14342', 'title': 'Chain-of-Retrieval Augmented Generation', 'url': 'https://huggingface.co/papers/2501.14342', 'abstract': "This paper introduces an approach for training o1-like RAG models that retrieve and reason over relevant information step by step before generating the final answer. Conventional RAG methods usually perform a single retrieval step before the generation process, which limits their effectiveness in addressing complex queries due to imperfect retrieval results. In contrast, our proposed method, CoRAG (Chain-of-Retrieval Augmented Generation), allows the model to dynamically reformulate the query based on the evolving state. To train CoRAG effectively, we utilize rejection sampling to automatically generate intermediate retrieval chains, thereby augmenting existing RAG datasets that only provide the correct final answer. At test time, we propose various decoding strategies to scale the model's test-time compute by controlling the length and number of sampled retrieval chains. Experimental results across multiple benchmarks validate the efficacy of CoRAG, particularly in multi-hop question answering tasks, where we observe more than 10 points improvement in EM score compared to strong baselines. On the KILT benchmark, CoRAG establishes a new state-of-the-art performance across a diverse range of knowledge-intensive tasks. Furthermore, we offer comprehensive analyses to understand the scaling behavior of CoRAG, laying the groundwork for future research aimed at developing factual and grounded foundation models.", 'score': 18, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': 'cd489ba1638c5496', 'authors': ['Liang Wang', 'Haonan Chen', 'Nan Yang', 'Xiaolong Huang', 'Zhicheng Dou', 'Furu Wei'], 'affiliations': ['Microsoft Corporation', 'Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.14342.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#rag', '#reasoning'], 'emoji': '🔗', 'ru': {'title': 'CoRAG: Пошаговый поиск для улучшения генерации ответов', 'desc': 'Статья представляет новый подход к обучению моделей извлечения и генерации (RAG), позволяющий выполнять пошаговый поиск и рассуждение перед генерацией окончательного ответа. Метод CoRAG (Chain-of-Retrieval Augmented Generation) позволяет модели динамически переформулировать запрос на основе развивающегося состояния. Для обучения CoRAG используется отбор с отклонением для автоматической генерации промежуточных цепочек поиска. Экспериментальные результаты показывают значительное улучшение производительности на различных бенчмарках, особенно в задачах многоэтапного ответа на вопросы.'}, 'en': {'title': 'CoRAG: Enhancing RAG with Dynamic Retrieval for Complex Queries', 'desc': 'This paper presents CoRAG, a novel approach for training retrieval-augmented generation (RAG) models that enhances their ability to handle complex queries. 
Unlike traditional RAG methods that rely on a single retrieval step, CoRAG employs a dynamic query reformulation process, allowing the model to retrieve information iteratively. The training process utilizes rejection sampling to create intermediate retrieval chains, enriching the dataset beyond just the final answers. Experimental results demonstrate that CoRAG significantly improves performance in multi-hop question answering tasks, achieving state-of-the-art results on the KILT benchmark.'}, 'zh': {'title': '动态检索,提升问答能力!', 'desc': '本文介绍了一种训练类似o1的RAG模型的新方法,该方法在生成最终答案之前逐步检索和推理相关信息。传统的RAG方法通常在生成过程之前只进行一次检索,这限制了它们在处理复杂查询时的有效性。我们提出的方法CoRAG(链式检索增强生成)允许模型根据不断变化的状态动态重构查询。通过使用拒绝采样自动生成中间检索链,我们有效地增强了现有的RAG数据集,从而在多跳问答任务中显著提高了模型的表现。'}}}, {'id': 'https://huggingface.co/papers/2501.14492', 'title': 'RealCritic: Towards Effectiveness-Driven Evaluation of Language Model Critiques', 'url': 'https://huggingface.co/papers/2501.14492', 'abstract': 'Critiques are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. In this work, we introduce a new benchmark designed to assess the critique capabilities of LLMs. Unlike existing benchmarks, which typically function in an open-loop fashion, our approach employs a closed-loop methodology that evaluates the quality of corrections generated from critiques. Moreover, the benchmark incorporates features such as self-critique, cross-critique, and iterative critique, which are crucial for distinguishing the abilities of advanced reasoning models from more classical ones. We implement this benchmark using eight challenging reasoning tasks. We have several interesting findings. First, despite demonstrating comparable performance in direct chain-of-thought generation, classical LLMs significantly lag behind the advanced reasoning-based model o1-mini across all critique scenarios. Second, in self-critique and iterative critique settings, classical LLMs may even underperform relative to their baseline capabilities. We hope that this benchmark will serve as a valuable resource to guide future advancements. The code and data are available at https://github.com/tangzhy/RealCritic.', 'score': 13, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '683923c8fb1958c2', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc.', 'Shenzhen Research Institute of Big Data', 'The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2501.14492.jpg', 'data': {'categories': ['#benchmark', '#interpretability', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Новый бенчмарк раскрывает истинный потенциал LLM в критическом мышлении', 'desc': 'Эта статья представляет новый бенчмарк для оценки способностей больших языковых моделей (LLM) к критике. В отличие от существующих бенчмарков, этот подход использует замкнутую методологию, оценивающую качество исправлений, сгенерированных на основе критики. Бенчмарк включает в себя самокритику, перекрестную критику и итеративную критику, что важно для различения способностей продвинутых моделей рассуждения от классических. 
Исследование показало, что классические LLM значительно отстают от продвинутых моделей рассуждения во всех сценариях критики, несмотря на сопоставимую производительность в прямой генерации цепочки рассуждений.'}, 'en': {'title': 'Enhancing LLMs Through Effective Critique Evaluation', 'desc': 'This paper focuses on improving Large Language Models (LLMs) by evaluating their critique capabilities, which are essential for self-improvement and providing feedback. The authors introduce a new benchmark that uses a closed-loop methodology to assess how well LLMs can generate corrections based on critiques. This benchmark includes features like self-critique, cross-critique, and iterative critique, allowing for a more nuanced evaluation of reasoning abilities. The findings reveal that advanced reasoning models outperform classical LLMs in critique scenarios, highlighting the need for better evaluation methods in machine learning.'}, 'zh': {'title': '提升LLMs性能的新基准评估批评能力', 'desc': '本文探讨了大型语言模型(LLMs)在批评能力方面的评估。我们提出了一种新的基准,采用闭环方法来评估批评生成的修正质量。该基准包括自我批评、交叉批评和迭代批评等特性,以区分高级推理模型与传统模型的能力。研究发现,尽管传统LLMs在直接思维生成方面表现相似,但在所有批评场景中,它们的表现明显落后于基于高级推理的模型o1-mini。'}}}, {'id': 'https://huggingface.co/papers/2501.14726', 'title': 'Relightable Full-Body Gaussian Codec Avatars', 'url': 'https://huggingface.co/papers/2501.14726', 'abstract': 'We propose Relightable Full-Body Gaussian Codec Avatars, a new approach for modeling relightable full-body avatars with fine-grained details including face and hands. The unique challenge for relighting full-body avatars lies in the large deformations caused by body articulation and the resulting impact on appearance caused by light transport. Changes in body pose can dramatically change the orientation of body surfaces with respect to lights, resulting in both local appearance changes due to changes in local light transport functions, as well as non-local changes due to occlusion between body parts. To address this, we decompose the light transport into local and non-local effects. Local appearance changes are modeled using learnable zonal harmonics for diffuse radiance transfer. Unlike spherical harmonics, zonal harmonics are highly efficient to rotate under articulation. This allows us to learn diffuse radiance transfer in a local coordinate frame, which disentangles the local radiance transfer from the articulation of the body. To account for non-local appearance changes, we introduce a shadow network that predicts shadows given precomputed incoming irradiance on a base mesh. This facilitates the learning of non-local shadowing between the body parts. Finally, we use a deferred shading approach to model specular radiance transfer and better capture reflections and highlights such as eye glints. 
We demonstrate that our approach successfully models both the local and non-local light transport required for relightable full-body avatars, with a superior generalization ability under novel illumination conditions and unseen poses.', 'score': 5, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '0072ce1869c715b7', 'authors': ['Shaofei Wang', 'Tomas Simon', 'Igor Santesteban', 'Timur Bagautdinov', 'Junxuan Li', 'Vasu Agrawal', 'Fabian Prada', 'Shoou-I Yu', 'Pace Nalbone', 'Matt Gramlich', 'Roman Lubachersky', 'Chenglei Wu', 'Javier Romero', 'Jason Saragih', 'Michael Zollhoefer', 'Andreas Geiger', 'Siyu Tang', 'Shunsuke Saito'], 'affiliations': ['Codec Avatars Lab, Meta, USA', 'ETH Zürich, Switzerland', 'University of Tübingen, Germany'], 'pdf_title_img': 'assets/pdf/title_img/2501.14726.jpg', 'data': {'categories': ['#cv', '#3d'], 'emoji': '🕴️', 'ru': {'title': 'Реалистичное освещение для полноразмерных цифровых аватаров', 'desc': 'Статья представляет новый подход к моделированию полноразмерных аватаров с возможностью изменения освещения, включая детализацию лица и рук. Авторы предлагают декомпозицию световых эффектов на локальные и нелокальные, используя обучаемые зональные гармоники для диффузного переноса освещения и специальную нейронную сеть для предсказания теней. Метод также включает отложенный шейдинг для моделирования зеркального переноса освещения. Результаты демонстрируют успешное моделирование как локального, так и нелокального переноса света для полноразмерных аватаров с улучшенной способностью к обобщению в новых условиях освещения и позах.'}, 'en': {'title': 'Realistic Relightable Avatars Through Advanced Light Transport Modeling', 'desc': 'This paper presents a novel method for creating relightable full-body avatars that capture intricate details like facial features and hands. The authors tackle the challenge of how body movements affect lighting and appearance by separating light transport into local and non-local effects. They utilize learnable zonal harmonics to efficiently model local changes in appearance due to body articulation, while a shadow network predicts non-local shadowing effects between body parts. The proposed approach enhances the realism of avatars under varying lighting conditions and poses, demonstrating improved generalization capabilities.'}, 'zh': {'title': '可重光照的全身头像建模新方法', 'desc': '我们提出了一种新的方法,称为可重光照的全身高斯编码头像,旨在建模具有细致面部和手部特征的全身头像。该方法解决了由于身体关节运动引起的大变形对外观的影响,特别是光传输的变化。我们将光传输分解为局部和非局部效应,使用可学习的区域谐波来建模局部外观变化,并引入阴影网络来预测身体部位之间的阴影。最终,我们采用延迟着色方法来建模镜面反射,以更好地捕捉反射和高光效果。'}}}, {'id': 'https://huggingface.co/papers/2501.14176', 'title': 'RL + Transformer = A General-Purpose Problem Solver', 'url': 'https://huggingface.co/papers/2501.14176', 'abstract': 'What if artificial intelligence could not only solve problems for which it was trained but also learn to teach itself to solve new problems (i.e., meta-learn)? In this study, we demonstrate that a pre-trained transformer fine-tuned with reinforcement learning over multiple episodes develops the ability to solve problems that it has never encountered before - an emergent ability called In-Context Reinforcement Learning (ICRL). This powerful meta-learner not only excels in solving unseen in-distribution environments with remarkable sample efficiency, but also shows strong performance in out-of-distribution environments. 
In addition, we show that it exhibits robustness to the quality of its training data, seamlessly stitches together behaviors from its context, and adapts to non-stationary environments. These behaviors demonstrate that an RL-trained transformer can iteratively improve upon its own solutions, making it an excellent general-purpose problem solver.', 'score': 4, 'issue_id': 1884, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '708deafdf9ddb570', 'authors': ['Micah Rentschler', 'Jesse Roberts'], 'affiliations': ['Tennessee Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.14176.jpg', 'data': {'categories': ['#training', '#transfer_learning', '#optimization', '#agi', '#rl'], 'emoji': '🧠', 'ru': {'title': 'Трансформер учится учиться: новые горизонты искусственного интеллекта', 'desc': 'Исследование демонстрирует, что предобученный трансформер, дообученный с помощью обучения с подкреплением, развивает способность решать новые задачи - так называемое контекстное обучение с подкреплением (ICRL). Эта мета-обучающаяся модель эффективно решает не только задачи из распределения обучающих данных, но и задачи вне этого распределения. Модель показывает устойчивость к качеству обучающих данных и способность адаптироваться к нестационарным средам. Это свидетельствует о том, что трансформер, обученный с подкреплением, может итеративно улучшать свои решения.'}, 'en': {'title': 'Empowering AI: Learning to Solve New Problems with In-Context Reinforcement Learning', 'desc': 'This paper explores the concept of In-Context Reinforcement Learning (ICRL), where a pre-trained transformer model learns to solve new problems through reinforcement learning. The model shows remarkable sample efficiency, allowing it to tackle unseen problems effectively, both in familiar and unfamiliar environments. It also demonstrates robustness to varying training data quality and adapts well to changing conditions. Overall, the study highlights the potential of RL-trained transformers as versatile problem solvers capable of self-improvement.'}, 'zh': {'title': '元学习:让AI自我解决新问题的能力', 'desc': '本研究展示了一种预训练的变换器模型,通过强化学习进行微调,能够解决之前未遇到过的问题,这种能力被称为上下文强化学习(ICRL)。这种强大的元学习者在处理未见过的环境时表现出色,具有显著的样本效率,并且在分布外环境中也表现良好。此外,它对训练数据的质量具有鲁棒性,能够无缝地结合上下文中的行为,并适应非平稳环境。这些特性表明,经过强化学习训练的变换器能够不断改进自己的解决方案,成为一种优秀的通用问题解决者。'}}}, {'id': 'https://huggingface.co/papers/2501.13687', 'title': 'Question Answering on Patient Medical Records with Private Fine-Tuned LLMs', 'url': 'https://huggingface.co/papers/2501.13687', 'abstract': 'Healthcare systems continuously generate vast amounts of electronic health records (EHRs), commonly stored in the Fast Healthcare Interoperability Resources (FHIR) standard. Despite the wealth of information in these records, their complexity and volume make it difficult for users to retrieve and interpret crucial health insights. Recent advances in Large Language Models (LLMs) offer a solution, enabling semantic question answering (QA) over medical data, allowing users to interact with their health records more effectively. However, ensuring privacy and compliance requires edge and private deployments of LLMs. This paper proposes a novel approach to semantic QA over EHRs by first identifying the most relevant FHIR resources for a user query (Task1) and subsequently answering the query based on these resources (Task2). We explore the performance of privately hosted, fine-tuned LLMs, evaluating them against benchmark models such as GPT-4 and GPT-4o. 
Our results demonstrate that fine-tuned LLMs, while 250x smaller in size, outperform GPT-4 family models by 0.55% in F1 score on Task1 and 42% on Meteor Task in Task2. Additionally, we examine advanced aspects of LLM usage, including sequential fine-tuning, model self-evaluation (narcissistic evaluation), and the impact of training data size on performance. The models and datasets are available here: https://huggingface.co/genloop', 'score': 3, 'issue_id': 1885, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '710359a2b4f5f274', 'authors': ['Sara Kothari', 'Ayush Gupta'], 'affiliations': ['Department of Computer Science Stanford University', 'Genloop Labs, Inc. Delaware, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.13687.jpg', 'data': {'categories': ['#open_source', '#multimodal', '#dataset', '#training', '#science', '#benchmark', '#healthcare'], 'emoji': '🏥', 'ru': {'title': 'Эффективный анализ медицинских данных с помощью LLM', 'desc': 'Статья представляет новый подход к семантическому вопросно-ответному анализу электронных медицинских карт (ЭМК) с использованием больших языковых моделей (LLM). Авторы предлагают двухэтапный метод: сначала идентифицируются релевантные ресурсы FHIR, затем на их основе формируется ответ на запрос пользователя. Исследование показывает, что дообученные LLM меньшего размера превосходят модели семейства GPT-4 по ряду метрик. Также рассматриваются продвинутые аспекты использования LLM, включая последовательную тонкую настройку и самооценку моделей.'}, 'en': {'title': 'Unlocking Health Insights with Fine-Tuned Language Models', 'desc': 'This paper addresses the challenge of extracting meaningful insights from electronic health records (EHRs) using Large Language Models (LLMs). It introduces a two-step approach for semantic question answering (QA) that first identifies relevant FHIR resources and then answers user queries based on those resources. The study evaluates privately hosted, fine-tuned LLMs against benchmark models like GPT-4, showing that these smaller models can outperform larger ones in specific tasks. Additionally, it explores advanced techniques such as sequential fine-tuning and the effects of training data size on model performance.'}, 'zh': {'title': '提升医疗数据问答的智能化与隐私保护', 'desc': '本论文提出了一种新的语义问答方法,旨在提高用户对电子健康记录(EHRs)的访问和理解。首先,通过识别与用户查询最相关的FHIR资源(任务1),然后基于这些资源回答查询(任务2)。研究表明,经过微调的私有托管大型语言模型(LLMs)在任务1的F1分数上比GPT-4模型高出0.55%,在任务2的Meteor任务上高出42%。此外,论文还探讨了模型的自我评估和训练数据规模对性能的影响。'}}}, {'id': 'https://huggingface.co/papers/2501.13925', 'title': 'GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing', 'url': 'https://huggingface.co/papers/2501.13925', 'abstract': 'Recent advances in large multimodal models (LMMs) have recognized fine-grained grounding as an imperative factor of visual understanding and dialogue. However, the benefits of such representation in LMMs are limited to the natural image domain, and these models perform poorly for remote sensing (RS). The distinct overhead viewpoint, scale variation, and presence of small objects in high-resolution RS imagery present a unique challenge in region-level comprehension. Moreover, the development of the grounding conversation capability of LMMs within RS is hindered by the lack of granular, RS domain-specific grounded data. Addressing these limitations, we propose GeoPixel - the first end-to-end high resolution RS-LMM that supports pixel-level grounding. 
This capability allows fine-grained visual perception by generating interleaved masks in conversation. GeoPixel supports up to 4K HD resolution in any aspect ratio, ideal for high-precision RS image analysis. To support the grounded conversation generation (GCG) in RS imagery, we curate a visually grounded dataset GeoPixelD through a semi-automated pipeline that utilizes set-of-marks prompting and spatial priors tailored for RS data to methodically control the data generation process. GeoPixel demonstrates superior performance in pixel-level comprehension, surpassing existing LMMs in both single-target and multi-target segmentation tasks. Our methodological ablation studies validate the effectiveness of each component in the overall architecture. Our code and data will be publicly released.', 'score': 3, 'issue_id': 1883, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '0c6257aa10e28148', 'authors': ['Akashah Shabbir', 'Mohammed Zumri', 'Mohammed Bennamoun', 'Fahad S. Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linkoping University', 'Mohamed bin Zayed University of AI', 'The University of Western Australia'], 'pdf_title_img': 'assets/pdf/title_img/2501.13925.jpg', 'data': {'categories': ['#open_source', '#architecture', '#dataset', '#multimodal', '#data', '#games', '#optimization'], 'emoji': '🛰️', 'ru': {'title': 'GeoPixel: Новый уровень детализации в анализе спутниковых снимков', 'desc': 'Статья представляет GeoPixel - первую модель для дистанционного зондирования с поддержкой пиксельного уровня детализации. Модель способна анализировать изображения высокого разрешения до 4K, что идеально подходит для точного анализа спутниковых снимков. Для обучения модели был создан специализированный датасет GeoPixelD с аннотациями на уровне пикселей. GeoPixel превосходит существующие мультимодальные модели в задачах сегментации как одиночных, так и множественных объектов на спутниковых снимках.'}, 'en': {'title': 'GeoPixel: Revolutionizing Remote Sensing with Pixel-Level Grounding', 'desc': "This paper introduces GeoPixel, a novel large multimodal model designed specifically for remote sensing imagery. It addresses the challenges of fine-grained grounding in high-resolution images, which are often complicated by factors like scale variation and small object presence. GeoPixel enhances visual understanding by enabling pixel-level grounding and generating interleaved masks during conversations. The authors also present a new dataset, GeoPixelD, which is tailored for remote sensing tasks and supports the model's grounded conversation capabilities."}, 'zh': {'title': 'GeoPixel:高分辨率遥感图像的像素级理解', 'desc': '最近,大型多模态模型(LMMs)的进展表明,细粒度的基础是视觉理解和对话的重要因素。然而,这些模型在遥感(RS)领域的表现较差,主要是由于遥感图像的独特挑战,如视角、尺度变化和小物体的存在。为了解决这些问题,我们提出了GeoPixel,这是第一个支持像素级基础的高分辨率RS-LMM,能够生成交错的掩码以实现细粒度的视觉感知。GeoPixel在单目标和多目标分割任务中表现优于现有的LMMs,展示了其在高精度遥感图像分析中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2403.14614', 'title': 'AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation', 'url': 'https://huggingface.co/papers/2403.14614', 'abstract': 'In the image acquisition process, various forms of degradation, including noise, haze, and rain, are frequently introduced. These degradations typically arise from the inherent limitations of cameras or unfavorable ambient conditions. To recover clean images from degraded versions, numerous specialized restoration methods have been developed, each targeting a specific type of degradation. 
Recently, all-in-one algorithms have garnered significant attention by addressing different types of degradations within a single model without requiring prior information of the input degradation type. However, these methods purely operate in the spatial domain and do not delve into the distinct frequency variations inherent to different degradation types. To address this gap, we propose an adaptive all-in-one image restoration network based on frequency mining and modulation. Our approach is motivated by the observation that different degradation types impact the image content on different frequency subbands, thereby requiring different treatments for each restoration task. Specifically, we first mine low- and high-frequency information from the input features, guided by the adaptively decoupled spectra of the degraded image. The extracted features are then modulated by a bidirectional operator to facilitate interactions between different frequency components. Finally, the modulated features are merged into the original input for a progressively guided restoration. With this approach, the model achieves adaptive reconstruction by accentuating the informative frequency subbands according to different input degradations. Extensive experiments demonstrate that the proposed method achieves state-of-the-art performance on different image restoration tasks, including denoising, dehazing, deraining, motion deblurring, and low-light image enhancement. Our code is available at https://github.com/c-yn/AdaIR.', 'score': 2, 'issue_id': 1883, 'pub_date': '2025-03-21', 'pub_date_card': {'ru': '21 марта', 'en': 'March 21', 'zh': '3月21日'}, 'hash': '54f7acd2a97e8313', 'authors': ['Yuning Cui', 'Syed Waqas Zamir', 'Salman Khan', 'Alois Knoll', 'Mubarak Shah', 'Fahad Shahbaz Khan'], 'affiliations': ['Australian National University', 'Inception Institute of Artificial Intelligence', 'Linköping University', 'Mohammed Bin Zayed University of AI', 'Technical University of Munich', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2403.14614.jpg', 'data': {'categories': ['#cv'], 'emoji': '🖼️', 'ru': {'title': 'Адаптивное восстановление изображений на основе частотного анализа', 'desc': 'Статья описывает новый метод адаптивного восстановления изображений, пострадавших от различных видов деградации (шум, туман, дождь и т.д.). Авторы предлагают нейронную сеть, которая анализирует частотные характеристики искажений и адаптивно применяет соответствующие методы обработки. Подход основан на извлечении низко- и высокочастотной информации из входных данных и их модуляции с помощью двунаправленного оператора. Эксперименты показывают, что предложенный метод достигает наилучших результатов в различных задачах восстановления изображений.'}, 'en': {'title': 'Adaptive Image Restoration through Frequency Modulation', 'desc': 'This paper presents an innovative image restoration network that adapts to various types of image degradation, such as noise, haze, and rain. Unlike traditional methods that focus solely on spatial domain processing, this approach utilizes frequency mining to identify and modulate low- and high-frequency information specific to each degradation type. By employing a bidirectional operator, the model enhances interactions between different frequency components, allowing for more effective restoration. 
The results show that this adaptive method outperforms existing techniques across multiple restoration tasks, demonstrating its versatility and effectiveness.'}, 'zh': {'title': '自适应一体化图像修复:频率驱动的创新', 'desc': '在图像获取过程中,常常会出现噪声、雾霾和雨水等各种退化形式。这些退化通常源于相机的固有限制或不利的环境条件。为了从退化图像中恢复清晰图像,已经开发了许多专门的修复方法。我们提出了一种基于频率挖掘和调制的自适应一体化图像修复网络,能够在单一模型中处理不同类型的退化,且无需输入退化类型的先验信息。'}}}, {'id': 'https://huggingface.co/papers/2411.19458', 'title': 'Multiview Equivariance Improves 3D Correspondence Understanding with Minimal Feature Finetuning', 'url': 'https://huggingface.co/papers/2411.19458', 'abstract': 'Vision foundation models, particularly the ViT family, have revolutionized image understanding by providing rich semantic features. However, despite their success in 2D comprehension, their abilities on grasping 3D spatial relationships are still unclear. In this work, we evaluate and enhance the 3D awareness of ViT-based models. We begin by systematically assessing their ability to learn 3D equivariant features, specifically examining the consistency of semantic embeddings across different viewpoints. Our findings indicate that improved 3D equivariance leads to better performance on various downstream tasks, including pose estimation, tracking, and semantic transfer. Building on this insight, we propose a simple yet effective finetuning strategy based on 3D correspondences, which significantly enhances the 3D correspondence understanding of existing vision models. Remarkably, even finetuning on a single object for just one iteration results in substantial performance gains. All code and resources will be made publicly available to support further advancements in 3D-aware vision models. Our code is available at https://github.com/qq456cvb/3DCorrEnhance.', 'score': 1, 'issue_id': 1883, 'pub_date': '2025-11-29', 'pub_date_card': {'ru': '29 ноября', 'en': 'November 29', 'zh': '11月29日'}, 'hash': 'df24163a81379619', 'authors': ['Yang You', 'Yixin Li', 'Congyue Deng', 'Yue Wang', 'Leonidas Guibas'], 'affiliations': ['Department of Computer Science, Stanford University, U.S.A.', 'Department of Computer Science, University of Southern California, U.S.A.'], 'pdf_title_img': 'assets/pdf/title_img/2411.19458.jpg', 'data': {'categories': ['#cv', '#open_source', '#3d', '#training'], 'emoji': '🧊', 'ru': {'title': 'Повышение 3D-осведомленности моделей компьютерного зрения', 'desc': 'Статья посвящена исследованию и улучшению понимания трехмерных пространственных отношений моделями компьютерного зрения, основанными на архитектуре ViT. Авторы оценивают способность этих моделей изучать 3D-эквивариантные признаки и обнаруживают, что улучшение 3D-эквивариантности приводит к повышению производительности в различных задачах. Они предлагают эффективную стратегию дообучения на основе 3D-соответствий, которая значительно улучшает понимание трехмерных соответствий существующими моделями. Даже минимальное дообучение на одном объекте приводит к существенному повышению производительности.'}, 'en': {'title': 'Enhancing 3D Awareness in Vision Transformers', 'desc': "This paper focuses on improving the 3D understanding capabilities of Vision Transformer (ViT) models, which are known for their strong performance in 2D image analysis. The authors evaluate how well these models can learn 3D equivariant features, which are essential for maintaining consistent semantic meanings across different viewpoints. They discover that enhancing 3D equivariance significantly boosts the models' performance on tasks like pose estimation and tracking. 
To achieve this, they introduce a straightforward finetuning method that leverages 3D correspondences, showing that even minimal finetuning can lead to notable improvements in 3D comprehension."}, 'zh': {'title': '提升视觉模型的3D理解能力', 'desc': '本文探讨了视觉基础模型,特别是ViT系列在图像理解中的应用,尤其是其在3D空间关系理解方面的能力。我们系统评估了这些模型学习3D等变特征的能力,重点分析了不同视角下语义嵌入的一致性。研究表明,提升3D等变性可以显著改善在姿态估计、跟踪和语义转移等下游任务中的表现。基于这一发现,我们提出了一种简单有效的微调策略,通过3D对应关系显著增强现有视觉模型的3D理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.11325', 'title': 'CatV2TON: Taming Diffusion Transformers for Vision-Based Virtual Try-On with Temporal Concatenation', 'url': 'https://huggingface.co/papers/2501.11325', 'abstract': 'Virtual try-on (VTON) technology has gained attention due to its potential to transform online retail by enabling realistic clothing visualization of images and videos. However, most existing methods struggle to achieve high-quality results across image and video try-on tasks, especially in long video scenarios. In this work, we introduce CatV2TON, a simple and effective vision-based virtual try-on (V2TON) method that supports both image and video try-on tasks with a single diffusion transformer model. By temporally concatenating garment and person inputs and training on a mix of image and video datasets, CatV2TON achieves robust try-on performance across static and dynamic settings. For efficient long-video generation, we propose an overlapping clip-based inference strategy that uses sequential frame guidance and Adaptive Clip Normalization (AdaCN) to maintain temporal consistency with reduced resource demands. We also present ViViD-S, a refined video try-on dataset, achieved by filtering back-facing frames and applying 3D mask smoothing for enhanced temporal consistency. Comprehensive experiments demonstrate that CatV2TON outperforms existing methods in both image and video try-on tasks, offering a versatile and reliable solution for realistic virtual try-ons across diverse scenarios.', 'score': 0, 'issue_id': 1887, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': '3b21eab627e1a9f7', 'authors': ['Zheng Chong', 'Wenqing Zhang', 'Shiyue Zhang', 'Jun Zheng', 'Xiao Dong', 'Haoxiang Li', 'Yiling Wu', 'Dongmei Jiang', 'Xiaodan Liang'], 'affiliations': ['National University of Singapore', 'Pengcheng Laboratory', 'Pixocial Technology', 'Sun Yat-Sen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11325.jpg', 'data': {'categories': ['#cv', '#multimodal', '#dataset', '#video'], 'emoji': '👚', 'ru': {'title': 'Универсальная виртуальная примерка для изображений и видео', 'desc': 'CatV2TON - это новый метод виртуальной примерки одежды, использующий диффузионный трансформер для обработки как изображений, так и видео. Метод применяет технику конкатенации входных данных и обучение на смешанном наборе изображений и видео для достижения высокого качества результатов. Для эффективной обработки длинных видео предложена стратегия инференса с перекрывающимися клипами и адаптивной нормализацией. Авторы также представили улучшенный датасет ViViD-S для задачи виртуальной примерки на видео.'}, 'en': {'title': 'Transforming Virtual Try-Ons with CatV2TON: One Model, Many Scenarios!', 'desc': 'This paper presents CatV2TON, a novel virtual try-on method that utilizes a single diffusion transformer model for both image and video applications. 
It addresses the challenges of achieving high-quality results in long video scenarios by employing an overlapping clip-based inference strategy, which enhances temporal consistency. The method is trained on a diverse dataset that includes both images and videos, allowing it to perform effectively in various settings. Experimental results show that CatV2TON outperforms existing techniques, making it a promising solution for realistic virtual clothing visualization.'}, 'zh': {'title': 'CatV2TON:高效的虚拟试穿解决方案', 'desc': '虚拟试穿(VTON)技术在在线零售中引起了广泛关注,因为它能够实现真实的服装可视化。现有的方法在图像和视频试穿任务中,尤其是长视频场景中,往往难以达到高质量的效果。我们提出的CatV2TON是一种简单有效的基于视觉的虚拟试穿方法,能够支持图像和视频试穿任务,并使用单一的扩散变换器模型。通过时间上连接服装和人物输入,并在混合的图像和视频数据集上进行训练,CatV2TON在静态和动态场景中都表现出强大的试穿性能。'}}}, {'id': 'https://huggingface.co/papers/2406.18516', 'title': 'Denoising as Adaptation: Noise-Space Domain Adaptation for Image Restoration', 'url': 'https://huggingface.co/papers/2406.18516', 'abstract': 'Although learning-based image restoration methods have made significant progress, they still struggle with limited generalization to real-world scenarios due to the substantial domain gap caused by training on synthetic data. Existing methods address this issue by improving data synthesis pipelines, estimating degradation kernels, employing deep internal learning, and performing domain adaptation and regularization. Previous domain adaptation methods have sought to bridge the domain gap by learning domain-invariant knowledge in either feature or pixel space. However, these techniques often struggle to extend to low-level vision tasks within a stable and compact framework. In this paper, we show that it is possible to perform domain adaptation via the noise space using diffusion models. In particular, by leveraging the unique property of how auxiliary conditional inputs influence the multi-step denoising process, we derive a meaningful diffusion loss that guides the restoration model in progressively aligning both restored synthetic and real-world outputs with a target clean distribution. We refer to this method as denoising as adaptation. To prevent shortcuts during joint training, we present crucial strategies such as channel-shuffling layer and residual-swapping contrastive learning in the diffusion model. They implicitly blur the boundaries between conditioned synthetic and real data and prevent the reliance of the model on easily distinguishable features. Experimental results on three classical image restoration tasks, namely denoising, deblurring, and deraining, demonstrate the effectiveness of the proposed method.', 'score': 0, 'issue_id': 1883, 'pub_date': '2025-06-26', 'pub_date_card': {'ru': '26 июня', 'en': 'June 26', 'zh': '6月26日'}, 'hash': 'ef06fd4cf15b3995', 'authors': ['Kang Liao', 'Zongsheng Yue', 'Zhouxia Wang', 'Chen Change Loy'], 'affiliations': ['S-Lab, Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2406.18516.jpg', 'data': {'categories': ['#training', '#diffusion', '#data', '#optimization', '#cv', '#transfer_learning'], 'emoji': '🖼️', 'ru': {'title': 'Адаптация домена через шум: новый подход к восстановлению изображений', 'desc': "Статья представляет новый метод адаптации домена для задач восстановления изображений с использованием диффузионных моделей. Авторы предлагают выполнять адаптацию через пространство шума, используя уникальные свойства многоступенчатого процесса удаления шума. 
Метод, названный 'denoising as adaptation', направляет модель восстановления на постепенное выравнивание как синтетических, так и реальных выходных данных с целевым чистым распределением. Экспериментальные результаты на задачах шумоподавления, устранения размытия и удаления дождя демонстрируют эффективность предложенного подхода."}, 'en': {'title': 'Bridging the Gap: Denoising as Adaptation for Image Restoration', 'desc': "This paper addresses the challenge of image restoration methods that struggle to generalize to real-world scenarios due to the gap between synthetic training data and real data. The authors propose a novel approach called 'denoising as adaptation' that utilizes diffusion models to perform domain adaptation in the noise space. By introducing a diffusion loss that aligns synthetic and real-world outputs, the method effectively guides the restoration process. Additionally, strategies like channel-shuffling and residual-swapping contrastive learning are implemented to enhance the model's robustness against overfitting to distinguishable features."}, 'zh': {'title': '去噪作为适应:提升图像恢复的领域适应能力', 'desc': '本文探讨了基于学习的图像恢复方法在真实场景中的泛化能力不足的问题,主要是由于训练数据与真实数据之间存在显著的领域差距。我们提出了一种新的领域适应方法,通过噪声空间利用扩散模型来实现,特别是利用辅助条件输入对多步去噪过程的影响,导出了一种有意义的扩散损失。该方法称为去噪作为适应,能够逐步对齐恢复的合成图像和真实图像。实验结果表明,该方法在去噪、去模糊和去雨等经典图像恢复任务中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.12948', 'title': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning', 'url': 'https://huggingface.co/papers/2501.12948', 'abstract': 'We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.', 'score': 94, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'cae642107ec57790', 'authors': ['DeepSeek-AI', 'Daya Guo', 'Dejian Yang', 'Haowei Zhang', 'Junxiao Song', 'Ruoyu Zhang', 'Runxin Xu', 'Qihao Zhu', 'Shirong Ma', 'Peiyi Wang', 'Xiao Bi', 'Xiaokang Zhang', 'Xingkai Yu', 'Yu Wu', 'Z. F. Wu', 'Zhibin Gou', 'Zhihong Shao', 'Zhuoshu Li', 'Ziyi Gao', 'Aixin Liu', 'Bing Xue', 'Bingxuan Wang', 'Bochao Wu', 'Bei Feng', 'Chengda Lu', 'Chenggang Zhao', 'Chengqi Deng', 'Chenyu Zhang', 'Chong Ruan', 'Damai Dai', 'Deli Chen', 'Dongjie Ji', 'Erhang Li', 'Fangyun Lin', 'Fucong Dai', 'Fuli Luo', 'Guangbo Hao', 'Guanting Chen', 'Guowei Li', 'H. Zhang', 'Han Bao', 'Hanwei Xu', 'Haocheng Wang', 'Honghui Ding', 'Huajian Xin', 'Huazuo Gao', 'Hui Qu', 'Hui Li', 'Jianzhong Guo', 'Jiashi Li', 'Jiawei Wang', 'Jingchang Chen', 'Jingyang Yuan', 'Junjie Qiu', 'Junlong Li', 'J. L. 
Cai', 'Jiaqi Ni', 'Jian Liang', 'Jin Chen', 'Kai Dong', 'Kai Hu', 'Kaige Gao', 'Kang Guan', 'Kexin Huang', 'Kuai Yu', 'Lean Wang', 'Lecong Zhang', 'Liang Zhao', 'Litong Wang', 'Liyue Zhang', 'Lei Xu', 'Leyi Xia', 'Mingchuan Zhang', 'Minghua Zhang', 'Minghui Tang', 'Meng Li', 'Miaojun Wang', 'Mingming Li', 'Ning Tian', 'Panpan Huang', 'Peng Zhang', 'Qiancheng Wang', 'Qinyu Chen', 'Qiushi Du', 'Ruiqi Ge', 'Ruisong Zhang', 'Ruizhe Pan', 'Runji Wang', 'R. J. Chen', 'R. L. Jin', 'Ruyi Chen', 'Shanghao Lu', 'Shangyan Zhou', 'Shanhuang Chen', 'Shengfeng Ye', 'Shiyu Wang', 'Shuiping Yu', 'Shunfeng Zhou', 'Shuting Pan', 'S. S. Li', 'Shuang Zhou', 'Shaoqing Wu', 'Shengfeng Ye', 'Tao Yun', 'Tian Pei', 'Tianyu Sun', 'T. Wang', 'Wangding Zeng', 'Wanjia Zhao', 'Wen Liu', 'Wenfeng Liang', 'Wenjun Gao', 'Wenqin Yu', 'Wentao Zhang', 'W. L. Xiao', 'Wei An', 'Xiaodong Liu', 'Xiaohan Wang', 'Xiaokang Chen', 'Xiaotao Nie', 'Xin Cheng', 'Xin Liu', 'Xin Xie', 'Xingchao Liu', 'Xinyu Yang', 'Xinyuan Li', 'Xuecheng Su', 'Xuheng Lin', 'X. Q. Li', 'Xiangyue Jin', 'Xiaojin Shen', 'Xiaosha Chen', 'Xiaowen Sun', 'Xiaoxiang Wang', 'Xinnan Song', 'Xinyi Zhou', 'Xianzu Wang', 'Xinxia Shan', 'Y. K. Li', 'Y. Q. Wang', 'Y. X. Wei', 'Yang Zhang', 'Yanhong Xu', 'Yao Li', 'Yao Zhao', 'Yaofeng Sun', 'Yaohui Wang', 'Yi Yu', 'Yichao Zhang', 'Yifan Shi', 'Yiliang Xiong', 'Ying He', 'Yishi Piao', 'Yisong Wang', 'Yixuan Tan', 'Yiyang Ma', 'Yiyuan Liu', 'Yongqiang Guo', 'Yuan Ou', 'Yuduan Wang', 'Yue Gong', 'Yuheng Zou', 'Yujia He', 'Yunfan Xiong', 'Yuxiang Luo', 'Yuxiang You', 'Yuxuan Liu', 'Yuyang Zhou', 'Y. X. Zhu', 'Yanhong Xu', 'Yanping Huang', 'Yaohui Li', 'Yi Zheng', 'Yuchen Zhu', 'Yunxian Ma', 'Ying Tang', 'Yukun Zha', 'Yuting Yan', 'Z. Z. Ren', 'Zehui Ren', 'Zhangli Sha', 'Zhe Fu', 'Zhean Xu', 'Zhenda Xie', 'Zhengyan Zhang', 'Zhewen Hao', 'Zhicheng Ma', 'Zhigang Yan', 'Zhiyu Wu', 'Zihui Gu', 'Zijia Zhu', 'Zijun Liu', 'Zilin Li', 'Ziwei Xie', 'Ziyang Song', 'Zizheng Pan', 'Zhen Huang', 'Zhipeng Xu', 'Zhongyu Zhang', 'Zhen Zhang'], 'affiliations': ['DeepSeek-AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.12948.jpg', 'data': {'categories': ['#training', '#rl', '#reasoning', '#open_source', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Новое поколение моделей рассуждения: обучение с подкреплением открывает путь к улучшенному ИИ', 'desc': 'Исследователи представили модели рассуждений DeepSeek-R1-Zero и DeepSeek-R1. DeepSeek-R1-Zero обучена с помощью масштабного обучения с подкреплением без предварительной тонкой настройки и демонстрирует впечатляющие способности к рассуждению. DeepSeek-R1 использует многоэтапное обучение для улучшения производительности и решения проблем читаемости. Модели показывают результаты, сравнимые с OpenAI-o1-1217 на задачах рассуждения, и исследователи открыли исходный код моделей для научного сообщества.'}, 'en': {'title': 'Revolutionizing Reasoning with DeepSeek Models', 'desc': 'This paper presents two reasoning models, DeepSeek-R1-Zero and DeepSeek-R1, developed for enhanced reasoning capabilities. DeepSeek-R1-Zero is trained using large-scale reinforcement learning without any supervised fine-tuning, showcasing impressive reasoning behaviors but facing issues like readability and language mixing. To improve these aspects, DeepSeek-R1 employs a multi-stage training approach and utilizes cold-start data prior to reinforcement learning. 
The performance of DeepSeek-R1 is on par with existing models like OpenAI-o1-1217, and both models, along with several distilled versions, are made available to the research community.'}, 'zh': {'title': '深度推理模型的创新与挑战', 'desc': '我们介绍了第一代推理模型DeepSeek-R1-Zero和DeepSeek-R1。DeepSeek-R1-Zero是通过大规模强化学习(RL)训练的模型,没有经过监督微调(SFT),展现出卓越的推理能力。尽管如此,它在可读性和语言混合方面存在一些挑战。为了解决这些问题并进一步提升推理性能,我们引入了DeepSeek-R1,该模型在进行RL之前采用了多阶段训练和冷启动数据。'}}}, {'id': 'https://huggingface.co/papers/2501.12909', 'title': 'FilmAgent: A Multi-Agent Framework for End-to-End Film Automation in Virtual 3D Spaces', 'url': 'https://huggingface.co/papers/2501.12909', 'abstract': "Virtual film production requires intricate decision-making processes, including scriptwriting, virtual cinematography, and precise actor positioning and actions. Motivated by recent advances in automated decision-making with language agent-based societies, this paper introduces FilmAgent, a novel LLM-based multi-agent collaborative framework for end-to-end film automation in our constructed 3D virtual spaces. FilmAgent simulates various crew roles, including directors, screenwriters, actors, and cinematographers, and covers key stages of a film production workflow: (1) idea development transforms brainstormed ideas into structured story outlines; (2) scriptwriting elaborates on dialogue and character actions for each scene; (3) cinematography determines the camera setups for each shot. A team of agents collaborates through iterative feedback and revisions, thereby verifying intermediate scripts and reducing hallucinations. We evaluate the generated videos on 15 ideas and 4 key aspects. Human evaluation shows that FilmAgent outperforms all baselines across all aspects and scores 3.98 out of 5 on average, showing the feasibility of multi-agent collaboration in filmmaking. Further analysis reveals that FilmAgent, despite using the less advanced GPT-4o model, surpasses the single-agent o1, showing the advantage of a well-coordinated multi-agent system. Lastly, we discuss the complementary strengths and weaknesses of OpenAI's text-to-video model Sora and our FilmAgent in filmmaking.", 'score': 43, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '0b73908eee2c2db3', 'authors': ['Zhenran Xu', 'Longyue Wang', 'Jifang Wang', 'Zhouyi Li', 'Senbao Shi', 'Xue Yang', 'Yiyu Wang', 'Baotian Hu', 'Jun Yu', 'Min Zhang'], 'affiliations': ['Harbin Institute of Technology (Shenzhen)', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12909.jpg', 'data': {'categories': ['#multimodal', '#story_generation', '#3d', '#open_source', '#agents', '#hallucinations'], 'emoji': '🎬', 'ru': {'title': 'Виртуальная киностудия: ИИ-агенты создают фильмы от идеи до готового продукта', 'desc': 'FilmAgent - это новая система на основе языковых моделей для автоматизации создания фильмов в виртуальном 3D-пространстве. Она симулирует работу съемочной группы, включая режиссеров, сценаристов, актеров и операторов. Система охватывает ключевые этапы производства фильма: разработку идеи, написание сценария и выбор планов съемки. 
FilmAgent использует многоагентное взаимодействие для итеративной доработки результатов, что позволяет достичь лучшего качества по сравнению с одноагентными подходами.'}, 'en': {'title': 'Revolutionizing Film Production with Multi-Agent Collaboration', 'desc': 'This paper presents FilmAgent, a collaborative framework that utilizes large language models (LLMs) to automate the film production process in 3D virtual environments. FilmAgent employs multiple agents that simulate various roles in filmmaking, such as directors and screenwriters, to collaboratively develop ideas, write scripts, and plan cinematography. The framework enhances decision-making through iterative feedback, which helps to refine scripts and minimize errors. Evaluation results indicate that FilmAgent significantly outperforms traditional methods, demonstrating the effectiveness of multi-agent systems in creative tasks like filmmaking.'}, 'zh': {'title': '多智能体协作,革新虚拟电影制作', 'desc': '这篇论文介绍了一种名为FilmAgent的新型多智能体协作框架,旨在实现虚拟电影制作的自动化。FilmAgent利用大型语言模型(LLM)模拟导演、编剧、演员和摄影师等不同角色,涵盖电影制作的关键阶段,包括创意开发、剧本写作和摄影。通过智能体之间的迭代反馈和修订,FilmAgent能够验证中间剧本并减少错误。评估结果显示,FilmAgent在多个方面的表现优于所有基线模型,证明了多智能体协作在电影制作中的可行性。'}}}, {'id': 'https://huggingface.co/papers/2501.12895', 'title': 'Test-Time Preference Optimization: On-the-Fly Alignment via Iterative Textual Feedback', 'url': 'https://huggingface.co/papers/2501.12895', 'abstract': 'Large language models (LLMs) demonstrate impressive performance but lack the flexibility to adapt to human preferences quickly without retraining. In this work, we introduce Test-time Preference Optimization (TPO), a framework that aligns LLM outputs with human preferences during inference, removing the need to update model parameters. Rather than relying on purely numerical rewards, TPO translates reward signals into textual critiques and uses them as textual rewards to iteratively refine its response. Evaluations on benchmarks covering instruction following, preference alignment, safety, and mathematics reveal that TPO progressively improves alignment with human preferences. Notably, after only a few TPO steps, the initially unaligned Llama-3.1-70B-SFT model can surpass the aligned counterpart, Llama-3.1-70B-Instruct. Furthermore, TPO scales efficiently with both the search width and depth during inference. Through case studies, we illustrate how TPO exploits the innate capacity of LLM to interpret and act upon reward signals. Our findings establish TPO as a practical, lightweight alternative for test-time preference optimization, achieving alignment on the fly. Our code is publicly available at https://github.com/yafuly/TPO.', 'score': 40, 'issue_id': 1820, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'ebde6f173ad4f6f9', 'authors': ['Yafu Li', 'Xuyang Hu', 'Xiaoye Qu', 'Linjie Li', 'Yu Cheng'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.12895.jpg', 'data': {'categories': ['#rlhf', '#training', '#alignment', '#inference'], 'emoji': '🎯', 'ru': {'title': 'Адаптация языковых моделей на лету: оптимизация без переобучения', 'desc': 'Авторы представляют новый метод под названием Test-time Preference Optimization (TPO), который позволяет адаптировать выходные данные больших языковых моделей (LLM) к предпочтениям человека во время вывода, без необходимости обновления параметров модели. 
TPO преобразует сигналы вознаграждения в текстовые критические замечания и использует их в качестве текстовых наград для итеративного улучшения ответа. Эксперименты показывают, что TPO постепенно улучшает соответствие предпочтениям человека, причем даже изначально не настроенная модель Llama-3.1-70B-SFT может превзойти настроенный аналог после нескольких шагов TPO. Метод демонстрирует эффективность и масштабируемость, представляя собой практичную альтернативу для оптимизации предпочтений во время вывода.'}, 'en': {'title': 'Aligning Language Models with Human Preferences on the Fly', 'desc': 'This paper presents Test-time Preference Optimization (TPO), a novel framework designed to enhance the alignment of large language model (LLM) outputs with human preferences during inference without the need for retraining. TPO utilizes textual critiques as a form of reward signals, allowing the model to iteratively refine its responses based on human feedback. The results show that TPO can significantly improve the performance of the Llama-3.1-70B-SFT model, enabling it to exceed the performance of the pre-aligned Llama-3.1-70B-Instruct model after just a few optimization steps. Additionally, TPO demonstrates efficient scaling with search width and depth, making it a practical solution for real-time preference alignment in LLMs.'}, 'zh': {'title': '测试时偏好优化:让模型更懂你', 'desc': '大型语言模型(LLMs)在性能上表现出色,但在不重新训练的情况下,难以快速适应人类偏好。我们提出了一种名为测试时偏好优化(TPO)的框架,它在推理过程中将LLM的输出与人类偏好对齐,避免了更新模型参数的需求。TPO通过将奖励信号转化为文本批评,并将其作为文本奖励,逐步优化模型的响应。评估结果显示,经过少量TPO步骤后,最初未对齐的Llama-3.1-70B-SFT模型能够超越已对齐的Llama-3.1-70B-Instruct模型。'}}}, {'id': 'https://huggingface.co/papers/2501.13106', 'title': 'VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding', 'url': 'https://huggingface.co/papers/2501.13106', 'abstract': 'In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation model for image and video understanding. The core design philosophy of VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the vision-centric training paradigm and vision-centric framework design. The key insight of our vision-centric training paradigm is that high-quality image-text data is crucial for both image and video understanding. Instead of preparing massive video-text datasets, we focus on constructing large-scale and high-quality image-text datasets. VideoLLaMA3 has four training stages: 1) vision-centric alignment stage, which warms up the vision encoder and projector; 2) vision-language pretraining stage, which jointly tunes the vision encoder, projector, and LLM with large-scale image-text data covering multiple types (including scene images, documents, charts) as well as text-only data. 3) multi-task fine-tuning stage, which incorporates image-text SFT data for downstream tasks and video-text data to establish a foundation for video understanding. 4) video-centric fine-tuning, which further improves the model\'s capability in video understanding. As for the framework design, to better capture fine-grained details in images, the pretrained vision encoder is adapted to encode images of varying sizes into vision tokens with corresponding numbers, rather than a fixed number of tokens. For video inputs, we reduce the number of vision tokens according to their similarity so that the representation of videos will be more precise and compact. 
Benefiting from vision-centric designs, VideoLLaMA3 achieves compelling performance in both image and video understanding benchmarks.', 'score': 39, 'issue_id': 1820, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'd22ea6b804e73c9a', 'authors': ['Boqiang Zhang', 'Kehan Li', 'Zesen Cheng', 'Zhiqiang Hu', 'Yuqian Yuan', 'Guanzheng Chen', 'Sicong Leng', 'Yuming Jiang', 'Hang Zhang', 'Xin Li', 'Peng Jin', 'Wenqi Zhang', 'Fan Wang', 'Lidong Bing', 'Deli Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.13106.jpg', 'data': {'categories': ['#multimodal', '#cv', '#agi', '#games', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'VideoLLaMA3: Зрение как ключ к пониманию изображений и видео', 'desc': 'VideoLLaMA3 - это усовершенствованная мультимодальная модель для понимания изображений и видео. Ключевая особенность модели - ориентированность на зрение, что проявляется как в парадигме обучения, так и в архитектуре. Модель обучается в четыре этапа, уделяя особое внимание высококачественным данным изображение-текст. VideoLLaMA3 использует адаптивное кодирование изображений разного размера и сжатие представления видео для более точного анализа.'}, 'en': {'title': 'Empowering Image and Video Understanding with Vision-Centric Design', 'desc': 'VideoLLaMA3 is a cutting-edge multimodal foundation model designed for understanding images and videos. It emphasizes a vision-centric approach, which involves training with high-quality image-text datasets instead of large video-text datasets. The model undergoes four training stages, including alignment, pretraining, fine-tuning, and video-centric fine-tuning, to enhance its capabilities in both image and video comprehension. By adapting the vision encoder to handle varying image sizes and optimizing video token representation, VideoLLaMA3 demonstrates impressive performance across various benchmarks.'}, 'zh': {'title': '以视觉为中心的多模态理解模型', 'desc': '本文提出了VideoLLaMA3,这是一个更先进的多模态基础模型,用于图像和视频理解。其核心设计理念是以视觉为中心,强调高质量的图像-文本数据对图像和视频理解的重要性。VideoLLaMA3的训练分为四个阶段,包括视觉对齐、视觉-语言预训练、多任务微调和视频微调,以提升模型在视频理解方面的能力。通过适应性地编码不同大小的图像和优化视频输入的表示,VideoLLaMA3在图像和视频理解基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.12599', 'title': 'Kimi k1.5: Scaling Reinforcement Learning with LLMs', 'url': 'https://huggingface.co/papers/2501.12599', 'abstract': "Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simplistic, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. 
Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI's o1. Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%).", 'score': 33, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '427fb9e286a6e3a8', 'authors': ['Kimi Team', 'Angang Du', 'Bofei Gao', 'Bowei Xing', 'Changjiu Jiang', 'Cheng Chen', 'Cheng Li', 'Chenjun Xiao', 'Chenzhuang Du', 'Chonghua Liao', 'Chuning Tang', 'Congcong Wang', 'Dehao Zhang', 'Enming Yuan', 'Enzhe Lu', 'Fengxiang Tang', 'Flood Sung', 'Guangda Wei', 'Guokun Lai', 'Haiqing Guo', 'Han Zhu', 'Hao Ding', 'Hao Hu', 'Hao Yang', 'Hao Zhang', 'Haotian Yao', 'Haotian Zhao', 'Haoyu Lu', 'Haoze Li', 'Haozhen Yu', 'Hongcheng Gao', 'Huabin Zheng', 'Huan Yuan', 'Jia Chen', 'Jianhang Guo', 'Jianlin Su', 'Jianzhou Wang', 'Jie Zhao', 'Jin Zhang', 'Jingyuan Liu', 'Junjie Yan', 'Junyan Wu', 'Lidong Shi', 'Ling Ye', 'Longhui Yu', 'Mengnan Dong', 'Neo Zhang', 'Ningchen Ma', 'Qiwei Pan', 'Qucheng Gong', 'Shaowei Liu', 'Shengling Ma', 'Shupeng Wei', 'Sihan Cao', 'Siying Huang', 'Tao Jiang', 'Weihao Gao', 'Weimin Xiong', 'Weiran He', 'Weixiao Huang', 'Wenhao Wu', 'Wenyang He', 'Xianghui Wei', 'Xianqing Jia', 'Xingzhe Wu', 'Xinran Xu', 'Xinxing Zu', 'Xinyu Zhou', 'Xuehai Pan', 'Y. Charles', 'Yang Li', 'Yangyang Hu', 'Yangyang Liu', 'Yanru Chen', 'Yejie Wang', 'Yibo Liu', 'Yidao Qin', 'Yifeng Liu', 'Ying Yang', 'Yiping Bao', 'Yulun Du', 'Yuxin Wu', 'Yuzhi Wang', 'Zaida Zhou', 'Zhaoji Wang', 'Zhaowei Li', 'Zhen Zhu', 'Zheng Zhang', 'Zhexu Wang', 'Zhilin Yang', 'Zhiqi Huang', 'Zihao Huang', 'Ziyao Xu', 'Zonghan Yang'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.12599.jpg', 'data': {'categories': ['#multimodal', '#optimization', '#training', '#benchmark', '#rl', '#reasoning', '#long_context', '#math'], 'emoji': '🤖', 'ru': {'title': 'Эффективное обучение с подкреплением для многомодальных языковых моделей', 'desc': "Статья описывает обучение многомодальной языковой модели Kimi k1.5 с использованием обучения с подкреплением (RL). Авторы представляют упрощенный эффективный подход к RL без использования сложных техник, таких как поиск по дереву Монте-Карло. Ключевыми элементами являются масштабирование на длинный контекст и улучшенные методы оптимизации политики. Модель достигает передовых результатов по рассуждению на различных бенчмарках и модальностях, сравнимых с OpenAI's o1."}, 'en': {'title': 'Unlocking AI Potential with Reinforcement Learning in LLMs', 'desc': 'This paper discusses the development of Kimi k1.5, a multi-modal large language model (LLM) that utilizes reinforcement learning (RL) to enhance its training data exploration through reward mechanisms. The authors highlight their innovative RL training techniques and infrastructure optimizations that allow for effective long context scaling and policy optimization without complex methods like Monte Carlo tree search. Kimi k1.5 achieves state-of-the-art performance on various reasoning benchmarks, demonstrating its competitive edge over existing models. 
Additionally, the paper introduces long2short methods that leverage long-context techniques to significantly improve short-context reasoning results, outperforming other models by a substantial margin.'}, 'zh': {'title': '强化学习助力大语言模型的突破', 'desc': '本文介绍了Kimi k1.5的训练实践,这是一种最新的多模态大语言模型,采用强化学习(RL)进行训练。我们的方法通过长上下文扩展和改进的策略优化,建立了一个简单有效的RL框架,而不依赖于复杂的技术,如蒙特卡洛树搜索和价值函数。Kimi k1.5在多个基准测试中表现出色,达到了最先进的推理性能,超越了现有的短链推理模型。我们的研究表明,利用长链技术可以显著提升短链模型的表现,取得了显著的进步。'}}}, {'id': 'https://huggingface.co/papers/2501.13074', 'title': 'Autonomy-of-Experts Models', 'url': 'https://huggingface.co/papers/2501.13074', 'abstract': "Mixture-of-Experts (MoE) models mostly use a router to assign tokens to specific expert modules, activating only partial parameters and often outperforming dense models. We argue that the separation between the router's decision-making and the experts' execution is a critical yet overlooked issue, leading to suboptimal expert selection and ineffective learning. To address this, we propose Autonomy-of-Experts (AoE), a novel MoE paradigm in which experts autonomously select themselves to process inputs. AoE is based on the insight that an expert is aware of its own capacity to effectively process a token, an awareness reflected in the scale of its internal activations. In AoE, routers are removed; instead, experts pre-compute internal activations for inputs and are ranked based on their activation norms. Only the top-ranking experts proceed with the forward pass, while the others abort. The overhead of pre-computing activations is reduced through a low-rank weight factorization. This self-evaluating-then-partner-comparing approach ensures improved expert selection and effective learning. We pre-train language models having 700M up to 4B parameters, demonstrating that AoE outperforms traditional MoE models with comparable efficiency.", 'score': 29, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '5cf511144ad54091', 'authors': ['Ang Lv', 'Ruobing Xie', 'Yining Qian', 'Songhao Wu', 'Xingwu Sun', 'Zhanhui Kang', 'Di Wang', 'Rui Yan'], 'affiliations': ['Machine Learning Platform Department, Tencent', 'Renmin University of China', 'Southeast University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.13074.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Самоотбор экспертов: новый подход к эффективным нейросетям', 'desc': 'Статья представляет новый подход к моделям Mixture-of-Experts (MoE) под названием Autonomy-of-Experts (AoE). В AoE эксперты самостоятельно выбирают себя для обработки входных данных, основываясь на внутренних активациях, что устраняет необходимость в отдельном маршрутизаторе. Этот метод обеспечивает более эффективный выбор экспертов и улучшенное обучение. Эксперименты с языковыми моделями от 700 млн до 4 млрд параметров показывают, что AoE превосходит традиционные модели MoE при сопоставимой эффективности.'}, 'en': {'title': 'Empowering Experts: Self-Selection for Enhanced Learning in MoE Models', 'desc': 'This paper introduces a new approach called Autonomy-of-Experts (AoE) for Mixture-of-Experts (MoE) models, which traditionally rely on a router to assign tasks to expert modules. The authors argue that the separation of decision-making and execution in MoE leads to poor expert selection and learning inefficiencies. 
In AoE, experts autonomously evaluate their ability to process inputs based on their internal activations, eliminating the need for a router. By allowing only the most capable experts to participate in processing, AoE enhances expert selection and improves overall model performance while maintaining efficiency.'}, 'zh': {'title': '自主选择,提升专家学习效率', 'desc': '混合专家模型(MoE)通常使用路由器将输入分配给特定的专家模块,仅激活部分参数,通常比密集模型表现更好。我们认为,路由器的决策与专家的执行之间的分离是一个关键但被忽视的问题,导致专家选择不佳和学习效果不理想。为了解决这个问题,我们提出了自主专家(AoE),一种新颖的MoE范式,其中专家自主选择自己处理输入。AoE基于专家能够意识到自身处理能力的洞察,通过内部激活的规模反映出来,从而确保了更好的专家选择和有效学习。'}}}, {'id': 'https://huggingface.co/papers/2501.13007', 'title': 'Pairwise RM: Perform Best-of-N Sampling with Knockout Tournament', 'url': 'https://huggingface.co/papers/2501.13007', 'abstract': "Best-of-N (BoN) sampling, a common strategy for test-time scaling of Large Language Models (LLMs), relies on reward models to select the best candidate solution from multiple generations. However, traditional reward models often assign arbitrary and inconsistent scores, limiting their effectiveness. To address this, we propose a Pairwise Reward Model (Pairwise RM) combined with a knockout tournament for BoN sampling. Instead of assigning absolute scores, given one math problem, Pairwise RM evaluates two candidate solutions' correctness simultaneously. This approach eliminates the need for arbitrary scoring and enables cross-validation of solutions through parallel comparison. In the knockout tournament, Pairwise RM conducts pairwise comparisons between candidate solutions and eliminates the incorrect ones iteratively. We construct a large-scale dataset of 443K pairwise comparisons derived from NumiaMath and annotated using gemini-1.5-flash, and train the Pairwise RM via supervised fine-tuning. Experiments on MATH-500 and the Olympiad Bench demonstrate significant improvements over traditional discriminative reward models. A 40% to 60% relative improvement is achieved on the top 50% most challenging problems.", 'score': 13, 'issue_id': 1821, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'a34210b73ec25875', 'authors': ['Yantao Liu', 'Zijun Yao', 'Rui Min', 'Yixin Cao', 'Lei Hou', 'Juanzi Li'], 'affiliations': ['Fudan University', 'Hong Kong University of Science and Technology', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13007.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#dataset', '#math', '#rlhf'], 'emoji': '🏆', 'ru': {'title': 'Попарное сравнение вместо абсолютных оценок: новый подход к выбору лучшего решения в LLM', 'desc': 'Эта статья представляет новый подход к выбору лучшего решения из нескольких вариантов, генерируемых большими языковыми моделями (LLM). Авторы предлагают использовать попарную модель вознаграждения (Pairwise Reward Model) в сочетании с турниром на выбывание для Best-of-N сэмплирования. Этот метод позволяет избежать произвольного назначения баллов и обеспечивает перекрестную проверку решений через параллельное сравнение. Эксперименты показали значительное улучшение результатов по сравнению с традиционными дискриминативными моделями вознаграждения, особенно на сложных задачах.'}, 'en': {'title': 'Enhancing Solution Selection with Pairwise Comparisons', 'desc': 'This paper introduces a new method called Pairwise Reward Model (Pairwise RM) to improve the selection process in Best-of-N (BoN) sampling for Large Language Models (LLMs). 
Instead of giving arbitrary scores to candidate solutions, Pairwise RM compares two solutions at a time to determine which one is more correct. This method allows for better validation of solutions through direct comparison and eliminates inconsistencies in scoring. The authors also created a large dataset of 443,000 pairwise comparisons to train the model, resulting in significant performance improvements on challenging math problems compared to traditional reward models.'}, 'zh': {'title': '成对奖励模型:提升大型语言模型的选择能力', 'desc': '本文提出了一种新的奖励模型,称为成对奖励模型(Pairwise RM),用于大型语言模型的最佳N(BoN)采样。传统的奖励模型常常给出任意且不一致的分数,限制了其有效性。成对奖励模型通过同时评估两个候选解的正确性,消除了对任意评分的需求,并通过并行比较实现了解决方案的交叉验证。我们构建了一个包含443K成对比较的大规模数据集,并通过监督微调训练了成对奖励模型,实验结果显示其在解决数学问题时显著优于传统的判别奖励模型。'}}}, {'id': 'https://huggingface.co/papers/2501.12570', 'title': 'O1-Pruner: Length-Harmonizing Fine-Tuning for O1-Like Reasoning Pruning', 'url': 'https://huggingface.co/papers/2501.12570', 'abstract': "Recently, long-thought reasoning LLMs, such as OpenAI's O1, adopt extended reasoning processes similar to how humans ponder over complex problems. This reasoning paradigm significantly enhances the model's problem-solving abilities and has achieved promising results. However, long-thought reasoning process leads to a substantial increase in inference time. A pressing challenge is reducing the inference overhead of long-thought LLMs while ensuring accuracy. In this paper, we experimentally demonstrate that long-thought reasoning models struggle to effectively allocate token budgets based on problem difficulty and reasoning redundancies. To address this, we propose Length-Harmonizing Fine-Tuning (O1-Pruner), aiming at minimizing reasoning overhead while maintaining accuracy. This effective fine-tuning method first estimates the LLM's baseline performance through pre-sampling and then uses RL-style fine-tuning to encourage the model to generate shorter reasoning processes under accuracy constraints. This allows the model to achieve efficient reasoning with lower redundancy while maintaining accuracy. Experiments on various mathematical reasoning benchmarks show that O1-Pruner not only significantly reduces inference overhead but also achieves higher accuracy, providing a novel and promising solution to this challenge. Our code is coming soon at https://github.com/StarDewXXX/O1-Pruner", 'score': 11, 'issue_id': 1818, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '2cb7e92315bbf3e4', 'authors': ['Haotian Luo', 'Li Shen', 'Haiying He', 'Yibo Wang', 'Shiwei Liu', 'Wei Li', 'Naiqiang Tan', 'Xiaochun Cao', 'Dacheng Tao'], 'affiliations': ['China Agriculture University', 'Didichuxing Co. Ltd', 'Nanyang Technological University', 'Shenzhen Campus of Sun Yat-sen University', 'Tsinghua University', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2501.12570.jpg', 'data': {'categories': ['#reasoning', '#math', '#optimization', '#training', '#benchmark', '#inference'], 'emoji': '⚡', 'ru': {'title': 'Ускорение мышления ИИ без потери качества', 'desc': "Статья описывает метод оптимизации работы языковых моделей с длительным рассуждением, таких как OpenAI's O1. Авторы предлагают технику под названием Length-Harmonizing Fine-Tuning (O1-Pruner), которая сокращает время вывода, сохраняя точность модели. Метод использует предварительную выборку для оценки базовой производительности модели, а затем применяет обучение с подкреплением для генерации более коротких процессов рассуждения. 
Эксперименты на различных бенчмарках математического рассуждения показали, что O1-Pruner значительно снижает вычислительные затраты при сохранении или даже повышении точности."}, 'en': {'title': 'Optimizing Long-Thought Reasoning for Efficient Problem Solving', 'desc': "This paper discusses a new approach to improve long-thought reasoning in large language models (LLMs) like OpenAI's O1. The authors identify that while these models enhance problem-solving, they also increase inference time due to inefficient token usage. To tackle this, they introduce Length-Harmonizing Fine-Tuning (O1-Pruner), which optimizes the reasoning process by balancing accuracy and efficiency. Their experiments show that O1-Pruner reduces inference overhead and improves accuracy on mathematical reasoning tasks, making it a valuable advancement in LLM performance."}, 'zh': {'title': '优化推理效率,提升准确性!', 'desc': '最近,长思考推理的语言模型(LLM)如OpenAI的O1,采用了类似人类思考复杂问题的扩展推理过程。这种推理范式显著增强了模型的解决问题能力,并取得了良好的效果。然而,长思考推理过程导致推理时间大幅增加。为了解决这个问题,我们提出了长度协调微调(O1-Pruner),旨在在保持准确性的同时,减少长思考LLM的推理开销。'}}}, {'id': 'https://huggingface.co/papers/2501.11067', 'title': 'IntellAgent: A Multi-Agent Framework for Evaluating Conversational AI Systems', 'url': 'https://huggingface.co/papers/2501.11067', 'abstract': 'Large Language Models (LLMs) are transforming artificial intelligence, evolving into task-oriented systems capable of autonomous planning and execution. One of the primary applications of LLMs is conversational AI systems, which must navigate multi-turn dialogues, integrate domain-specific APIs, and adhere to strict policy constraints. However, evaluating these agents remains a significant challenge, as traditional methods fail to capture the complexity and variability of real-world interactions. We introduce IntellAgent, a scalable, open-source multi-agent framework designed to evaluate conversational AI systems comprehensively. IntellAgent automates the creation of diverse, synthetic benchmarks by combining policy-driven graph modeling, realistic event generation, and interactive user-agent simulations. This innovative approach provides fine-grained diagnostics, addressing the limitations of static and manually curated benchmarks with coarse-grained metrics. IntellAgent represents a paradigm shift in evaluating conversational AI. By simulating realistic, multi-policy scenarios across varying levels of complexity, IntellAgent captures the nuanced interplay of agent capabilities and policy constraints. Unlike traditional methods, it employs a graph-based policy model to represent relationships, likelihoods, and complexities of policy interactions, enabling highly detailed diagnostics. IntellAgent also identifies critical performance gaps, offering actionable insights for targeted optimization. Its modular, open-source design supports seamless integration of new domains, policies, and APIs, fostering reproducibility and community collaboration. Our findings demonstrate that IntellAgent serves as an effective framework for advancing conversational AI by addressing challenges in bridging research and deployment. 
The framework is available at https://github.com/plurai-ai/intellagent', 'score': 6, 'issue_id': 1820, 'pub_date': '2025-01-19', 'pub_date_card': {'ru': '19 января', 'en': 'January 19', 'zh': '1月19日'}, 'hash': '019b0714b4212a7f', 'authors': ['Elad Levi', 'Ilan Kadar'], 'affiliations': ['Plurai'], 'pdf_title_img': 'assets/pdf/title_img/2501.11067.jpg', 'data': {'categories': ['#multimodal', '#agents', '#open_source', '#games', '#optimization', '#graphs', '#benchmark'], 'emoji': '🤖', 'ru': {'title': 'IntellAgent: революция в оценке разговорного ИИ', 'desc': 'IntellAgent - это масштабируемая система с открытым исходным кодом для комплексной оценки разговорных ИИ-систем. Она автоматизирует создание разнообразных синтетических тестов, объединяя моделирование графов на основе политик, генерацию реалистичных событий и интерактивное моделирование взаимодействия пользователя и агента. IntellAgent использует графовую модель политик для представления отношений, вероятностей и сложностей взаимодействия политик, что позволяет проводить детальную диагностику. Система выявляет критические пробелы в производительности и предлагает полезные идеи для целенаправленной оптимизации.'}, 'en': {'title': 'Revolutionizing Evaluation of Conversational AI with IntellAgent', 'desc': 'This paper presents IntellAgent, a new framework for evaluating conversational AI systems, particularly those powered by Large Language Models (LLMs). It addresses the challenges of traditional evaluation methods by automating the creation of diverse benchmarks that simulate real-world interactions. IntellAgent uses a graph-based policy model to analyze the complex relationships and interactions between different policies, providing detailed diagnostics and identifying performance gaps. The open-source nature of IntellAgent encourages collaboration and integration of new features, making it a valuable tool for improving conversational AI systems.'}, 'zh': {'title': 'IntellAgent:对话式AI评估的新范式', 'desc': '大型语言模型(LLMs)正在改变人工智能,成为能够自主规划和执行任务的系统。它们在对话式人工智能系统中的应用尤为重要,这些系统需要处理多轮对话、整合特定领域的API,并遵循严格的政策约束。然而,评估这些智能体仍然是一个重大挑战,因为传统方法无法捕捉现实世界交互的复杂性和多样性。我们提出了IntellAgent,这是一个可扩展的开源多智能体框架,旨在全面评估对话式人工智能系统。'}}}, {'id': 'https://huggingface.co/papers/2412.19723', 'title': 'OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis', 'url': 'https://huggingface.co/papers/2412.19723', 'abstract': "Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-defined tasks, which are either resource-intensive or unable to guarantee data quality. Moreover, these methods suffer from limited data diversity and significant gaps between synthetic data and real-world environments. To address these challenges, we propose OS-Genesis, a novel GUI data synthesis pipeline that reverses the conventional trajectory collection process. Instead of relying on pre-defined tasks, OS-Genesis enables agents first to perceive environments and perform step-wise interactions, then retrospectively derive high-quality tasks to enable trajectory-level exploration. A trajectory reward model is then employed to ensure the quality of the generated trajectories. 
We demonstrate that training GUI agents with OS-Genesis significantly improves their performance on highly challenging online benchmarks. In-depth analysis further validates OS-Genesis's efficiency and its superior data quality and diversity compared to existing synthesis methods. Our codes, data, and checkpoints are available at https://qiushisun.github.io/OS-Genesis-Home/ (OS-Genesis Homepage).", 'score': 50, 'issue_id': 1455, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'b331198d09aa8650', 'authors': ['Qiushi Sun', 'Kanzhi Cheng', 'Zichen Ding', 'Chuanyang Jin', 'Yian Wang', 'Fangzhi Xu', 'Zhenyu Wu', 'Chengyou Jia', 'Liheng Chen', 'Zhoumianze Liu', 'Ben Kao', 'Guohao Li', 'Junxian He', 'Yu Qiao', 'Zhiyong Wu'], 'affiliations': ['Hong Kong University of Science and Technology', 'Johns Hopkins University', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The University of Hong Kong', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2412.19723.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#dataset', '#optimization', '#training', '#data', '#agents'], 'emoji': '🖥️', 'ru': {'title': 'Революция в обучении ИИ-агентов: от заданий к исследованию', 'desc': 'Статья представляет OS-Genesis - новый метод синтеза данных для обучения ИИ-агентов взаимодействию с графическим интерфейсом. Вместо предопределенных задач, агенты сначала исследуют среду и выполняют пошаговые действия, а затем ретроспективно формируют качественные траектории. Используется модель вознаграждения для обеспечения качества сгенерированных траекторий. Результаты показывают значительное улучшение производительности агентов на сложных онлайн-бенчмарках по сравнению с существующими методами.'}, 'en': {'title': 'Revolutionizing GUI Agent Training with OS-Genesis', 'desc': 'This paper introduces OS-Genesis, a new method for generating high-quality trajectory data for training GUI agents using Vision-Language Models (VLMs). Unlike traditional methods that rely on human supervision or predefined tasks, OS-Genesis allows agents to first interact with their environment and then derive tasks retrospectively. This approach enhances data diversity and quality by enabling agents to explore and learn from real-world interactions. The results show that GUI agents trained with OS-Genesis perform significantly better on challenging benchmarks, demonstrating the effectiveness of this novel data synthesis pipeline.'}, 'zh': {'title': 'OS-Genesis:提升GUI代理性能的新方法', 'desc': '本论文提出了一种名为OS-Genesis的新型图形用户界面(GUI)数据合成管道,旨在解决高质量轨迹数据收集的瓶颈。传统方法依赖于人类监督或合成数据生成,往往资源消耗大且数据质量难以保证。OS-Genesis通过让代理先感知环境并进行逐步交互,随后回溯生成高质量任务,从而实现轨迹级探索。实验结果表明,使用OS-Genesis训练的GUI代理在复杂的在线基准测试中表现显著提升,且其数据质量和多样性优于现有合成方法。'}}}, {'id': 'https://huggingface.co/papers/2412.19638', 'title': 'Xmodel-2 Technical Report', 'url': 'https://huggingface.co/papers/2412.19638', 'abstract': 'Xmodel-2 is a 1.2-billion-parameter large language model designed specifically for reasoning tasks. Its architecture enables different model scales to share a unified set of hyperparameters, allowing for extensive experimentation on smaller models and seamless transfer of optimal configurations to larger models. To maximize training efficiency and stability, Xmodel-2 employs the WSD learning rate scheduler from MiniCPM. Pretrained on 1.5 trillion tokens from diverse sources, Xmodel-2 achieves state-of-the-art performance in complex reasoning and agent-based tasks, while maintaining low training costs. 
These results highlight the potential of efficient model design and training strategies in advancing reasoning capabilities. Model checkpoints and code are publicly available on GitHub at https://github.com/XiaoduoAILab/Xmodel-2', 'score': 11, 'issue_id': 1453, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': '4707dc8ac5a87e66', 'authors': ['Wang Qun', 'Liu Yang', 'Lin Qingquan', 'Qu Zhijiu', 'Jiang Ling'], 'affiliations': ['AI Lab, Xiaodu Technology'], 'pdf_title_img': 'assets/pdf/title_img/2412.19638.jpg', 'data': {'categories': ['#optimization', '#training', '#small_models', '#reasoning', '#open_source', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное рассуждение с Xmodel-2: мощь в компактности', 'desc': 'Xmodel-2 - это языковая модель с 1,2 миллиардами параметров, специализирующаяся на задачах рассуждения. Её архитектура позволяет разным масштабам модели использовать единый набор гиперпараметров, что облегчает эксперименты и перенос оптимальных конфигураций. Модель использует планировщик скорости обучения WSD из MiniCPM для повышения эффективности и стабильности. Предобученная на 1,5 триллионах токенов, Xmodel-2 достигает передовых результатов в сложных задачах рассуждения, сохраняя низкие затраты на обучение.'}, 'en': {'title': 'Unlocking Reasoning Power with Efficient Model Design', 'desc': 'Xmodel-2 is a large language model with 1.2 billion parameters, specifically built for reasoning tasks. It features a flexible architecture that allows different model sizes to use the same hyperparameters, facilitating experimentation and optimization across scales. The model utilizes the WSD learning rate scheduler to enhance training efficiency and stability. With pretraining on 1.5 trillion tokens, Xmodel-2 demonstrates superior performance in complex reasoning tasks while keeping training costs low, showcasing the benefits of efficient model design.'}, 'zh': {'title': '高效推理能力的模型设计与训练策略', 'desc': 'Xmodel-2 是一个拥有 12 亿参数的大型语言模型,专门设计用于推理任务。它的架构允许不同规模的模型共享统一的超参数,从而可以在较小的模型上进行广泛实验,并将最佳配置无缝转移到更大的模型上。为了最大化训练效率和稳定性,Xmodel-2 采用了 MiniCPM 的 WSD 学习率调度器。经过在 1.5 万亿个来自多样化来源的标记上进行预训练,Xmodel-2 在复杂推理和基于代理的任务中达到了最先进的性能,同时保持了较低的训练成本。'}}}, {'id': 'https://huggingface.co/papers/2412.20735', 'title': 'HUNYUANPROVER: A Scalable Data Synthesis Framework and Guided Tree Search for Automated Theorem Proving', 'url': 'https://huggingface.co/papers/2412.20735', 'abstract': 'We introduce HunyuanProver, a language model finetuned from the Hunyuan 7B for interactive automatic theorem proving with LEAN4. To alleviate the data sparsity issue, we design a scalable framework to iteratively synthesize data with low cost. Besides, guided tree search algorithms are designed to enable effective "system 2 thinking" of the prover. HunyuanProver achieves state-of-the-art (SOTA) performances on major benchmarks. Specifically, it achieves a pass of 68.4% on the miniF2F-test compared to 65.9%, the current SOTA results. It proves 4 IMO statements (imo_1960_p2, imo_1962_p2, imo_1964_p2 and imo_1983_p6) in miniF2F-test. 
To benefit the community, we will open-source a dataset of 30k synthesized instances, where each instance contains the original question in natural language, the converted statement by autoformalization, and the proof by HunyuanProver.', 'score': 3, 'issue_id': 1464, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '18d70581e862bf86', 'authors': ['Yang Li', 'Dong Du', 'Linfeng Song', 'Chen Li', 'Weikang Wang', 'Tao Yang', 'Haitao Mi'], 'affiliations': ['Tencent', 'Tencent Hunyuan Teams'], 'pdf_title_img': 'assets/pdf/title_img/2412.20735.jpg', 'data': {'categories': ['#dataset', '#synthetic', '#data', '#benchmark', '#reasoning', '#open_source', '#training', '#math'], 'emoji': '🧠', 'ru': {'title': 'Прорыв в автоматическом доказательстве теорем с помощью ИИ', 'desc': "HunyuanProver - это языковая модель, настроенная для автоматического доказательства теорем с использованием LEAN4. Модель использует масштабируемую структуру для итеративного синтеза данных и алгоритмы направленного поиска по дереву для эффективного 'системного мышления'. HunyuanProver достигает лучших результатов на основных бенчмарках, включая 68.4% прохождения на miniF2F-test. Авторы планируют открыть доступ к набору данных из 30 тысяч синтезированных примеров для пользы сообщества."}, 'en': {'title': 'HunyuanProver: Advancing Theorem Proving with AI', 'desc': 'HunyuanProver is a language model specifically fine-tuned for interactive automatic theorem proving using LEAN4. To address the challenge of data sparsity, the authors developed a scalable framework that allows for the iterative synthesis of data at a low cost. They also implemented guided tree search algorithms to enhance the reasoning capabilities of the prover, enabling it to perform complex logical deductions. HunyuanProver has achieved state-of-the-art performance on key benchmarks, including a notable pass rate of 68.4% on the miniF2F-test, surpassing previous results and proving several significant mathematical statements.'}, 'zh': {'title': 'HunyuanProver:自动定理证明的新突破', 'desc': '本文介绍了HunyuanProver,这是一个基于Hunyuan 7B微调的语言模型,旨在与LEAN4进行交互式自动定理证明。为了缓解数据稀疏问题,我们设计了一个可扩展的框架,以低成本迭代合成数据。此外,我们还设计了引导树搜索算法,以实现证明者的有效“系统2思维”。HunyuanProver在主要基准测试中达到了最先进的性能,特别是在miniF2F-test中取得了68.4%的通过率,超越了当前的65.9%最先进结果。'}}}, {'id': 'https://huggingface.co/papers/2501.07301', 'title': 'The Lessons of Developing Process Reward Models in Mathematical Reasoning', 'url': 'https://huggingface.co/papers/2501.07301', 'abstract': 'Process Reward Models (PRMs) emerge as a promising approach for process supervision in mathematical reasoning of Large Language Models (LLMs), which aim to identify and mitigate intermediate errors in the reasoning processes. However, the development of effective PRMs faces significant challenges, particularly in data annotation and evaluation methodologies. In this paper, through extensive experiments, we demonstrate that commonly used Monte Carlo (MC) estimation-based data synthesis for PRMs typically yields inferior performance and generalization compared to LLM-as-a-judge and human annotation methods. MC estimation relies on completion models to evaluate current-step correctness, leading to inaccurate step verification. 
Furthermore, we identify potential biases in conventional Best-of-N (BoN) evaluation strategies for PRMs: (1) The unreliable policy models generate responses with correct answers but flawed processes, leading to a misalignment between the evaluation criteria of BoN and the PRM objectives of process verification. (2) The tolerance of PRMs of such responses leads to inflated BoN scores. (3) Existing PRMs have a significant proportion of minimum scores concentrated on the final answer steps, revealing the shift from process to outcome-based assessment in BoN Optimized PRMs. To address these challenges, we develop a consensus filtering mechanism that effectively integrates MC estimation with LLM-as-a-judge and advocates a more comprehensive evaluation framework that combines response-level and step-level metrics. Based on the mechanisms, we significantly improve both model performance and data efficiency in the BoN evaluation and the step-wise error identification task. Finally, we release a new state-of-the-art PRM that outperforms existing open-source alternatives and provides practical guidelines for future research in building process supervision models.', 'score': 46, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '98f46bb1e2772efc', 'authors': ['Zhenru Zhang', 'Chujie Zheng', 'Yangzhen Wu', 'Beichen Zhang', 'Runji Lin', 'Bowen Yu', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07301.jpg', 'data': {'categories': ['#math', '#data', '#reasoning', '#benchmark', '#optimization', '#open_source', '#training'], 'emoji': '🧮', 'ru': {'title': 'Усовершенствование Process Reward Models для более точного контроля математических рассуждений', 'desc': 'Статья посвящена Process Reward Models (PRM) для контроля процесса математических рассуждений в больших языковых моделях. Авторы выявили проблемы в существующих методах синтеза данных и оценки PRMs, таких как Monte Carlo и Best-of-N. Они предложили новый механизм фильтрации на основе консенсуса, объединяющий MC-оценку с подходом LLM-as-a-judge. В результате исследователи создали улучшенную PRM, превосходящую существующие open-source альтернативы.'}, 'en': {'title': 'Enhancing Reasoning in LLMs with Process Reward Models', 'desc': 'This paper introduces Process Reward Models (PRMs) as a method to enhance the reasoning capabilities of Large Language Models (LLMs) by identifying and correcting errors in their reasoning processes. The authors highlight the limitations of traditional Monte Carlo estimation methods for data synthesis, which often lead to poor performance in evaluating reasoning steps. They also point out biases in the Best-of-N evaluation strategies that can misalign with the goals of PRMs, particularly in how they assess the correctness of reasoning processes versus final answers. 
To overcome these issues, the paper proposes a new consensus filtering mechanism that combines different evaluation methods, resulting in improved model performance and more accurate error identification.'}, 'zh': {'title': '提升过程监督模型的有效性', 'desc': '本文探讨了过程奖励模型(PRMs)在大型语言模型(LLMs)数学推理中的应用,旨在识别和减少推理过程中的中间错误。研究表明,传统的基于蒙特卡洛估计的数据合成方法在性能和泛化能力上不如使用LLM作为评判者和人工标注的方法。我们还发现,现有的最佳选择(BoN)评估策略存在偏差,导致评估标准与PRM的过程验证目标不一致。为了解决这些问题,本文提出了一种共识过滤机制,结合了蒙特卡洛估计和LLM评判者,显著提高了模型性能和数据效率。'}}}, {'id': 'https://huggingface.co/papers/2501.06425', 'title': 'Tensor Product Attention Is All You Need', 'url': 'https://huggingface.co/papers/2501.06425', 'abstract': 'Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. Through extensive empirical evaluation of language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPAs memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6.', 'score': 35, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'f723487eccf1ccfe', 'authors': ['Yifan Zhang', 'Yifeng Liu', 'Huizhuo Yuan', 'Zhen Qin', 'Yang Yuan', 'Quanquan Gu', 'Andrew Chi-Chih Yao'], 'affiliations': ['IIIS, Tsinghua University', 'Shanghai Qi Zhi Institute', 'TapTap', 'University of California, Los Angeles'], 'pdf_title_img': 'assets/pdf/title_img/2501.06425.jpg', 'data': {'categories': ['#benchmark', '#long_context', '#optimization', '#inference', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное внимание: компактные трансформеры для длинных последовательностей', 'desc': 'В статье представлен новый механизм внимания - Tensor Product Attention (TPA), использующий тензорные разложения для компактного представления запросов, ключей и значений. TPA значительно уменьшает размер кэша ключ-значение при выводе, что повышает эффективность использования памяти. На основе TPA авторы разработали новую архитектуру модели - Tensor ProducT ATTenTion Transformer (T6). Эмпирические исследования показали, что T6 превосходит стандартные базовые модели Transformer по различным метрикам. TPA позволяет обрабатывать значительно более длинные последовательности при фиксированных ресурсах, решая важную проблему масштабируемости современных языковых моделей.'}, 'en': {'title': 'Efficient Attention for Longer Sequences with TPA', 'desc': 'This paper introduces Tensor Product Attention (TPA), a new attention mechanism designed to reduce memory usage during inference in language models. 
TPA achieves this by using tensor decompositions to compactly represent queries, keys, and values, which allows for smaller key-value caches. The authors present the Tensor ProducT ATTenTion Transformer (T6), a model that integrates TPA and shows improved performance on language modeling tasks compared to traditional Transformer architectures. T6 not only enhances model quality but also enables the processing of longer input sequences efficiently, addressing a key limitation in current language models.'}, 'zh': {'title': '张量乘积注意力:高效处理长序列的创新方案', 'desc': '本文提出了一种新的注意力机制,称为张量乘积注意力(TPA),旨在解决长输入序列处理中的内存开销问题。TPA通过张量分解技术,紧凑地表示查询、键和值,从而显著减少推理时的KV缓存大小。该机制结合了上下文低秩分解和RoPE,提升了模型质量和内存效率。基于TPA,我们还引入了一种新的模型架构——张量乘积注意力变换器(T6),在语言建模任务中表现优于传统的Transformer基线。'}}}, {'id': 'https://huggingface.co/papers/2501.06252', 'title': '$\\text{Transformer}^2$: Self-adaptive LLMs', 'url': 'https://huggingface.co/papers/2501.06252', 'abstract': 'Self-adaptive large language models (LLMs) aim to solve the challenges posed by traditional fine-tuning methods, which are often computationally intensive and static in their ability to handle diverse tasks. We introduce \\implname, a novel self-adaptation framework that adapts LLMs for unseen tasks in real-time by selectively adjusting only the singular components of their weight matrices. During inference, \\implname employs a two-pass mechanism: first, a dispatch system identifies the task properties, and then task-specific "expert" vectors, trained using reinforcement learning, are dynamically mixed to obtain targeted behavior for the incoming prompt. Our method outperforms ubiquitous approaches such as LoRA, with fewer parameters and greater efficiency. \\implname demonstrates versatility across different LLM architectures and modalities, including vision-language tasks. \\implname represents a significant leap forward, offering a scalable, efficient solution for enhancing the adaptability and task-specific performance of LLMs, paving the way for truly dynamic, self-organizing AI systems.', 'score': 19, 'issue_id': 1651, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '935c31e095aeeec8', 'authors': ['Qi Sun', 'Edoardo Cetin', 'Yujin Tang'], 'affiliations': ['Institute of Science Tokyo, Japan', 'Sakana AI, Japan'], 'pdf_title_img': 'assets/pdf/title_img/2501.06252.jpg', 'data': {'categories': ['#multimodal', '#agi', '#rl', '#optimization', '#training', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Самоадаптация языковых моделей в реальном времени', 'desc': 'Статья представляет новый фреймворк самоадаптации для больших языковых моделей (LLM), который позволяет адаптироваться к новым задачам в реальном времени. Метод использует двухэтапный механизм: сначала определяются свойства задачи, затем применяются специальные векторы экспертов для настройки поведения модели. Подход превосходит традиционные методы вроде LoRA, используя меньше параметров и работая эффективнее. Фреймворк демонстрирует универсальность для разных архитектур LLM и модальностей, включая задачи компьютерного зрения.'}, 'en': {'title': 'Dynamic Adaptation for Language Models', 'desc': "This paper presents a new framework called \textit{implname} that enhances large language models (LLMs) by allowing them to adapt to new tasks in real-time without the heavy computational costs of traditional fine-tuning. Instead of adjusting the entire model, \textit{implname} selectively modifies specific components of the model's weight matrices, making it more efficient. 
The framework uses a two-step process during inference: first, it identifies the task requirements, and then it combines specialized 'expert' vectors, which are optimized through reinforcement learning, to tailor the model's response. This approach not only improves performance compared to existing methods like LoRA but also works across various LLM architectures and tasks, including those involving both text and images."}, 'zh': {'title': '自适应LLMs:高效应对多样化任务的未来', 'desc': '自适应大型语言模型(LLMs)旨在解决传统微调方法的挑战,这些方法通常计算密集且在处理多样化任务时能力有限。我们介绍了一种新颖的自适应框架\textit{implname},它通过选择性调整权重矩阵的单个组件,实时适应LLMs以应对未见过的任务。在推理过程中,\textit{implname}采用双重机制:首先,调度系统识别任务属性,然后动态混合经过强化学习训练的任务特定“专家”向量,以获得针对输入提示的目标行为。我们的研究方法在参数更少且效率更高的情况下,超越了广泛使用的方法,如LoRA,展示了在不同LLM架构和模态(包括视觉-语言任务)中的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.06173', 'title': 'VideoAuteur: Towards Long Narrative Video Generation', 'url': 'https://huggingface.co/papers/2501.06173', 'abstract': 'Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation in the cooking domain. We validate the quality of our proposed dataset in terms of visual fidelity and textual caption accuracy using state-of-the-art Vision-Language Models (VLMs) and video generation models, respectively. We further introduce a Long Narrative Video Director to enhance both visual and semantic coherence in generated videos and emphasize the role of aligning visual embeddings to achieve improved overall video quality. Our method demonstrates substantial improvements in generating visually detailed and semantically aligned keyframes, supported by finetuning techniques that integrate text and image embeddings within the video generation process. Project page: https://videoauteur.github.io/', 'score': 18, 'issue_id': 1653, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'e110fbe840c50afa', 'authors': ['Junfei Xiao', 'Feng Cheng', 'Lu Qi', 'Liangke Gui', 'Jiepeng Cen', 'Zhibei Ma', 'Alan Yuille', 'Lu Jiang'], 'affiliations': ['ByteDance', 'ByteDance Seed', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06173.jpg', 'data': {'categories': ['#video', '#story_generation', '#dataset', '#long_context', '#training', '#multimodal', '#alignment'], 'emoji': '🍳', 'ru': {'title': 'Готовим длинные видео: новый подход к генерации нарративного контента', 'desc': 'Статья представляет новый датасет видеороликов о приготовлении пищи для улучшения генерации длинных нарративных видео. Авторы проверяют качество датасета с помощью современных моделей компьютерного зрения и генерации видео. Они также предлагают метод Long Narrative Video Director для повышения визуальной и семантической согласованности генерируемых видео. Результаты показывают значительное улучшение в генерации детализированных и семантически согласованных ключевых кадров.'}, 'en': {'title': 'Enhancing Long-Form Video Generation with Coherent Narratives', 'desc': 'This paper addresses the limitations of current video generation models in creating long, coherent videos, particularly in the cooking domain. 
It introduces a large-scale dataset specifically designed for generating long-form cooking videos, ensuring high visual quality and accurate textual descriptions. The authors propose a Long Narrative Video Director that improves both the visual and semantic coherence of the generated content by aligning visual embeddings. Their approach shows significant advancements in producing detailed keyframes and enhancing overall video quality through the integration of text and image embeddings.'}, 'zh': {'title': '推动烹饪视频的长篇叙事生成', 'desc': '最近的视频生成模型在生成持续几秒的高质量视频片段方面取得了良好效果。然而,这些模型在生成长序列时面临挑战,难以传达清晰且信息丰富的事件,限制了它们支持连贯叙述的能力。本文提出了一个大规模的烹饪视频数据集,旨在推动烹饪领域的长篇叙事生成。我们引入了一种长叙事视频导演,增强生成视频的视觉和语义一致性,并强调对齐视觉嵌入在提高整体视频质量中的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.07572', 'title': 'WebWalker: Benchmarking LLMs in Web Traversal', 'url': 'https://huggingface.co/papers/2501.07572', 'abstract': "Retrieval-augmented generation (RAG) demonstrates remarkable performance across tasks in open-domain question-answering. However, traditional search engines may retrieve shallow content, limiting the ability of LLMs to handle complex, multi-layered information. To address it, we introduce WebWalkerQA, a benchmark designed to assess the ability of LLMs to perform web traversal. It evaluates the capacity of LLMs to traverse a website's subpages to extract high-quality data systematically. We propose WebWalker, which is a multi-agent framework that mimics human-like web navigation through an explore-critic paradigm. Extensive experimental results show that WebWalkerQA is challenging and demonstrates the effectiveness of RAG combined with WebWalker, through the horizontal and vertical integration in real-world scenarios.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '1dd4e60432c1ca54', 'authors': ['Jialong Wu', 'Wenbiao Yin', 'Yong Jiang', 'Zhenglin Wang', 'Zekun Xi', 'Runnan Fang', 'Deyu Zhou', 'Pengjun Xie', 'Fei Huang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07572.jpg', 'data': {'categories': ['#rag', '#reasoning', '#benchmark', '#agi', '#optimization', '#games', '#interpretability', '#agents', '#survey'], 'emoji': '🕸️', 'ru': {'title': 'WebWalker: умная навигация по веб-страницам для улучшения вопросно-ответных систем', 'desc': 'В статье представлен новый подход к решению задач открытого вопросно-ответного поиска - WebWalkerQA. Эта система оценивает способность языковых моделей систематически исследовать подстраницы веб-сайтов для извлечения качественной информации. Авторы предлагают фреймворк WebWalker, использующий мультиагентный подход для имитации человеческой навигации по веб-страницам. Экспериментальные результаты демонстрируют эффективность комбинации RAG и WebWalker в реальных сценариях.'}, 'en': {'title': 'Enhancing LLMs with Human-like Web Navigation for Better Information Retrieval', 'desc': "This paper introduces WebWalkerQA, a benchmark for evaluating large language models (LLMs) in open-domain question-answering tasks. It addresses the limitations of traditional search engines that often retrieve superficial content, which hinders LLMs from accessing complex information. The proposed WebWalker framework uses a multi-agent system that simulates human-like web navigation, allowing LLMs to systematically traverse subpages of a website to gather high-quality data. 
Experimental results indicate that combining retrieval-augmented generation (RAG) with WebWalker enhances the models' performance in real-world scenarios by enabling deeper information extraction."}, 'zh': {'title': 'WebWalkerQA:提升问答系统的网页导航能力', 'desc': '检索增强生成(RAG)在开放领域问答任务中表现出色,但传统搜索引擎可能只检索到表面内容,限制了大型语言模型(LLMs)处理复杂信息的能力。为了解决这个问题,我们引入了WebWalkerQA,这是一个评估LLMs进行网页遍历能力的基准。它评估LLMs系统性地遍历网站子页面以提取高质量数据的能力。我们提出了WebWalker,这是一个多代理框架,通过探索-评估范式模拟人类的网页导航。'}}}, {'id': 'https://huggingface.co/papers/2501.06458', 'title': 'O1 Replication Journey -- Part 3: Inference-time Scaling for Medical Reasoning', 'url': 'https://huggingface.co/papers/2501.06458', 'abstract': "Building upon our previous investigations of O1 replication (Part 1: Journey Learning [Qin et al., 2024] and Part 2: Distillation [Huang et al., 2024]), this work explores the potential of inference-time scaling in large language models (LLMs) for medical reasoning tasks, ranging from diagnostic decision-making to treatment planning. Through extensive experiments on medical benchmarks of varying complexity (MedQA, Medbullets, and JAMA Clinical Challenges), our investigation reveals several key insights: (1) Increasing inference time does lead to improved performance. With a modest training set of 500 samples, our model yields substantial performance improvements of 6%-11%. (2) Task complexity directly correlates with the required length of reasoning chains, confirming the necessity of extended thought processes for challenging problems. (3) The differential diagnoses generated by our model adhere to the principles of the hypothetico-deductive method, producing a list of potential conditions that may explain a patient's symptoms and systematically narrowing these possibilities by evaluating the evidence. These findings demonstrate the promising synergy between inference-time scaling and journey learning in advancing LLMs' real-world clinical reasoning capabilities.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c95817afd181bd85', 'authors': ['Zhongzhen Huang', 'Gui Geng', 'Shengyi Hua', 'Zhen Huang', 'Haoyang Zou', 'Shaoting Zhang', 'Pengfei Liu', 'Xiaofan Zhang'], 'affiliations': ['Generative AI Research Lab (GAIR)', 'SII', 'SPIRAL Lab', 'Shanghai Jiao Tong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06458.jpg', 'data': {'categories': ['#science', '#inference', '#healthcare', '#reasoning'], 'emoji': '🩺', 'ru': {'title': 'Масштабирование времени вывода LLM улучшает медицинские рассуждения', 'desc': 'Данная работа исследует потенциал масштабирования времени вывода в больших языковых моделях (LLM) для задач медицинского рассуждения. Эксперименты на медицинских бенчмарках показали, что увеличение времени вывода приводит к улучшению производительности модели. Сложность задачи напрямую коррелирует с необходимой длиной цепочек рассуждений. Дифференциальные диагнозы, генерируемые моделью, соответствуют принципам гипотетико-дедуктивного метода.'}, 'en': {'title': 'Enhancing Medical Reasoning in LLMs through Inference-Time Scaling', 'desc': "This paper investigates how increasing inference time can enhance the performance of large language models (LLMs) in medical reasoning tasks. The authors conducted experiments on various medical benchmarks and found that longer inference times lead to significant performance improvements, even with a small training dataset. 
They also discovered that more complex tasks require longer reasoning chains, highlighting the importance of extended thought processes. Additionally, the model's differential diagnoses align with the hypothetico-deductive method, showcasing its ability to systematically evaluate potential conditions based on patient symptoms."}, 'zh': {'title': '推理时间扩展助力医学推理能力提升', 'desc': '本研究基于我们之前对O1复制的研究,探讨了在大型语言模型(LLMs)中推理时间扩展对医学推理任务的潜力。通过在不同复杂度的医学基准(如MedQA、Medbullets和JAMA临床挑战)上进行广泛实验,我们发现增加推理时间确实能提高模型性能,尤其是在仅有500个样本的训练集上,性能提升可达6%-11%。此外,任务的复杂性与所需推理链的长度直接相关,表明对于复杂问题需要更长的思考过程。最后,我们的模型生成的差异性诊断遵循假设演绎法的原则,系统地评估证据以缩小可能的病症范围。'}}}, {'id': 'https://huggingface.co/papers/2501.06282', 'title': 'MinMo: A Multimodal Large Language Model for Seamless Voice Interaction', 'url': 'https://huggingface.co/papers/2501.06282', 'abstract': 'Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo supports controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, full-duplex latency is approximately 600ms in theory and 800ms in practice. 
The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.', 'score': 13, 'issue_id': 1651, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '2bd352453760208e', 'authors': ['Qian Chen', 'Yafeng Chen', 'Yanni Chen', 'Mengzhe Chen', 'Yingda Chen', 'Chong Deng', 'Zhihao Du', 'Ruize Gao', 'Changfeng Gao', 'Zhifu Gao', 'Yabin Li', 'Xiang Lv', 'Jiaqing Liu', 'Haoneng Luo', 'Bin Ma', 'Chongjia Ni', 'Xian Shi', 'Jialong Tang', 'Hui Wang', 'Hao Wang', 'Wen Wang', 'Yuxuan Wang', 'Yunlan Xu', 'Fan Yu', 'Zhijie Yan', 'Yexin Yang', 'Baosong Yang', 'Xian Yang', 'Guanrou Yang', 'Tianyu Zhao', 'Qinglin Zhang', 'Shiliang Zhang', 'Nan Zhao', 'Pei Zhang', 'Chong Zhang', 'Jinren Zhou'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.06282.jpg', 'data': {'categories': ['#audio', '#multimodal', '#training'], 'emoji': '🗣️', 'ru': {'title': 'MinMo: революция в голосовом ИИ-взаимодействии', 'desc': 'Статья представляет MinMo - мультимодальную большую языковую модель для беспрепятственного голосового взаимодействия. Модель обучена на 1,4 миллионах часов разнообразных речевых данных и широком спектре речевых задач через несколько этапов выравнивания речи и текста. MinMo достигает передовых результатов в понимании и генерации речи, сохраняя при этом возможности текстовых ЯБМ. Модель также поддерживает полнодуплексное общение и управляемую генерацию речи с различными нюансами, включая эмоции, диалекты и темп речи.'}, 'en': {'title': 'MinMo: Revolutionizing Voice Interactions with Multimodal Learning', 'desc': 'This paper presents MinMo, a Multimodal Large Language Model designed for seamless voice interactions, featuring around 8 billion parameters. It overcomes limitations of previous aligned models by employing a multi-stage training approach that includes speech-to-text, text-to-speech, and duplex interaction alignments, utilizing a vast dataset of 1.4 million hours of diverse speech. MinMo achieves state-of-the-art performance in voice comprehension and generation, enabling full-duplex conversations and enhanced instruction-following capabilities for nuanced speech generation. Additionally, it introduces a novel voice decoder that significantly improves voice generation quality compared to earlier models.'}, 'zh': {'title': 'MinMo:无缝语音交互的新突破', 'desc': '本文介绍了一种名为MinMo的多模态大型语言模型,旨在实现无缝的语音交互。MinMo具有约80亿个参数,通过多阶段的对齐训练,克服了以往模型在语音理解和生成方面的局限性。该模型能够支持全双工对话,允许用户与系统进行实时的双向交流。MinMo还具备根据用户指令生成语音的能力,能够调整情感、方言和语速等细节。'}}}, {'id': 'https://huggingface.co/papers/2501.06842', 'title': 'SPAM: Spike-Aware Adam with Momentum Reset for Stable LLM Training', 'url': 'https://huggingface.co/papers/2501.06842', 'abstract': 'Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and experiment restarts, further amplifying inefficiencies. This paper presents a comprehensive investigation into gradient spikes observed during LLM training, revealing their prevalence across multiple architectures and datasets. Our analysis shows that these spikes can be up to 1000times larger than typical gradients, substantially deteriorating model performance. 
To address this issue, we propose Spike-Aware Adam with Momentum Reset SPAM, a novel optimizer designed to counteract gradient spikes through momentum reset and spike-aware gradient clipping. Extensive experiments, including both pre-training and fine-tuning, demonstrate that SPAM consistently surpasses Adam and its variants across various tasks, including (1) LLM pre-training from 60M to 1B, (2) 4-bit LLM pre-training,(3) reinforcement learning, and (4) Time Series Forecasting. Additionally, SPAM facilitates memory-efficient training by enabling sparse momentum, where only a subset of momentum terms are maintained and updated. When operating under memory constraints, SPAM outperforms state-of-the-art memory-efficient optimizers such as GaLore and Adam-Mini. Our work underscores the importance of mitigating gradient spikes in LLM training and introduces an effective optimization strategy that enhances both training stability and resource efficiency at scale. Code is available at https://github.com/TianjinYellow/SPAM-Optimizer.git', 'score': 10, 'issue_id': 1658, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': 'd5fec659e34cf867', 'authors': ['Tianjin Huang', 'Ziquan Zhu', 'Gaojie Jin', 'Lu Liu', 'Zhangyang Wang', 'Shiwei Liu'], 'affiliations': ['Eindhoven University of Technology', 'University of Exeter', 'University of Leicester', 'University of Oxford', 'University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.06842.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '📈', 'ru': {'title': 'SPAM: Стабильное и эффективное обучение языковых моделей', 'desc': 'Исследователи представили новый оптимизатор SPAM (Spike-Aware Adam with Momentum Reset) для обучения больших языковых моделей (LLM). SPAM предназначен для решения проблемы резких скачков градиентов, которые могут быть в 1000 раз больше обычных и нарушают процесс обучения. Оптимизатор использует сброс импульса и адаптивное ограничение градиента для противодействия этим скачкам. Эксперименты показали, что SPAM превосходит Adam и его варианты в различных задачах, включая предобучение LLM, обучение с подкреплением и прогнозирование временных рядов.'}, 'en': {'title': 'Taming Gradient Spikes for Stable LLM Training with SPAM', 'desc': 'This paper investigates the issue of gradient spikes during the training of Large Language Models (LLMs), which can lead to instability and inefficiencies. These spikes can be significantly larger than normal gradients, negatively impacting model performance and requiring costly interventions. To combat this problem, the authors propose a new optimizer called Spike-Aware Adam with Momentum Reset (SPAM), which incorporates momentum reset and spike-aware gradient clipping. Experimental results show that SPAM outperforms traditional optimizers like Adam in various tasks while also being more memory-efficient.'}, 'zh': {'title': '应对梯度波动,提升训练稳定性!', 'desc': '大型语言模型(LLMs)在多种任务中表现出色,但其训练过程资源消耗大且容易出现不稳定性。研究发现,梯度和损失的剧烈波动是导致训练不稳定的主要原因,这会影响学习过程并增加干预成本。本文提出了一种新型优化器——Spike-Aware Adam with Momentum Reset(SPAM),旨在通过动量重置和梯度剪切来应对梯度波动。实验结果表明,SPAM在多种任务中均优于传统的Adam优化器,显著提高了训练的稳定性和资源效率。'}}}, {'id': 'https://huggingface.co/papers/2501.07574', 'title': 'UnCommon Objects in 3D', 'url': 'https://huggingface.co/papers/2501.07574', 'abstract': 'We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for 3D deep learning and 3D generative AI. 
uCO3D is the largest publicly-available collection of high-resolution videos of objects with 3D annotations that ensures full-360° coverage. uCO3D is significantly more diverse than MVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of higher quality, due to extensive quality checks of both the collected videos and the 3D annotations. Similar to analogous datasets, uCO3D contains annotations for 3D camera poses, depth maps and sparse point clouds. In addition, each object is equipped with a caption and a 3D Gaussian Splat reconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D and obtain superior results using the latter, showing that uCO3D is better for learning applications.', 'score': 7, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '79c40f6997052ddd', 'authors': ['Xingchen Liu', 'Piyush Tayal', 'Jianyuan Wang', 'Jesus Zarzar', 'Tom Monnier', 'Konstantinos Tertikas', 'Jiali Duan', 'Antoine Toisoul', 'Jason Y. Zhang', 'Natalia Neverova', 'Andrea Vedaldi', 'Roman Shapovalov', 'David Novotny'], 'affiliations': ['Carnegie Mellon University', 'KAUST', 'Meta AI', 'NKUA, Greece'], 'pdf_title_img': 'assets/pdf/title_img/2501.07574.jpg', 'data': {'categories': ['#dataset', '#open_source', '#synthetic', '#3d'], 'emoji': '🔍', 'ru': {'title': 'uCO3D: Новый стандарт для 3D-данных в машинном обучении', 'desc': 'Авторы представляют новый набор данных uCO3D для глубокого обучения и генеративного ИИ в 3D. Этот датасет содержит высококачественные видео объектов с полным 360-градусным охватом и 3D-аннотациями. uCO3D превосходит аналоги по разнообразию, охватывая более 1000 категорий объектов, и качеству благодаря тщательным проверкам. Помимо стандартных аннотаций, датасет включает подписи к объектам и 3D-реконструкции на основе гауссовых сплатов.'}, 'en': {'title': 'Unlocking 3D Learning with uCO3D: A New Era of Object-Centric Datasets', 'desc': 'The paper presents Uncommon Objects in 3D (uCO3D), a comprehensive dataset designed for advancing 3D deep learning and generative AI. This dataset features high-resolution videos with full 360-degree coverage and includes over 1,000 diverse object categories, making it larger and more varied than existing datasets like MVImgNet and CO3Dv2. uCO3D provides detailed annotations such as 3D camera poses, depth maps, and sparse point clouds, along with captions and 3D Gaussian Splat reconstructions for each object. Experiments demonstrate that training large 3D models on uCO3D yields superior performance compared to other datasets, highlighting its effectiveness for learning applications.'}, 'zh': {'title': 'uCO3D:提升3D学习的全新数据集', 'desc': '我们介绍了一个新的3D深度学习和生成AI数据集,名为Uncommon Objects in 3D(uCO3D)。uCO3D是一个公开可用的高分辨率视频集合,包含360度的3D注释,涵盖超过1000个物体类别,具有更高的多样性和质量。该数据集提供了3D相机姿态、深度图和稀疏点云的注释,并为每个物体配备了描述和3D高斯点云重建。通过在多个数据集上训练大型3D模型,我们发现uCO3D在学习应用中表现更优。'}}}, {'id': 'https://huggingface.co/papers/2501.07171', 'title': 'BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and Vision-Language Models Derived from Scientific Literature', 'url': 'https://huggingface.co/papers/2501.07171', 'abstract': 'The development of vision-language models (VLMs) is driven by large-scale and diverse multimodal datasets. However, progress toward generalist biomedical VLMs is limited by the lack of annotated, publicly accessible datasets across biology and medicine. 
Existing efforts are restricted to narrow domains, missing the full diversity of biomedical knowledge encoded in scientific literature. To address this gap, we introduce BIOMEDICA, a scalable, open-source framework to extract, annotate, and serialize the entirety of the PubMed Central Open Access subset into an easy-to-use, publicly accessible dataset.Our framework produces a comprehensive archive with over 24 million unique image-text pairs from over 6 million articles. Metadata and expert-guided annotations are also provided. We demonstrate the utility and accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style models continuously pre-trained on the BIOMEDICA dataset via streaming, eliminating the need to download 27 TB of data locally.On average, our models achieve state-of-the-art performance across 40 tasks - spanning pathology, radiology, ophthalmology, dermatology, surgery, molecular biology, parasitology, and cell biology - excelling in zero-shot classification with a 6.56% average improvement (as high as 29.8% and 17.5% in dermatology and ophthalmology, respectively), and stronger image-text retrieval, all while using 10x less compute. To foster reproducibility and collaboration, we release our codebase and dataset for the broader research community.', 'score': 3, 'issue_id': 1656, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '07db2230e08b0fde', 'authors': ['Alejandro Lozano', 'Min Woo Sun', 'James Burgess', 'Liangyu Chen', 'Jeffrey J Nirschl', 'Jeffrey Gu', 'Ivan Lopez', 'Josiah Aklilu', 'Austin Wolfgang Katzer', 'Collin Chiu', 'Anita Rau', 'Xiaohan Wang', 'Yuhui Zhang', 'Alfred Seunghoon Song', 'Robert Tibshirani', 'Serena Yeung-Levy'], 'affiliations': ['Department of Biomedical Data Science, Stanford University', 'Department of Computer Science, Stanford University', 'Department of Electrical Engineering, Stanford University', 'Department of Pathology, Stanford University', 'Department of Statistics, Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07171.jpg', 'data': {'categories': ['#healthcare', '#cv', '#dataset', '#science', '#multimodal', '#open_source'], 'emoji': '🧬', 'ru': {'title': 'BIOMEDICA: Прорыв в обработке биомедицинских данных с помощью ИИ', 'desc': 'Статья представляет BIOMEDICA - масштабируемый фреймворк с открытым исходным кодом для извлечения и аннотирования биомедицинских данных из научной литературы. Фреймворк создал обширный архив из более чем 24 миллионов уникальных пар изображение-текст из более 6 миллионов статей. На основе этого датасета были обучены модели BMCA-CLIP, достигшие state-of-the-art результатов в 40 биомедицинских задачах. Модели показали значительное улучшение в zero-shot классификации и поиске изображений по тексту при использовании в 10 раз меньших вычислительных ресурсов.'}, 'en': {'title': 'Unlocking Biomedical Knowledge with BIOMEDICA', 'desc': 'This paper presents BIOMEDICA, a new framework designed to create a large, open-source dataset from the PubMed Central Open Access subset, which includes over 24 million image-text pairs from scientific articles. The framework addresses the challenge of limited annotated datasets in the biomedical field, enabling the development of generalist vision-language models (VLMs) that can understand diverse biomedical knowledge. 
The authors also introduce BMCA-CLIP, a set of models that are continuously pre-trained on this dataset, achieving state-of-the-art performance across various medical tasks with significant improvements in zero-shot classification and image-text retrieval. By making their codebase and dataset publicly available, they aim to enhance reproducibility and collaboration in biomedical research.'}, 'zh': {'title': '推动生物医学领域的视觉语言模型发展', 'desc': '本文介绍了BIOMEDICA,一个可扩展的开源框架,用于提取、注释和序列化PubMed Central开放获取子集的全部内容。该框架生成了一个包含超过2400万个独特图像-文本对的综合档案,来自超过600万篇文章。我们还提供了元数据和专家指导的注释,并展示了BMCA-CLIP模型在40个医学任务中的优越性能,尤其在零样本分类和图像-文本检索方面表现突出。通过发布代码库和数据集,我们促进了研究的可重复性和合作。'}}}, {'id': 'https://huggingface.co/papers/2501.06590', 'title': 'ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning', 'url': 'https://huggingface.co/papers/2501.06590', 'abstract': 'Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent', 'score': 3, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c217e826245ef357', 'authors': ['Xiangru Tang', 'Tianyu Hu', 'Muyang Ye', 'Yanjun Shao', 'Xunjian Yin', 'Siru Ouyang', 'Wangchunshu Zhou', 'Pan Lu', 'Zhuosheng Zhang', 'Yilun Zhao', 'Arman Cohan', 'Mark Gerstein'], 'affiliations': ['Shanghai Jiao Tong University', 'Stanford University', 'UIUC', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06590.jpg', 'data': {'categories': ['#science', '#reasoning', '#multimodal', '#agents', '#dataset'], 'emoji': '🧪', 'ru': {'title': 'ChemAgent: Умный помощник для LLM в химических задачах', 'desc': 'ChemAgent - это новая система, улучшающая работу больших языковых моделей (LLM) в задачах химического рассуждения. Она использует динамически обновляемую библиотеку, созданную путем декомпозиции химических задач на подзадачи. При решении новых проблем ChemAgent извлекает и уточняет релевантную информацию из библиотеки, что позволяет эффективно декомпозировать задачи и генерировать решения. 
Система показала значительное превосходство над существующими методами, улучшив производительность LLM до 46% на четырех наборах данных по химическому рассуждению.'}, 'en': {'title': 'Empowering LLMs for Chemical Reasoning with ChemAgent', 'desc': 'This paper introduces ChemAgent, a new framework that enhances large language models (LLMs) for chemical reasoning tasks. It addresses the challenges LLMs face with complex chemical calculations and domain-specific formulas by creating a dynamic library of decomposed sub-tasks. ChemAgent retrieves and refines relevant information from this library, allowing for better task decomposition and solution generation. Experimental results show that ChemAgent significantly improves performance on chemical reasoning datasets, indicating its potential for applications in drug discovery and materials science.'}, 'zh': {'title': 'ChemAgent:提升化学推理的智能助手', 'desc': '化学推理通常涉及复杂的多步骤过程,需要精确的计算,哪怕是微小的错误也可能导致严重的后果。大型语言模型(LLMs)在处理特定领域的公式、准确执行推理步骤和有效整合代码时面临困难。为了解决这些问题,我们提出了ChemAgent,一个通过动态自更新库来提升LLMs性能的新框架。该框架通过将化学任务分解为子任务,并将这些子任务编译成结构化的集合,以便在未来查询时参考,从而实现有效的任务分解和解决方案生成。'}}}, {'id': 'https://huggingface.co/papers/2501.06708', 'title': 'Evaluating Sample Utility for Data Selection by Mimicking Model Weights', 'url': 'https://huggingface.co/papers/2501.06708', 'abstract': "Foundation models rely on large-scale web-crawled datasets, which frequently contain noisy data, biases, and irrelevant content. Existing data selection techniques typically use human heuristics, downstream evaluation datasets, or specialized scoring models, and can overlook samples' utility in the training process. Instead, we propose a new approach, Mimic Score, a data quality metric that uses a pretrained reference model as a guide to assess the usefulness of data samples for training a new model. It relies on the alignment between the gradient of the new model parameters and the vector pointing toward the reference model in weight space. Samples that misalign with this direction are considered low-value and can be filtered out. Motivated by the Mimic score, we develop Grad-Mimic, a data selection framework that identifies and prioritizes useful samples, automating the selection process to create effective filters. Empirically, using Mimic scores to guide model training results in consistent performance gains across six image datasets and enhances the performance of CLIP models. Moreover, Mimic scores and their associated filters improve upon existing filtering methods and offer accurate estimation of dataset quality.", 'score': 2, 'issue_id': 1661, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '7560c17a0e1b7234', 'authors': ['Tzu-Heng Huang', 'Manjot Bilkhu', 'Frederic Sala', 'Javier Movellan'], 'affiliations': ['Apple Inc.', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.06708.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#ethics', '#training'], 'emoji': '🧠', 'ru': {'title': 'Умный отбор данных для эффективного обучения моделей', 'desc': 'Предложен новый подход к оценке качества данных для обучения моделей машинного обучения - Mimic Score. Этот метод использует предобученную эталонную модель для оценки полезности образцов данных, анализируя выравнивание градиента параметров новой модели с вектором, указывающим на эталонную модель в пространстве весов. На основе Mimic Score разработан фреймворк Grad-Mimic для автоматизированного отбора полезных образцов данных. 
Эксперименты показали, что использование Mimic Score приводит к улучшению производительности моделей на нескольких наборах данных изображений и моделей CLIP.'}, 'en': {'title': 'Enhancing Data Selection with Mimic Score for Better Model Training', 'desc': 'This paper introduces a new method called Mimic Score to improve data selection for training foundation models. It uses a pretrained reference model to evaluate the usefulness of data samples by analyzing the alignment of gradients in weight space. Samples that do not align well with the reference model are deemed low-value and can be removed from the training dataset. The proposed Grad-Mimic framework automates this selection process, leading to better model performance across various image datasets and outperforming existing data filtering techniques.'}, 'zh': {'title': 'Mimic Score:提升数据选择的新方法', 'desc': '基础模型依赖于大规模的网络爬取数据集,这些数据集常常包含噪声数据、偏见和无关内容。现有的数据选择技术通常使用人工启发式方法、下游评估数据集或专门的评分模型,可能会忽视样本在训练过程中的实用性。我们提出了一种新的方法,称为Mimic Score,这是一种数据质量指标,利用预训练的参考模型来评估数据样本对新模型训练的有用性。基于Mimic Score,我们开发了Grad-Mimic数据选择框架,自动识别和优先选择有用样本,从而提高模型训练的效果。'}}}, {'id': 'https://huggingface.co/papers/2501.03262', 'title': 'REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models', 'url': 'https://huggingface.co/papers/2501.03262', 'abstract': 'Reinforcement Learning from Human Feedback (RLHF) has emerged as a critical approach for aligning large language models with human preferences, witnessing rapid algorithmic evolution through methods such as Proximal Policy Optimization (PPO), Direct Preference Optimization (DPO), REINFORCE Leave One-Out (RLOO), ReMax, and Group Relative Policy Optimization (GRPO). We present REINFORCE++, an enhanced variant of the classical REINFORCE algorithm that incorporates key optimization techniques from PPO while eliminating the need for a critic network. REINFORCE++ achieves three primary objectives: (1) simplicity (2) enhanced training stability, and (3) reduced computational overhead. Through extensive empirical evaluation, we demonstrate that REINFORCE++ exhibits superior stability compared to GRPO and achieves greater computational efficiency than PPO while maintaining comparable performance. The implementation is available at https://github.com/OpenRLHF/OpenRLHF.', 'score': 42, 'issue_id': 1553, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a05acf5aab0c07dd', 'authors': ['Jian Hu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.03262.jpg', 'data': {'categories': ['#training', '#rlhf', '#optimization', '#alignment'], 'emoji': '🤖', 'ru': {'title': 'REINFORCE++: Простой и эффективный алгоритм для RLHF', 'desc': 'В статье представлен REINFORCE++, улучшенная версия алгоритма REINFORCE для обучения с подкреплением на основе обратной связи от человека (RLHF). REINFORCE++ сочетает ключевые техники оптимизации из PPO, но не требует использования критической нейронной сети. Алгоритм отличается простотой, повышенной стабильностью обучения и сниженными вычислительными затратами. Эмпирические исследования показывают, что REINFORCE++ демонстрирует лучшую стабильность по сравнению с GRPO и большую вычислительную эффективность, чем PPO, при сохранении сопоставимой производительности.'}, 'en': {'title': 'REINFORCE++: Simplifying Reinforcement Learning with Human Feedback', 'desc': 'This paper introduces REINFORCE++, a new version of the REINFORCE algorithm designed to improve the training of reinforcement learning models using human feedback. 
It combines the strengths of Proximal Policy Optimization (PPO) while removing the need for a critic network, making it simpler and more efficient. The authors highlight that REINFORCE++ offers better training stability and lower computational costs compared to existing methods like GRPO and PPO. Their experiments show that REINFORCE++ performs well while being easier to use and faster to train.'}, 'zh': {'title': 'REINFORCE++:简化与高效的强化学习新选择', 'desc': '强化学习中的人类反馈(RLHF)是一种重要的方法,用于使大型语言模型更符合人类的偏好。本文提出了REINFORCE++,这是经典REINFORCE算法的增强版本,结合了PPO的优化技术,并且不再需要评论网络。REINFORCE++的主要目标是实现简单性、提高训练稳定性和减少计算开销。通过大量实证评估,我们证明了REINFORCE++在稳定性上优于GRPO,并且在计算效率上超过PPO,同时保持了相似的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02955', 'title': 'MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models', 'url': 'https://huggingface.co/papers/2501.02955', 'abstract': "In recent years, vision language models (VLMs) have made significant advancements in video understanding. However, a crucial capability - fine-grained motion comprehension - remains under-explored in current benchmarks. To address this gap, we propose MotionBench, a comprehensive evaluation benchmark designed to assess the fine-grained motion comprehension of video understanding models. MotionBench evaluates models' motion-level perception through six primary categories of motion-oriented question types and includes data collected from diverse sources, ensuring a broad representation of real-world video content. Experimental results reveal that existing VLMs perform poorly in understanding fine-grained motions. To enhance VLM's ability to perceive fine-grained motion within a limited sequence length of LLM, we conduct extensive experiments reviewing VLM architectures optimized for video feature compression and propose a novel and efficient Through-Encoder (TE) Fusion method. Experiments show that higher frame rate inputs and TE Fusion yield improvements in motion understanding, yet there is still substantial room for enhancement. Our benchmark aims to guide and motivate the development of more capable video understanding models, emphasizing the importance of fine-grained motion comprehension. Project page: https://motion-bench.github.io .", 'score': 30, 'issue_id': 1551, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'a7051c2d239484b4', 'authors': ['Wenyi Hong', 'Yean Cheng', 'Zhuoyi Yang', 'Weihan Wang', 'Lefan Wang', 'Xiaotao Gu', 'Shiyu Huang', 'Yuxiao Dong', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.02955.jpg', 'data': {'categories': ['#architecture', '#optimization', '#benchmark', '#video'], 'emoji': '🎥', 'ru': {'title': 'MotionBench: новый рубеж в понимании движения для моделей компьютерного зрения', 'desc': 'Статья представляет новый бенчмарк MotionBench для оценки способности моделей компьютерного зрения понимать детальные движения в видео. Авторы обнаружили, что существующие модели плохо справляются с этой задачей. Для улучшения результатов предложен новый метод Through-Encoder Fusion, а также использование видео с более высокой частотой кадров. Бенчмарк призван стимулировать развитие более совершенных моделей понимания видео.'}, 'en': {'title': 'Enhancing Video Understanding with Fine-Grained Motion Comprehension', 'desc': "This paper introduces MotionBench, a new benchmark for evaluating how well vision language models (VLMs) understand fine-grained motion in videos. 
It identifies a gap in current models' abilities to comprehend detailed motion, which is crucial for accurate video analysis. The benchmark includes various motion-oriented question types and diverse video data to ensure comprehensive testing. The authors also propose a Through-Encoder Fusion method to improve VLM performance, highlighting the need for further advancements in fine-grained motion comprehension."}, 'zh': {'title': '提升视频理解的细粒度运动能力', 'desc': '近年来,视觉语言模型(VLMs)在视频理解方面取得了显著进展。然而,细粒度运动理解这一关键能力在当前基准测试中仍未得到充分探索。为了解决这一问题,我们提出了MotionBench,这是一个全面的评估基准,旨在评估视频理解模型的细粒度运动理解能力。实验结果表明,现有的VLM在理解细粒度运动方面表现不佳,因此我们提出了一种新颖的Through-Encoder(TE)融合方法,以提高模型的运动理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.03575', 'title': 'Cosmos World Foundation Model Platform for Physical AI', 'url': 'https://huggingface.co/papers/2501.03575', 'abstract': 'Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.', 'score': 25, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'f4b2044cbc1076a8', 'authors': ['NVIDIA', ':', 'Niket Agarwal', 'Arslan Ali', 'Maciej Bala', 'Yogesh Balaji', 'Erik Barker', 'Tiffany Cai', 'Prithvijit Chattopadhyay', 'Yongxin Chen', 'Yin Cui', 'Yifan Ding', 'Daniel Dworakowski', 'Jiaojiao Fan', 'Michele Fenzi', 'Francesco Ferroni', 'Sanja Fidler', 'Dieter Fox', 'Songwei Ge', 'Yunhao Ge', 'Jinwei Gu', 'Siddharth Gururani', 'Ethan He', 'Jiahui Huang', 'Jacob Huffman', 'Pooya Jannaty', 'Jingyi Jin', 'Seung Wook Kim', 'Gergely Klár', 'Grace Lam', 'Shiyi Lan', 'Laura Leal-Taixe', 'Anqi Li', 'Zhaoshuo Li', 'Chen-Hsuan Lin', 'Tsung-Yi Lin', 'Huan Ling', 'Ming-Yu Liu', 'Xian Liu', 'Alice Luo', 'Qianli Ma', 'Hanzi Mao', 'Kaichun Mo', 'Arsalan Mousavian', 'Seungjun Nah', 'Sriharsha Niverty', 'David Page', 'Despoina Paschalidou', 'Zeeshan Patel', 'Lindsey Pavao', 'Morteza Ramezanali', 'Fitsum Reda', 'Xiaowei Ren', 'Vasanth Rao Naik Sabavat', 'Ed Schmerling', 'Stella Shi', 'Bartosz Stefaniak', 'Shitao Tang', 'Lyne Tchapmi', 'Przemek Tredak', 'Wei-Cheng Tseng', 'Jibin Varghese', 'Hao Wang', 'Haoxiang Wang', 'Heng Wang', 'Ting-Chun Wang', 'Fangyin Wei', 'Xinyue Wei', 'Jay Zhangjie Wu', 'Jiashu Xu', 'Wei Yang', 'Lin Yen-Chen', 'Xiaohui Zeng', 'Yu Zeng', 'Jing Zhang', 'Qinsheng Zhang', 'Yuxuan Zhang', 'Qingqing Zhao', 'Artur Zolkowski'], 'affiliations': ['NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03575.jpg', 'data': {'categories': ['#open_source', '#data', '#benchmark', '#architecture', '#video', '#multimodal', '#dataset', '#training'], 'emoji': '🌍', 'ru': {'title': 'Цифровой двойник мира для обучения физического ИИ', 'desc': 'Статья представляет платформу Cosmos World Foundation Model для разработки моделей мира в физическом ИИ. 
Авторы предлагают концепцию базовой модели мира, которую можно дообучать для конкретных приложений. Платформа включает конвейер курации видео, предобученные базовые модели мира, примеры дообучения и токенизаторы видео. Проект открытый и доступен на GitHub для помощи разработчикам физического ИИ в решении важных проблем общества.'}, 'en': {'title': 'Empowering Physical AI with Customizable World Models', 'desc': 'This paper introduces the Cosmos World Foundation Model Platform, designed to assist developers in creating tailored world models for Physical AI systems. It emphasizes the necessity of having a digital twin of both the AI and its environment to enable effective training. The platform includes a comprehensive video curation pipeline, pre-trained models, and tools for fine-tuning these models for specific applications. By making the platform and models open-source, the authors aim to empower developers to address significant societal challenges using Physical AI.'}, 'zh': {'title': '构建物理AI的数字双胞胎与世界模型', 'desc': '这篇论文介绍了物理人工智能(Physical AI)在数字训练中的重要性。为了实现这一目标,需要构建一个数字双胞胎(digital twin)和一个世界模型(world model)。我们提出了Cosmos世界基础模型平台,帮助开发者为物理人工智能定制世界模型。该平台提供了视频策划管道、预训练的世界基础模型以及后训练示例,旨在解决社会中的关键问题,并且是开源的。'}}}, {'id': 'https://huggingface.co/papers/2501.03895', 'title': 'LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One Vision Token', 'url': 'https://huggingface.co/papers/2501.03895', 'abstract': 'The advent of real-time large multimodal models (LMMs) like GPT-4o has sparked considerable interest in efficient LMMs. LMM frameworks typically encode visual inputs into vision tokens (continuous representations) and integrate them and textual instructions into the context of large language models (LLMs), where large-scale parameters and numerous context tokens (predominantly vision tokens) result in substantial computational overhead. Previous efforts towards efficient LMMs always focus on replacing the LLM backbone with smaller models, while neglecting the crucial issue of token quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal vision tokens. To achieve a high compression ratio of vision tokens while preserving visual information, we first analyze how LMMs understand vision tokens and find that most vision tokens only play a crucial role in the early layers of LLM backbone, where they mainly fuse visual information into text tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to fuse visual information into text tokens in advance, thereby facilitating the extreme compression of vision tokens fed to LLM backbone into one token. LLaVA-Mini is a unified large multimodal model that can support the understanding of images, high-resolution images, and videos in an efficient manner. Experiments across 11 image-based and 7 video-based benchmarks demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token instead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by 77%, deliver low-latency responses within 40 milliseconds, and process over 10,000 frames of video on the GPU hardware with 24GB of memory.', 'score': 19, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '925d2f81d6fcbb0b', 'authors': ['Shaolei Zhang', 'Qingkai Fang', 'Zhe Yang', 'Yang Feng'], 'affiliations': ['Key Laboratory of AI Safety, Chinese Academy of Sciences', 'Key Laboratory of Intelligent Information Processing, Institute of Computing Technology, Chinese Academy of Sciences (ICT/CAS)', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03895.jpg', 'data': {'categories': ['#agi', '#video', '#multimodal', '#architecture', '#optimization', '#cv', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Эффективность через минимизацию: революция в мультимодальных моделях', 'desc': 'Статья представляет LLaVA-Mini - эффективную мультимодальную модель с минимальным количеством визуальных токенов. Авторы обнаружили, что большинство визуальных токенов играют ключевую роль только в ранних слоях языковой модели. LLaVA-Mini вводит предварительное слияние модальностей, чтобы объединить визуальную информацию с текстовыми токенами заранее. Эксперименты показывают, что LLaVA-Mini превосходит LLaVA-v1.5, используя всего 1 визуальный токен вместо 576, что значительно повышает эффективность обработки.'}, 'en': {'title': 'Maximizing Efficiency with Minimal Vision Tokens in LMMs', 'desc': 'This paper presents LLaVA-Mini, an efficient large multimodal model (LMM) designed to reduce the number of vision tokens while maintaining visual information integrity. The authors identify that most vision tokens are primarily important in the early layers of the language model, where they integrate visual data with text. By implementing a technique called modality pre-fusion, LLaVA-Mini compresses the input from 576 vision tokens to just one, significantly enhancing efficiency. Experimental results show that LLaVA-Mini not only outperforms its predecessor but also achieves a 77% reduction in computational load and rapid processing times for high-resolution images and videos.'}, 'zh': {'title': '高效多模态模型LLaVA-Mini的创新之路', 'desc': '本文介绍了一种高效的多模态模型LLaVA-Mini,该模型通过减少视觉标记的数量来提高效率。研究发现,大多数视觉标记在大型语言模型的早期层中起着关键作用,因此可以在此之前将视觉信息与文本标记融合。LLaVA-Mini采用了模态预融合的方法,将视觉信息提前融合,从而将输入到语言模型的视觉标记压缩为一个标记。实验结果表明,LLaVA-Mini在多个基准测试中表现优于之前的模型,且显著降低了计算复杂度和延迟。'}}}, {'id': 'https://huggingface.co/papers/2501.04001', 'title': 'Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos', 'url': 'https://huggingface.co/papers/2501.04001', 'abstract': 'This work presents Sa2VA, the first unified model for dense grounded understanding of both images and videos. Unlike existing multi-modal large language models, which are often limited to specific modalities and tasks, Sa2VA supports a wide range of image and video tasks, including referring segmentation and conversation, with minimal one-shot instruction tuning. Sa2VA combines SAM-2, a foundation video segmentation model, with LLaVA, an advanced vision-language model, and unifies text, image, and video into a shared LLM token space. Using the LLM, Sa2VA generates instruction tokens that guide SAM-2 in producing precise masks, enabling a grounded, multi-modal understanding of both static and dynamic visual content. 
Additionally, we introduce Ref-SAV, an auto-labeled dataset containing over 72k object expressions in complex video scenes, designed to boost model performance. We also manually validate 2k video objects in the Ref-SAV datasets to benchmark referring video object segmentation in complex environments. Experiments show that Sa2VA achieves state-of-the-art across multiple tasks, particularly in referring video object segmentation, highlighting its potential for complex real-world applications.', 'score': 16, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'd079946bf74858cd', 'authors': ['Haobo Yuan', 'Xiangtai Li', 'Tao Zhang', 'Zilong Huang', 'Shilin Xu', 'Shunping Ji', 'Yunhai Tong', 'Lu Qi', 'Jiashi Feng', 'Ming-Hsuan Yang'], 'affiliations': ['Bytedance Seed', 'Peking University', 'UC Merced', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04001.jpg', 'data': {'categories': ['#dataset', '#multimodal', '#benchmark', '#cv'], 'emoji': '🎥', 'ru': {'title': 'Sa2VA: Единая модель для понимания изображений и видео', 'desc': 'Sa2VA - это первая унифицированная модель для плотного заземленного понимания изображений и видео. Она объединяет SAM-2 (модель сегментации видео) с LLaVA (продвинутой моделью компьютерного зрения и языка) в едином пространстве токенов большой языковой модели. Sa2VA генерирует токены инструкций, направляющие SAM-2 в создании точных масок, что позволяет осуществлять заземленное мультимодальное понимание как статического, так и динамического визуального контента. Модель достигает передовых результатов в различных задачах, особенно в сегментации объектов по ссылкам в видео.'}, 'en': {'title': 'Sa2VA: Unifying Image and Video Understanding for Enhanced Multi-Modal Tasks', 'desc': 'Sa2VA is a groundbreaking model that integrates image and video understanding into a single framework. It combines the strengths of SAM-2 for video segmentation and LLaVA for vision-language tasks, allowing it to handle various multi-modal tasks with minimal tuning. By creating a shared token space for text, images, and videos, Sa2VA can generate specific instruction tokens that help in accurately segmenting objects in both images and videos. The introduction of the Ref-SAV dataset further enhances its capabilities, enabling it to achieve top performance in complex visual environments.'}, 'zh': {'title': 'Sa2VA:图像与视频的统一理解模型', 'desc': '本研究提出了Sa2VA,这是第一个统一的模型,能够对图像和视频进行密集的基础理解。与现有的多模态大型语言模型不同,Sa2VA支持多种图像和视频任务,包括引用分割和对话,且只需最少的一次性指令调优。Sa2VA结合了基础视频分割模型SAM-2和先进的视觉语言模型LLaVA,将文本、图像和视频统一到共享的LLM令牌空间中。实验表明,Sa2VA在多个任务上达到了最先进的水平,特别是在引用视频对象分割方面,展示了其在复杂现实应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.03847', 'title': 'Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control', 'url': 'https://huggingface.co/papers/2501.03847', 'abstract': 'Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. 
Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation.', 'score': 11, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '975d5fa9d59bde28', 'authors': ['Zekai Gu', 'Rui Yan', 'Jiahao Lu', 'Peng Li', 'Zhiyang Dou', 'Chenyang Si', 'Zhen Dong', 'Qifeng Liu', 'Cheng Lin', 'Ziwei Liu', 'Wenping Wang', 'Yuan Liu'], 'affiliations': ['Hong Kong University of Science and Technology, China', 'Nanyang Technological University, Singapore', 'Texas A&M University, U.S.A', 'The University of Hong Kong, China', 'Wuhan University, China', 'Zhejiang University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03847.jpg', 'data': {'categories': ['#video', '#diffusion', '#3d'], 'emoji': '🎬', 'ru': {'title': 'DaS: Универсальный контроль над генерацией видео через 3D-сигналы', 'desc': 'Авторы представляют новый подход под названием Diffusion as Shader (DaS) для контролируемой генерации видео с помощью диффузионных моделей. В отличие от существующих методов, ограниченных одним типом контроля, DaS поддерживает множество задач управления видео в единой архитектуре. Ключевая идея заключается в использовании 3D-сигналов управления, что делает процесс диффузии видео изначально 3D-ориентированным. DaS демонстрирует сильные возможности управления в различных задачах, включая генерацию видео из 3D-моделей, контроль камеры, перенос движения и манипуляции с объектами.'}, 'en': {'title': 'Empowering Video Generation with 3D Control Signals', 'desc': 'This paper presents Diffusion as Shader (DaS), a new method for generating videos that allows for precise control over various aspects of video creation. Unlike previous models that only used 2D control signals, DaS utilizes 3D tracking videos, which helps in managing the dynamic nature of video content. This approach enables users to manipulate video elements like camera angles and object movements more effectively. The results show that DaS can maintain high-quality video generation while ensuring temporal consistency across frames, even with limited training data.'}, 'zh': {'title': '多样化视频控制的新方法:扩散作为着色器', 'desc': '扩散模型在从文本提示或图像生成高质量视频方面表现出色。然而,精确控制视频生成过程,如相机操作或内容编辑,仍然是一个重大挑战。现有的受控视频生成方法通常仅限于单一控制类型,缺乏处理多样化控制需求的灵活性。本文提出了一种新方法——扩散作为着色器(DaS),它在统一架构中支持多种视频控制任务,利用3D控制信号来实现更灵活的视频控制。'}}}, {'id': 'https://huggingface.co/papers/2501.03936', 'title': 'PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides', 'url': 'https://huggingface.co/papers/2501.03936', 'abstract': 'Automatically generating presentations from documents is a challenging task that requires balancing content quality, visual design, and structural coherence. 
Existing methods primarily focus on improving and evaluating the content quality in isolation, often overlooking visual design and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to understand their structural patterns and content schemas, then drafts outlines and generates slides through code actions to ensure consistency and alignment. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Experiments show that PPTAgent significantly outperforms traditional automatic presentation generation methods across all three dimensions. The code and data are available at https://github.com/icip-cas/PPTAgent.', 'score': 7, 'issue_id': 1557, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '57bb4703056c9e20', 'authors': ['Hao Zheng', 'Xinyan Guan', 'Hao Kong', 'Jia Zheng', 'Hongyu Lin', 'Yaojie Lu', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences', 'Shanghai Jiexin Technology', 'University of Chinese Academy of Sciences'], 'pdf_title_img': 'assets/pdf/title_img/2501.03936.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'PPTAgent: ИИ-помощник для создания презентаций нового уровня', 'desc': 'Исследователи представили PPTAgent - систему для автоматического создания презентаций из документов. В отличие от существующих методов, PPTAgent улучшает не только качество контента, но и визуальный дизайн и структурную согласованность. Система использует двухэтапный подход, вдохновленный рабочим процессом человека: сначала анализирует образцы презентаций, затем создает слайды с помощью программных действий. Авторы также разработали фреймворк PPTEval для комплексной оценки генерируемых презентаций.'}, 'en': {'title': 'PPTAgent: Elevating Presentation Generation with Content, Design, and Coherence', 'desc': 'This paper presents PPTAgent, a novel approach for automatically generating presentations from documents. Unlike existing methods that focus solely on content quality, PPTAgent enhances the overall presentation by considering visual design and structural coherence as well. It employs a two-stage, edit-based process that first analyzes reference presentations to extract patterns and then generates slides through code actions. 
Additionally, the authors introduce PPTEval, a framework for evaluating presentations based on content, design, and coherence, demonstrating that PPTAgent outperforms traditional methods in all areas.'}, 'zh': {'title': '智能生成高质量演示文稿的解决方案', 'desc': '本文提出了一种名为PPTAgent的自动生成演示文稿的方法。该方法通过两阶段的编辑式流程,综合考虑内容质量、视觉设计和结构一致性。PPTAgent首先分析参考演示文稿,以理解其结构模式和内容框架,然后通过代码操作草拟大纲并生成幻灯片。为了全面评估生成演示文稿的质量,本文还引入了PPTEval评估框架,从内容、设计和一致性三个维度进行评估。'}}}, {'id': 'https://huggingface.co/papers/2501.03714', 'title': 'MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval Adjustment for Compact Dynamic 3D Gaussian Splatting', 'url': 'https://huggingface.co/papers/2501.03714', 'abstract': '3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting framework designed for reconstructing novel views in challenging scenarios with complex motions. We introduce Global-to-Local Motion Decomposition (GLMD) to effectively capture dynamic motions in a coarse-to-fine manner. This approach leverages Global Canonical Scaffolds (Global CS) and Local Canonical Scaffolds (Local CS), extending static Scaffold representation to dynamic video reconstruction. For Global CS, we propose Global Anchor Deformation (GAD) to efficiently represent global dynamics along complex motions, by directly deforming the implicit Scaffold attributes which are anchor position, offset, and local context features. Next, we finely adjust local motions via the Local Gaussian Deformation (LGD) of Local CS explicitly. Additionally, we introduce Temporal Interval Adjustment (TIA) to automatically control the temporal coverage of each Local CS during training, allowing MoDecGS to find optimal interval assignments based on the specified number of temporal segments. Extensive evaluations demonstrate that MoDecGS achieves an average 70% reduction in model size over state-of-the-art methods for dynamic 3D Gaussians from real-world dynamic videos while maintaining or even improving rendering quality.', 'score': 5, 'issue_id': 1556, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c6cfa761edc047da', 'authors': ['Sangwoon Kwak', 'Joonsoo Kim', 'Jun Young Jeong', 'Won-Sik Cheong', 'Jihyong Oh', 'Munchurl Kim'], 'affiliations': ['Chung-Ang University', 'Electronics and Telecommunications Research Institute', 'Korea Advanced Institute of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.03714.jpg', 'data': {'categories': ['#3d'], 'emoji': '🎭', 'ru': {'title': 'Эффективное представление сложных движений в динамических сценах', 'desc': 'MoDecGS - это новый фреймворк для эффективной реконструкции динамических сцен с использованием 3D Gaussian Splatting. Он вводит метод Global-to-Local Motion Decomposition (GLMD) для захвата сложных движений, используя Global Canonical Scaffolds и Local Canonical Scaffolds. Фреймворк также включает Global Anchor Deformation (GAD) для представления глобальной динамики и Local Gaussian Deformation (LGD) для точной настройки локальных движений. 
MoDecGS демонстрирует значительное сокращение размера модели при сохранении или улучшении качества рендеринга по сравнению с существующими методами.'}, 'en': {'title': 'Efficient Dynamic Scene Rendering with MoDecGS', 'desc': 'The paper presents MoDecGS, a new framework for 3D Gaussian Splatting that efficiently handles dynamic scenes in neural rendering. It introduces Global-to-Local Motion Decomposition (GLMD) to capture complex motions using both Global and Local Canonical Scaffolds. The method employs Global Anchor Deformation (GAD) for global dynamics and Local Gaussian Deformation (LGD) for fine-tuning local motions. MoDecGS significantly reduces model size by 70% compared to existing methods while enhancing rendering quality, making it suitable for real-world dynamic video reconstruction.'}, 'zh': {'title': '高效动态场景重建的新方法', 'desc': '3D高斯点云(3DGS)在场景表示和神经渲染方面取得了显著进展,但在处理动态场景时仍面临存储需求和复杂运动表示的挑战。为了解决这些问题,我们提出了MoDecGS,一个内存高效的高斯点云框架,旨在重建具有复杂运动的新视角。我们引入了全局到局部运动分解(GLMD),以粗到细的方式有效捕捉动态运动,并扩展了静态支架表示以适应动态视频重建。通过全局锚点变形(GAD)和局部高斯变形(LGD),MoDecGS在保持或提高渲染质量的同时,平均减少了70%的模型大小。'}}}, {'id': 'https://huggingface.co/papers/2501.03931', 'title': 'Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.03931', 'abstract': 'We present Magic Mirror, a framework for generating identity-preserved videos with cinematic-level quality and dynamic motion. While recent advances in video diffusion models have shown impressive capabilities in text-to-video generation, maintaining consistent identity while producing natural motion remains challenging. Previous methods either require person-specific fine-tuning or struggle to balance identity preservation with motion diversity. Built upon Video Diffusion Transformers, our method introduces three key components: (1) a dual-branch facial feature extractor that captures both identity and structural features, (2) a lightweight cross-modal adapter with Conditioned Adaptive Normalization for efficient identity integration, and (3) a two-stage training strategy combining synthetic identity pairs with video data. Extensive experiments demonstrate that Magic Mirror effectively balances identity consistency with natural motion, outperforming existing methods across multiple metrics while requiring minimal parameters added. The code and model will be made publicly available at: https://github.com/dvlab-research/MagicMirror/', 'score': 4, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '1c9696a99b57f781', 'authors': ['Yuechen Zhang', 'Yaoyang Liu', 'Bin Xia', 'Bohao Peng', 'Zexin Yan', 'Eric Lo', 'Jiaya Jia'], 'affiliations': ['CMU', 'CUHK', 'HKUST', 'SmartMore'], 'pdf_title_img': 'assets/pdf/title_img/2501.03931.jpg', 'data': {'categories': ['#training', '#video', '#multimodal', '#open_source', '#synthetic', '#architecture', '#diffusion'], 'emoji': '🪞', 'ru': {'title': 'Магическое зеркало: видео с сохранением личности и естественным движением', 'desc': 'Magic Mirror - это новая система для создания видео с сохранением идентичности и кинематографическим качеством. Она использует модель видеодиффузии и вводит три ключевых компонента: двойной экстрактор лицевых признаков, легкий кросс-модальный адаптер и двухэтапную стратегию обучения. Система эффективно сочетает сохранение идентичности с естественным движением, превосходя существующие методы по нескольким метрикам. 
Magic Mirror требует минимального добавления параметров и будет доступна в открытом доступе.'}, 'en': {'title': 'Magic Mirror: Identity-Preserved Video Generation with Cinematic Quality', 'desc': 'Magic Mirror is a new framework designed to create high-quality videos that maintain the identity of individuals while showcasing dynamic motion. It addresses the challenges faced by previous video generation methods, which often struggled to keep a consistent identity or required extensive fine-tuning for specific individuals. The framework utilizes Video Diffusion Transformers and introduces innovative components like a dual-branch facial feature extractor and a cross-modal adapter to enhance identity integration. Through a two-stage training approach, Magic Mirror achieves a remarkable balance between identity preservation and natural motion, outperforming existing techniques with fewer additional parameters.'}, 'zh': {'title': 'Magic Mirror:保持身份一致的动态视频生成', 'desc': '本文介绍了Magic Mirror,一个用于生成保持身份一致的视频框架,具有电影级质量和动态运动。尽管最近的视频扩散模型在文本到视频生成方面取得了显著进展,但在生成自然运动的同时保持一致的身份仍然具有挑战性。我们的方法基于视频扩散变换器,提出了三个关键组件,以有效整合身份信息并保持运动多样性。实验结果表明,Magic Mirror在多个指标上超越了现有方法,同时增加的参数极少。'}}}, {'id': 'https://huggingface.co/papers/2501.03916', 'title': 'Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback', 'url': 'https://huggingface.co/papers/2501.03916', 'abstract': 'The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers which are ranked by the topic and task attributes. Then, the codes are automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on the benchmark datasets of different topics and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. 
We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification.', 'score': 3, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '9a18a60e788b7840', 'authors': ['Jiakang Yuan', 'Xiangchao Yan', 'Botian Shi', 'Tao Chen', 'Wanli Ouyang', 'Bo Zhang', 'Lei Bai', 'Yu Qiao', 'Bowen Zhou'], 'affiliations': ['Fudan University', 'Shanghai Artificial Intelligence Laboratory'], 'pdf_title_img': 'assets/pdf/title_img/2501.03916.jpg', 'data': {'categories': ['#open_source', '#agents', '#science', '#3d', '#cv', '#benchmark', '#dataset'], 'emoji': '🐬', 'ru': {'title': 'Dolphin: ИИ-ассистент для полного цикла научных исследований', 'desc': 'Статья представляет Dolphin - первую замкнутую систему для автоматического проведения научных исследований. Dolphin генерирует идеи на основе релевантных статей, автоматически создает и отлаживает код для экспериментов, а затем анализирует результаты. Система способна непрерывно генерировать новые идеи и проводить эксперименты в цикле. Эксперименты показали, что Dolphin может предлагать методы, сопоставимые с современными подходами в некоторых задачах машинного обучения.'}, 'en': {'title': 'Dolphin: Automating Scientific Research with AI', 'desc': 'This paper introduces Dolphin, an innovative framework designed to automate the scientific research process. Dolphin operates in a closed-loop system, generating research ideas, conducting experiments, and analyzing results to refine future ideas. It utilizes AI to rank relevant literature and automatically generate and debug code, enhancing research efficiency. The framework has been tested on various benchmark datasets, demonstrating its ability to produce novel ideas and achieve results comparable to leading methods in tasks like image classification.'}, 'zh': {'title': 'Dolphin:自动化科学研究的新纪元', 'desc': '这篇论文介绍了一个名为Dolphin的闭环开放式自动研究框架,旨在提升科学研究的效率。Dolphin能够生成研究想法、进行实验,并根据实验结果反馈生成更高质量的想法。具体来说,Dolphin首先根据相关论文生成新想法,然后自动生成和调试代码,最后分析每个想法的结果并反馈到下一轮生成中。实验结果表明,Dolphin能够持续生成新想法,并在循环中完成实验,且在某些任务上与最先进的方法相当。'}}}, {'id': 'https://huggingface.co/papers/2501.02260', 'title': 'MagicFace: High-Fidelity Facial Expression Editing with Action-Unit Control', 'url': 'https://huggingface.co/papers/2501.02260', 'abstract': "We address the problem of facial expression editing by controling the relative variation of facial action-unit (AU) from the same person. This enables us to edit this specific person's expression in a fine-grained, continuous and interpretable manner, while preserving their identity, pose, background and detailed facial attributes. Key to our model, which we dub MagicFace, is a diffusion model conditioned on AU variations and an ID encoder to preserve facial details of high consistency. Specifically, to preserve the facial details with the input identity, we leverage the power of pretrained Stable-Diffusion models and design an ID encoder to merge appearance features through self-attention. To keep background and pose consistency, we introduce an efficient Attribute Controller by explicitly informing the model of current background and pose of the target. By injecting AU variations into a denoising UNet, our model can animate arbitrary identities with various AU combinations, yielding superior results in high-fidelity expression editing compared to other facial expression editing works. 
Code is publicly available at https://github.com/weimengting/MagicFace.", 'score': 3, 'issue_id': 1550, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '9eeeb5b132839793', 'authors': ['Mengting Wei', 'Tuomas Varanka', 'Xingxun Jiang', 'Huai-Qian Khor', 'Guoying Zhao'], 'affiliations': ['Center for Machine Vision and Signal Analysis, Faculty of Information Technology and Electrical Engineering, University of Oulu, Oulu, FI-90014, Finland', 'Key Laboratory of Child Development and Learning Science of Ministry of Education, School of Biological Sciences and Medical Engineering, Southeast University, Nanjing 210096, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02260.jpg', 'data': {'categories': ['#multimodal', '#diffusion', '#open_source', '#cv'], 'emoji': '🎭', 'ru': {'title': 'Точное редактирование мимики с сохранением личности', 'desc': 'Статья представляет новый подход к редактированию мимики лица с использованием диффузионной модели, названной MagicFace. Модель позволяет точно и интерпретируемо изменять выражение лица конкретного человека, сохраняя его идентичность, позу и фоновые детали. Ключевым элементом является условная генерация на основе вариаций лицевых единиц действия (AU) и использование ID-энкодера для сохранения деталей лица. MagicFace демонстрирует превосходные результаты в высококачественном редактировании выражений лица по сравнению с другими методами.'}, 'en': {'title': 'MagicFace: Fine-Grained Facial Expression Editing with Consistent Identity', 'desc': 'This paper presents a method for editing facial expressions while maintaining the identity and other attributes of the person. The proposed model, named MagicFace, utilizes a diffusion model that is conditioned on facial action unit (AU) variations, allowing for fine-grained control over expressions. It incorporates a pretrained Stable-Diffusion model and an ID encoder to ensure high consistency in facial details. Additionally, an Attribute Controller is introduced to maintain background and pose consistency during the editing process, resulting in high-fidelity expression animations.'}, 'zh': {'title': '魔法面孔:高保真面部表情编辑的创新之路', 'desc': '我们提出了一种面部表情编辑的方法,通过控制同一人的面部动作单元(AU)的相对变化来实现。这种方法可以细致、连续且可解释地编辑特定人的表情,同时保持他们的身份、姿势、背景和面部细节。我们的模型称为MagicFace,核心是一个基于AU变化的扩散模型和一个ID编码器,以保持面部细节的一致性。通过将AU变化注入去噪UNet,我们的模型能够以高保真度编辑面部表情,效果优于其他相关工作。'}}}, {'id': 'https://huggingface.co/papers/2501.02790', 'title': 'Segmenting Text and Learning Their Rewards for Improved RLHF in Language Model', 'url': 'https://huggingface.co/papers/2501.02790', 'abstract': 'Reinforcement learning from human feedback (RLHF) has been widely adopted to align language models (LMs) with human preference. Prior RLHF works typically take a bandit formulation, which, though intuitive, ignores the sequential nature of LM generation and can suffer from the sparse reward issue. While recent works propose dense token-level RLHF, treating each token as an action may be oversubtle to proper reward assignment. In this paper, we seek to get the best of both by training and utilizing a segment-level reward model, which assigns a reward to each semantically complete text segment that spans over a short sequence of tokens. For reward learning, our method allows dynamic text segmentation and compatibility with standard sequence-preference datasets. 
For effective RL-based LM training against segment reward, we generalize the classical scalar bandit reward normalizers into location-aware normalizer functions and interpolate the segment reward for further densification. With these designs, our method performs competitively on three popular RLHF benchmarks for LM policy: AlpacaEval 2.0, Arena-Hard, and MT-Bench. Ablation studies are conducted to further demonstrate our method.', 'score': 2, 'issue_id': 1562, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'bd19e4a3e48539d4', 'authors': ['Yueqin Yin', 'Shentao Yang', 'Yujia Xie', 'Ziyi Yang', 'Yuting Sun', 'Hany Awadalla', 'Weizhu Chen', 'Mingyuan Zhou'], 'affiliations': ['Microsoft', 'The University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.02790.jpg', 'data': {'categories': ['#training', '#reasoning', '#alignment', '#rlhf', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Сегментарный RLHF: золотая середина между токенами и бандитами', 'desc': 'Данная статья представляет новый подход к обучению языковых моделей с подкреплением на основе обратной связи от человека (RLHF). Авторы предлагают использовать сегментарную модель вознаграждения, которая присваивает награду семантически завершенным текстовым сегментам. Метод позволяет динамическую сегментацию текста и совместим со стандартными наборами данных последовательных предпочтений. Для эффективного RL-обучения языковой модели авторы обобщают классические нормализаторы скалярного бандитного вознаграждения в локально-зависимые функции нормализации.'}, 'en': {'title': 'Enhancing Language Models with Segment-Level Rewards in RLHF', 'desc': 'This paper discusses a new approach to Reinforcement Learning from Human Feedback (RLHF) for language models (LMs). It critiques previous methods that treat the task as a bandit problem, which can overlook the sequential nature of text generation and lead to sparse rewards. The authors propose a segment-level reward model that assigns rewards to complete text segments, improving reward assignment. Their method incorporates dynamic text segmentation and enhances training efficiency by using location-aware normalizer functions, showing competitive results on established RLHF benchmarks.'}, 'zh': {'title': '段落级奖励模型:强化学习的新突破', 'desc': '本论文探讨了如何通过人类反馈进行强化学习(RLHF),以使语言模型(LM)更符合人类偏好。以往的RLHF研究通常采用赌博机模型,但这种方法忽视了语言模型生成的序列特性,并可能面临稀疏奖励的问题。我们提出了一种基于段落级奖励模型的方法,为每个语义完整的文本段落分配奖励,从而克服了以往方法的不足。通过动态文本分割和与标准序列偏好数据集的兼容性,我们的方法在多个RLHF基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.02393', 'title': 'Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers', 'url': 'https://huggingface.co/papers/2501.02393', 'abstract': "We present an approach to modifying Transformer architectures by integrating graph-aware relational reasoning into the attention mechanism, merging concepts from graph neural networks and language modeling. Building on the inherent connection between attention and graph theory, we reformulate the Transformer's attention mechanism as a graph operation and propose Graph-Aware Isomorphic Attention. This method leverages advanced graph modeling strategies, including Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), to enrich the representation of relational structures. Our approach captures complex dependencies and generalizes across tasks, as evidenced by a reduced generalization gap and improved learning performance. 
Additionally, we expand the concept of graph-aware attention to introduce Sparse GIN-Attention, a fine-tuning approach that employs sparse GINs. By interpreting attention matrices as sparse adjacency graphs, this technique enhances the adaptability of pre-trained foundational models with minimal computational overhead, endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning achieves improved training dynamics and better generalization compared to alternative methods like low-rank adaption (LoRA). We discuss latent graph-like structures within traditional attention mechanisms, offering a new lens through which Transformers can be understood. By evolving Transformers as hierarchical GIN models for relational reasoning. This perspective suggests profound implications for foundational model development, enabling the design of architectures that dynamically adapt to both local and global dependencies. Applications in bioinformatics, materials science, language modeling, and beyond could benefit from this synthesis of relational and sequential data modeling, setting the stage for interpretable and generalizable modeling strategies.", 'score': 1, 'issue_id': 1563, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a200448c9795e159', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics (LAMM) MIT Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.02393.jpg', 'data': {'categories': ['#graphs', '#architecture', '#interpretability', '#training'], 'emoji': '🕸️', 'ru': {'title': 'Трансформеры эволюционируют в графовые модели для реляционного рассуждения', 'desc': 'Статья представляет новый подход к модификации архитектуры Трансформеров путем интеграции графового реляционного рассуждения в механизм внимания. Авторы переформулируют механизм внимания Трансформера как графовую операцию и предлагают Graph-Aware Isomorphic Attention, используя стратегии моделирования графов, такие как Graph Isomorphism Networks (GIN) и Principal Neighborhood Aggregation (PNA). Метод позволяет улучшить представление реляционных структур, уменьшить разрыв в обобщении и повысить производительность обучения. Также предложен метод тонкой настройки Sparse GIN-Attention, который интерпретирует матрицы внимания как разреженные графы смежности, улучшая адаптивность предобученных моделей.'}, 'en': {'title': 'Transforming Attention: Merging Graphs and Transformers for Enhanced Learning', 'desc': 'This paper introduces a new way to enhance Transformer models by incorporating graph-based reasoning into their attention mechanisms. By treating attention as a graph operation, the authors propose a method called Graph-Aware Isomorphic Attention, which utilizes advanced graph techniques to better capture relationships in data. They also present Sparse GIN-Attention, a fine-tuning method that interprets attention matrices as sparse graphs, improving the adaptability of pre-trained models with less computational cost. 
Overall, this approach not only improves learning performance but also opens up new possibilities for applying Transformers in various fields like bioinformatics and language modeling.'}, 'zh': {'title': '图感知注意力:Transformer的新视角', 'desc': '本文提出了一种通过将图感知关系推理整合到注意力机制中来修改Transformer架构的方法。这种方法将Transformer的注意力机制重新表述为图操作,并提出了图感知同构注意力(Graph-Aware Isomorphic Attention)。该方法利用图同构网络(GIN)和主邻域聚合(PNA)等先进的图建模策略,增强了关系结构的表示能力。通过引入稀疏GIN注意力(Sparse GIN-Attention),我们展示了如何在保持计算效率的同时,提升预训练模型的适应性和泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08313', 'title': 'MiniMax-01: Scaling Foundation Models with Lightning Attention', 'url': 'https://huggingface.co/papers/2501.08313', 'abstract': 'We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. 
We publicly release MiniMax-01 at https://github.com/MiniMax-AI.', 'score': 192, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'a57d7b1914e7383a', 'authors': ['MiniMax', 'Aonian Li', 'Bangwei Gong', 'Bo Yang', 'Boji Shan', 'Chang Liu', 'Cheng Zhu', 'Chunhao Zhang', 'Congchao Guo', 'Da Chen', 'Dong Li', 'Enwei Jiao', 'Gengxin Li', 'Guojun Zhang', 'Haohai Sun', 'Houze Dong', 'Jiadai Zhu', 'Jiaqi Zhuang', 'Jiayuan Song', 'Jin Zhu', 'Jingtao Han', 'Jingyang Li', 'Junbin Xie', 'Junhao Xu', 'Junjie Yan', 'Kaishun Zhang', 'Kecheng Xiao', 'Kexi Kang', 'Le Han', 'Leyang Wang', 'Lianfei Yu', 'Liheng Feng', 'Lin Zheng', 'Linbo Chai', 'Long Xing', 'Meizhi Ju', 'Mingyuan Chi', 'Mozhi Zhang', 'Peikai Huang', 'Pengcheng Niu', 'Pengfei Li', 'Pengyu Zhao', 'Qi Yang', 'Qidi Xu', 'Qiexiang Wang', 'Qin Wang', 'Qiuhui Li', 'Ruitao Leng', 'Shengmin Shi', 'Shuqi Yu', 'Sichen Li', 'Songquan Zhu', 'Tao Huang', 'Tianrun Liang', 'Weigao Sun', 'Weixuan Sun', 'Weiyu Cheng', 'Wenkai Li', 'Xiangjun Song', 'Xiao Su', 'Xiaodong Han', 'Xinjie Zhang', 'Xinzhu Hou', 'Xu Min', 'Xun Zou', 'Xuyang Shen', 'Yan Gong', 'Yingjie Zhu', 'Yipeng Zhou', 'Yiran Zhong', 'Yongyi Hu', 'Yuanxiang Fan', 'Yue Yu', 'Yufeng Yang', 'Yuhao Li', 'Yunan Huang', 'Yunji Li', 'Yunpeng Huang', 'Yunzhi Xu', 'Yuxin Mao', 'Zehan Li', 'Zekang Li', 'Zewei Tao', 'Zewen Ying', 'Zhaoyang Cong', 'Zhen Qin', 'Zhenhua Fan', 'Zhihang Yu', 'Zhuo Jiang', 'Zijia Wu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08313.jpg', 'data': {'categories': ['#open_source', '#architecture', '#optimization', '#benchmark', '#long_context', '#training'], 'emoji': '🚀', 'ru': {'title': 'MiniMax-01: Революция в обработке длинных контекстов', 'desc': 'Исследователи представили серию моделей MiniMax-01, включая MiniMax-Text-01 и MiniMax-VL-01, которые сравнимы с лучшими моделями, но обладают улучшенными возможностями обработки длинных контекстов. В основе лежит технология lightning attention и ее эффективное масштабирование, интегрированные с Mixture of Experts (MoE). Модель имеет 32 эксперта и 456 миллиардов параметров, из которых 45,9 миллиардов активируются для каждого токена. Контекстное окно MiniMax-Text-01 может достигать 1 миллиона токенов при обучении и экстраполироваться до 4 миллионов токенов при инференсе.'}, 'en': {'title': 'Unleashing Long Contexts with MiniMax-01 Models', 'desc': 'The MiniMax-01 series introduces advanced models, MiniMax-Text-01 and MiniMax-VL-01, designed to handle longer contexts effectively. These models utilize lightning attention and a Mixture of Experts (MoE) architecture, featuring 32 experts and a staggering 456 billion parameters, optimizing the activation of 45.9 billion parameters per token. By implementing efficient parallel strategies and computation-communication overlap techniques, the models can train and infer on extensive datasets, reaching context windows of up to 1 million tokens during training and 4 million during inference. 
Performance evaluations indicate that MiniMax-01 models rival leading models like GPT-4o and Claude-3.5-Sonnet while significantly extending context capabilities.'}, 'zh': {'title': 'MiniMax-01:超长上下文处理的新纪元', 'desc': '我们介绍了MiniMax-01系列,包括MiniMax-Text-01和MiniMax-VL-01,这些模型在处理更长的上下文时具有优越的能力。核心技术是闪电注意力和高效的扩展能力。为了最大化计算能力,我们将其与专家混合模型(MoE)结合,创建了一个拥有32个专家和4560亿参数的模型。我们的实验表明,这些模型在标准和内部基准测试中表现出色,能够与最先进的模型相媲美,同时提供20到32倍更长的上下文窗口。'}}}, {'id': 'https://huggingface.co/papers/2501.08332', 'title': 'MangaNinja: Line Art Colorization with Precise Reference Following', 'url': 'https://huggingface.co/papers/2501.08332', 'abstract': 'Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, cross-character colorization, multi-reference harmonization, beyond the reach of existing algorithms.', 'score': 31, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '20ea6b75639e2ced', 'authors': ['Zhiheng Liu', 'Ka Leong Cheng', 'Xi Chen', 'Jie Xiao', 'Hao Ouyang', 'Kai Zhu', 'Yu Liu', 'Yujun Shen', 'Qifeng Chen', 'Ping Luo'], 'affiliations': ['Ant Group', 'HKU', 'HKUST', 'Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08332.jpg', 'data': {'categories': ['#cv', '#diffusion', '#benchmark'], 'emoji': '🎨', 'ru': {'title': 'Прецизионное раскрашивание манги с помощью ИИ', 'desc': 'MangaNinjia - это модель для раскрашивания линейных рисунков манги, основанная на диффузионных моделях. Она использует модуль перемешивания патчей для обучения соответствиям между цветным изображением-образцом и целевым линейным рисунком. Модель также включает схему точечного контроля для точного подбора цветов. Эксперименты показывают превосходство MangaNinjia над существующими решениями в точности раскрашивания.'}, 'en': {'title': 'MangaNinjia: Mastering Line Art Colorization with Precision', 'desc': 'MangaNinjia is a model designed for coloring line art by using reference images. It employs a patch shuffling module to help the model learn how to match colors from the reference image to the target line art accurately. Additionally, it features a point-driven control scheme that allows for detailed color adjustments, ensuring that colors are applied precisely. Our experiments show that MangaNinjia outperforms existing methods in colorization tasks, especially in complex scenarios involving multiple references and different characters.'}, 'zh': {'title': 'MangaNinjia:精准上色的新方法', 'desc': 'MangaNinjia 是一种基于扩散模型的参考引导线条艺术上色技术。我们设计了两个模块来确保角色细节的准确转录,包括补丁洗牌模块和点驱动控制方案,以实现精细的颜色匹配。实验结果表明,我们的模型在精确上色方面优于现有解决方案。我们还展示了所提议的交互式点控制在处理复杂案例和多参考协调方面的潜力,超越了现有算法的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.06751', 'title': 'Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models', 'url': 'https://huggingface.co/papers/2501.06751', 'abstract': "Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. 
Typically, these prompts are extended to a fixed length by adding padding tokens before text encoding. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model's output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model's architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.", 'score': 27, 'issue_id': 1677, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '05733e8e82e23568', 'authors': ['Michael Toker', 'Ido Galil', 'Hadas Orgad', 'Rinon Gal', 'Yoad Tewel', 'Gal Chechik', 'Yonatan Belinkov'], 'affiliations': ['Bar-Ilan University', 'NVIDIA', 'Technion Israel Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.06751.jpg', 'data': {'categories': ['#cv', '#architecture', '#interpretability', '#diffusion', '#training'], 'emoji': '🧩', 'ru': {'title': 'Раскрытие тайн токенов заполнения в генерации изображений', 'desc': 'Исследователи провели первый глубокий анализ роли токенов заполнения в моделях преобразования текста в изображение (T2I). Они разработали две причинно-следственные техники для изучения того, как информация кодируется в представлении токенов в различных компонентах конвейера T2I. Результаты показали три различных сценария влияния токенов заполнения на процесс генерации изображений. Исследование выявило ключевые взаимосвязи между этими сценариями и архитектурой модели, а также процессом ее обучения.'}, 'en': {'title': 'Unpacking Padding: The Hidden Role in Text-to-Image Models', 'desc': "This paper explores the impact of padding tokens in text-to-image (T2I) diffusion models, which are used to generate images from text prompts. The authors analyze how these padding tokens influence the image generation process at different stages, including text encoding and the diffusion process. They identify three scenarios where padding tokens can either affect the output or be ignored, depending on the model's architecture and training methods. The findings provide valuable insights that could guide future improvements in T2I model design and training practices."}, 'zh': {'title': '填充标记在图像生成中的关键作用', 'desc': '本文研究了文本到图像(T2I)扩散模型中填充标记的作用。填充标记通常用于将提示扩展到固定长度,但其对图像生成过程的影响尚未被深入探讨。我们开发了两种因果分析技术,探讨填充标记在T2I模型不同组件中的信息编码方式。研究结果表明,填充标记在文本编码、扩散过程中的影响各不相同,并与模型架构和训练过程存在重要关系。'}}}, {'id': 'https://huggingface.co/papers/2501.08316', 'title': 'Diffusion Adversarial Post-Training for One-Step Video Generation', 'url': 'https://huggingface.co/papers/2501.08316', 'abstract': 'The diffusion models are widely used for image and video generation, but their iterative generation process is slow and expansive. 
While existing distillation approaches have demonstrated the potential for one-step generation in the image domain, they still suffer from significant quality degradation. In this work, we propose Adversarial Post-Training (APT) against real data following diffusion pre-training for one-step video generation. To improve the training stability and quality, we introduce several improvements to the model architecture and training procedures, along with an approximated R1 regularization objective. Empirically, our experiments show that our adversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720, 24fps videos in real time using a single forward evaluation step. Additionally, our model is capable of generating 1024px images in a single step, achieving quality comparable to state-of-the-art methods.', 'score': 19, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '4122a780e8356ce7', 'authors': ['Shanchuan Lin', 'Xin Xia', 'Yuxi Ren', 'Ceyuan Yang', 'Xuefeng Xiao', 'Lu Jiang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.08316.jpg', 'data': {'categories': ['#architecture', '#optimization', '#video', '#diffusion', '#training'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: от итераций к мгновенному результату', 'desc': 'Эта статья представляет новый метод под названием Adversarial Post-Training (APT) для одношаговой генерации видео. Авторы предлагают улучшения архитектуры модели и процедур обучения, включая аппроксимированную регуляризацию R1. Их модель Seaweed-APT способна генерировать 2-секундные видео высокого разрешения в реальном времени за один проход. Кроме того, модель может создавать изображения размером 1024px за один шаг, достигая качества, сравнимого с современными методами.'}, 'en': {'title': 'Fast and High-Quality Video Generation with Seaweed-APT', 'desc': 'This paper addresses the slow and costly iterative process of generating images and videos using diffusion models. The authors introduce Adversarial Post-Training (APT) to enhance one-step video generation while maintaining high quality. They implement architectural and procedural improvements, including an approximated R1 regularization, to stabilize training. Their model, Seaweed-APT, successfully generates high-quality 2-second videos and 1024px images in real time with a single forward evaluation step.'}, 'zh': {'title': '对抗后训练:快速高质量视频生成的新方法', 'desc': '扩散模型广泛应用于图像和视频生成,但其迭代生成过程较慢且成本高昂。现有的蒸馏方法在图像领域展示了单步生成的潜力,但仍存在显著的质量下降。本文提出了一种针对真实数据的对抗后训练(APT)方法,以实现单步视频生成。我们的实验表明,经过对抗后训练的模型Seaweed-APT能够实时生成1280x720、24fps的2秒视频,并且在单步生成1024px图像时,其质量可与最先进的方法相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.08187', 'title': 'A Multi-Modal AI Copilot for Single-Cell Analysis with Instruction Following', 'url': 'https://huggingface.co/papers/2501.08187', 'abstract': 'Large language models excel at interpreting complex natural language instructions, enabling them to perform a wide range of tasks. In the life sciences, single-cell RNA sequencing (scRNA-seq) data serves as the "language of cellular biology", capturing intricate gene expression patterns at the single-cell level. However, interacting with this "language" through conventional tools is often inefficient and unintuitive, posing challenges for researchers. To address these limitations, we present InstructCell, a multi-modal AI copilot that leverages natural language as a medium for more direct and flexible single-cell analysis. 
We construct a comprehensive multi-modal instruction dataset that pairs text-based instructions with scRNA-seq profiles from diverse tissues and species. Building on this, we develop a multi-modal cell language architecture capable of simultaneously interpreting and processing both modalities. InstructCell empowers researchers to accomplish critical tasks-such as cell type annotation, conditional pseudo-cell generation, and drug sensitivity prediction-using straightforward natural language commands. Extensive evaluations demonstrate that InstructCell consistently meets or exceeds the performance of existing single-cell foundation models, while adapting to diverse experimental conditions. More importantly, InstructCell provides an accessible and intuitive tool for exploring complex single-cell data, lowering technical barriers and enabling deeper biological insights.', 'score': 18, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'de984ce7cc62fa5e', 'authors': ['Yin Fang', 'Xinle Deng', 'Kangwei Liu', 'Ningyu Zhang', 'Jingyang Qian', 'Penghui Yang', 'Xiaohui Fan', 'Huajun Chen'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University, Hangzhou 310027, China', 'College of Pharmaceutical Sciences, Zhejiang University, Hangzhou 310058, China', 'Future Health Laboratory, Innovation Center of Yangtze River Delta, Zhejiang University, Jiaxing 314100, China', 'Innovation Center in Zhejiang University, State Key Laboratory of Component-Based Chinese Medicine, Hangzhou 310058, China', 'School of Software Technology, Zhejiang University, Ningbo 315048, China', 'ZJU-Hangzhou Global Scientific and Technological Innovation Center, Hangzhou 311200, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08187.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#dataset', '#science', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'Естественный язык как ключ к расшифровке клеточной биологии', 'desc': 'InstructCell - это мультимодальный ИИ-помощник для анализа данных одноклеточного РНК-секвенирования (scRNA-seq). Он использует архитектуру, способную интерпретировать как естественный язык, так и профили экспрессии генов. InstructCell позволяет исследователям выполнять такие задачи, как аннотация типов клеток и предсказание чувствительности к лекарствам, с помощью простых текстовых команд. Модель демонстрирует высокую производительность и адаптивность к различным экспериментальным условиям.'}, 'en': {'title': 'InstructCell: Bridging Language and Biology for Seamless Single-Cell Analysis', 'desc': 'This paper introduces InstructCell, an AI tool designed to simplify the analysis of single-cell RNA sequencing (scRNA-seq) data using natural language instructions. By creating a dataset that links text commands with scRNA-seq profiles, InstructCell allows researchers to perform complex tasks like cell type annotation and drug sensitivity prediction more intuitively. The model employs a multi-modal architecture that processes both text and biological data simultaneously, enhancing its usability. 
Evaluations show that InstructCell outperforms existing models, making single-cell analysis more accessible and efficient for researchers in the life sciences.'}, 'zh': {'title': '用自然语言解锁单细胞数据的潜力', 'desc': '这篇论文介绍了InstructCell,一个多模态的人工智能助手,旨在通过自然语言简化单细胞RNA测序(scRNA-seq)数据的分析。传统工具在处理细胞生物学的复杂数据时效率低下,而InstructCell通过将文本指令与scRNA-seq数据结合,提供了更直接和灵活的分析方式。该系统能够执行细胞类型注释、条件伪细胞生成和药物敏感性预测等关键任务,且使用简单的自然语言命令即可完成。评估结果表明,InstructCell在性能上优于现有的单细胞基础模型,同时适应多种实验条件,降低了技术门槛,促进了生物学的深入理解。'}}}, {'id': 'https://huggingface.co/papers/2501.08225', 'title': 'FramePainter: Endowing Interactive Image Editing with Video Diffusion Priors', 'url': 'https://huggingface.co/papers/2501.08225', 'abstract': 'Interactive image editing allows users to modify images through visual interaction operations such as drawing, clicking, and dragging. Existing methods construct such supervision signals from videos, as they capture how objects change with various physical interactions. However, these models are usually built upon text-to-image diffusion models, and thus necessitate (i) massive training samples and (ii) an additional reference encoder to learn real-world dynamics and visual consistency. In this paper, we reformulate this task as an image-to-video generation problem, so that it inherits powerful video diffusion priors to reduce training costs and ensure temporal consistency. Specifically, we introduce FramePainter as an efficient instantiation of this formulation. Initialized with Stable Video Diffusion, it only uses a lightweight sparse control encoder to inject editing signals. Considering the limitations of temporal attention in handling large motion between two frames, we further propose matching attention to enlarge the receptive field while encouraging dense correspondence between edited and source image tokens. We highlight the effectiveness and efficiency of FramePainter across various editing signals: it dominantly outperforms previous state-of-the-art methods with far less training data, achieving highly seamless and coherent editing of images, e.g., automatically adjusting the reflection of the cup. Moreover, FramePainter also exhibits exceptional generalization in scenarios not present in real-world videos, e.g., transforming the clownfish into a shark-like shape. Our code will be available at https://github.com/YBYBZhang/FramePainter.', 'score': 12, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '811cfd0f18eb1e53', 'authors': ['Yabo Zhang', 'Xinpeng Zhou', 'Yihan Zeng', 'Hang Xu', 'Hui Li', 'Wangmeng Zuo'], 'affiliations': ['Harbin Institute of Technology', 'Huawei Noahs Ark Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08225.jpg', 'data': {'categories': ['#video', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'FramePainter: эффективное редактирование изображений через генерацию видео', 'desc': 'Статья представляет FramePainter - новый подход к интерактивному редактированию изображений, основанный на генерации видео. В отличие от существующих методов, использующих модели диффузии текст-изображение, FramePainter опирается на мощные видео-диффузионные модели для обеспечения временной согласованности и снижения затрат на обучение. Метод использует легковесный энкодер для внедрения сигналов редактирования и вводит механизм согласованного внимания для улучшения обработки крупных движений между кадрами. 
FramePainter превосходит современные методы, требуя значительно меньше обучающих данных и демонстрируя высокую обобщающую способность.'}, 'en': {'title': 'Revolutionizing Image Editing with Efficient Video Diffusion', 'desc': 'This paper presents FramePainter, a novel approach to interactive image editing that reformulates the task as image-to-video generation. By leveraging video diffusion models, FramePainter reduces the need for extensive training data while ensuring temporal consistency in edited images. It utilizes a lightweight sparse control encoder to effectively incorporate editing signals, and introduces matching attention to improve the handling of large motion between frames. The results demonstrate that FramePainter significantly outperforms existing methods, achieving seamless image edits and showcasing strong generalization capabilities.'}, 'zh': {'title': 'FramePainter:高效的图像编辑新方法', 'desc': '本文提出了一种交互式图像编辑的新方法,称为FramePainter。该方法将图像编辑任务重新定义为图像到视频的生成问题,从而利用强大的视频扩散先验,降低训练成本并确保时间一致性。FramePainter使用轻量级的稀疏控制编码器来注入编辑信号,并通过匹配注意力机制增强了对大运动的处理能力。实验结果表明,FramePainter在各种编辑信号下表现优异,能够实现无缝且连贯的图像编辑,且在未见过的场景中也展现出卓越的泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08326', 'title': 'Omni-RGPT: Unifying Image and Video Region-level Understanding via Token Marks', 'url': 'https://huggingface.co/papers/2501.08326', 'abstract': 'We present Omni-RGPT, a multimodal large language model designed to facilitate region-level comprehension for both images and videos. To achieve consistent region representation across spatio-temporal dimensions, we introduce Token Mark, a set of tokens highlighting the target regions within the visual feature space. These tokens are directly embedded into spatial regions using region prompts (e.g., boxes or masks) and simultaneously incorporated into the text prompt to specify the target, establishing a direct connection between visual and text tokens. To further support robust video understanding without requiring tracklets, we introduce an auxiliary task that guides Token Mark by leveraging the consistency of the tokens, enabling stable region interpretation across the video. Additionally, we introduce a large-scale region-level video instruction dataset (RegVID-300k). Omni-RGPT achieves state-of-the-art results on image and video-based commonsense reasoning benchmarks while showing strong performance in captioning and referring expression comprehension tasks.', 'score': 11, 'issue_id': 1678, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '463580cacfaa6789', 'authors': ['Miran Heo', 'Min-Hung Chen', 'De-An Huang', 'Sifei Liu', 'Subhashree Radhakrishnan', 'Seon Joo Kim', 'Yu-Chiang Frank Wang', 'Ryo Hachiuma'], 'affiliations': ['NVIDIA', 'Yonsei University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08326.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agi', '#cv', '#dataset', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Omni-RGPT: Новый уровень понимания изображений и видео искусственным интеллектом', 'desc': 'Omni-RGPT - это мультимодальная большая языковая модель, разработанная для понимания изображений и видео на уровне регионов. Модель использует технологию Token Mark для выделения целевых регионов в визуальном пространстве признаков. Для улучшения понимания видео без необходимости трекинга объектов введена вспомогательная задача, использующая согласованность токенов. 
Авторы также представили большой набор данных RegVID-300k для обучения на видео с инструкциями на уровне регионов.'}, 'en': {'title': 'Omni-RGPT: Bridging Visual and Textual Understanding with Token Mark', 'desc': 'Omni-RGPT is a multimodal large language model that enhances understanding of specific regions in images and videos. It uses a novel approach called Token Mark, which embeds tokens into visual features to highlight target areas, linking them with text prompts. This model also includes an auxiliary task that ensures consistent token representation across video frames, improving video comprehension. With the introduction of the RegVID-300k dataset, Omni-RGPT sets new benchmarks in commonsense reasoning, captioning, and referring expression tasks.'}, 'zh': {'title': 'Omni-RGPT:图像与视频的区域理解新突破', 'desc': '本文介绍了Omni-RGPT,这是一种多模态的大型语言模型,旨在促进图像和视频的区域级理解。为了在时空维度上实现一致的区域表示,我们引入了Token Mark,这是一组突出视觉特征空间中目标区域的标记。通过使用区域提示(如框或掩码),这些标记被直接嵌入到空间区域中,并同时与文本提示结合,以指定目标,从而建立视觉和文本标记之间的直接联系。此外,我们还引入了一个辅助任务,通过利用标记的一致性来指导Token Mark,从而支持稳健的视频理解。'}}}, {'id': 'https://huggingface.co/papers/2501.07730', 'title': 'Democratizing Text-to-Image Masked Generative Models with Compact Text-Aware One-Dimensional Tokens', 'url': 'https://huggingface.co/papers/2501.07730', 'abstract': 'Image tokenizers form the foundation of modern text-to-image generative models but are notoriously difficult to train. Furthermore, most existing text-to-image models rely on large-scale, high-quality private datasets, making them challenging to replicate. In this work, we introduce Text-Aware Transformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful image tokenizer that can utilize either discrete or continuous 1-dimensional tokens. TA-TiTok uniquely integrates textual information during the tokenizer decoding stage (i.e., de-tokenization), accelerating convergence and enhancing performance. TA-TiTok also benefits from a simplified, yet effective, one-stage training process, eliminating the need for the complex two-stage distillation used in previous 1-dimensional tokenizers. This design allows for seamless scalability to large datasets. Building on this, we introduce a family of text-to-image Masked Generative Models (MaskGen), trained exclusively on open data while achieving comparable performance to models trained on private data. We aim to release both the efficient, strong TA-TiTok tokenizers and the open-data, open-weight MaskGen models to promote broader access and democratize the field of text-to-image masked generative models.', 'score': 10, 'issue_id': 1673, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '80f40715084c602b', 'authors': ['Dongwon Kim', 'Ju He', 'Qihang Yu', 'Chenglin Yang', 'Xiaohui Shen', 'Suha Kwak', 'Liang-Chieh Chen'], 'affiliations': ['ByteDance Seed', 'POSTECH'], 'pdf_title_img': 'assets/pdf/title_img/2501.07730.jpg', 'data': {'categories': ['#dataset', '#data', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'Демократизация генерации изображений с помощью эффективной токенизации и открытых данных', 'desc': 'В этой статье представлен новый подход к токенизации изображений для генеративных моделей текст-в-изображение под названием TA-TiTok. Данный токенизатор использует одномерные токены и интегрирует текстовую информацию на этапе детокенизации, что ускоряет сходимость и улучшает производительность. На основе TA-TiTok авторы разработали семейство моделей MaskGen, обученных исключительно на открытых данных. 
Целью работы является демократизация области генеративных моделей текст-в-изображение путем публикации эффективных токенизаторов и моделей с открытыми весами.'}, 'en': {'title': 'Democratizing Text-to-Image Generation with TA-TiTok', 'desc': 'This paper presents TA-TiTok, a novel image tokenizer designed for text-to-image generative models, which simplifies the training process and improves performance. Unlike traditional models that require large private datasets, TA-TiTok can effectively utilize open data, making it more accessible for researchers. The tokenizer incorporates textual information during the decoding stage, which helps it learn faster and perform better. Additionally, the authors introduce MaskGen, a family of generative models that leverage TA-TiTok and are trained on publicly available datasets, aiming to democratize access to advanced text-to-image generation technology.'}, 'zh': {'title': '高效的文本到图像生成模型,推动开放数据的使用', 'desc': '本文介绍了一种新的图像标记器,称为TA-TiTok,它可以有效地处理文本到图像的生成任务。TA-TiTok在解码阶段整合了文本信息,从而加快了模型的收敛速度并提高了性能。与以往的标记器不同,TA-TiTok采用了一种简化的一阶段训练过程,避免了复杂的两阶段蒸馏过程。我们还提出了一系列基于开放数据训练的文本到图像生成模型MaskGen,旨在促进更广泛的访问和民主化。'}}}, {'id': 'https://huggingface.co/papers/2501.05131', 'title': '3DIS-FLUX: simple and efficient multi-instance generation with DiT rendering', 'url': 'https://huggingface.co/papers/2501.05131', 'abstract': "The growing demand for controllable outputs in text-to-image generation has driven significant advancements in multi-instance generation (MIG), enabling users to define both instance layouts and attributes. Currently, the state-of-the-art methods in MIG are primarily adapter-based. However, these methods necessitate retraining a new adapter each time a more advanced model is released, resulting in significant resource consumption. A methodology named Depth-Driven Decoupled Instance Synthesis (3DIS) has been introduced, which decouples MIG into two distinct phases: 1) depth-based scene construction and 2) detail rendering with widely pre-trained depth control models. The 3DIS method requires adapter training solely during the scene construction phase, while enabling various models to perform training-free detail rendering. Initially, 3DIS focused on rendering techniques utilizing U-Net architectures such as SD1.5, SD2, and SDXL, without exploring the potential of recent DiT-based models like FLUX. In this paper, we present 3DIS-FLUX, an extension of the 3DIS framework that integrates the FLUX model for enhanced rendering capabilities. Specifically, we employ the FLUX.1-Depth-dev model for depth map controlled image generation and introduce a detail renderer that manipulates the Attention Mask in FLUX's Joint Attention mechanism based on layout information. This approach allows for the precise rendering of fine-grained attributes of each instance. Our experimental results indicate that 3DIS-FLUX, leveraging the FLUX model, outperforms the original 3DIS method, which utilized SD2 and SDXL, and surpasses current state-of-the-art adapter-based methods in terms of both performance and image quality. 
Project Page: https://limuloo.github.io/3DIS/.", 'score': 9, 'issue_id': 1684, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ca5ad23cb146f3aa', 'authors': ['Dewei Zhou', 'Ji Xie', 'Zongxin Yang', 'Yi Yang'], 'affiliations': ['DBMI, HMS, Harvard University', 'RELER, CCAI, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05131.jpg', 'data': {'categories': ['#cv', '#games', '#architecture', '#multimodal', '#optimization'], 'emoji': '🎨', 'ru': {'title': '3DIS-FLUX: Новый уровень контролируемой генерации мульти-объектных изображений', 'desc': 'Статья представляет метод 3DIS-FLUX для управляемой генерации изображений с несколькими объектами. Этот подход разделяет процесс на создание сцены на основе глубины и детализированный рендеринг с использованием предобученных моделей контроля глубины. 3DIS-FLUX интегрирует модель FLUX для улучшенного рендеринга, манипулируя маской внимания в механизме совместного внимания FLUX. Эксперименты показывают, что 3DIS-FLUX превосходит предыдущие методы по производительности и качеству изображений.'}, 'en': {'title': 'Enhancing Text-to-Image Generation with 3DIS-FLUX', 'desc': 'This paper introduces a new method called 3DIS-FLUX for improving text-to-image generation by enhancing the multi-instance generation (MIG) process. The 3DIS framework separates the generation into two phases: constructing the scene based on depth and rendering details using pre-trained models. By integrating the FLUX model, the method allows for better control over the rendering of fine details while reducing the need for retraining adapters. Experimental results show that 3DIS-FLUX outperforms previous methods in both performance and image quality, making it a significant advancement in controllable image generation.'}, 'zh': {'title': '深度驱动解耦实例合成:提升图像生成的可控性与质量', 'desc': '随着对可控文本到图像生成输出的需求增加,多实例生成(MIG)技术得到了显著进展。现有的MIG方法主要基于适配器,但每次新模型发布时都需要重新训练适配器,消耗大量资源。本文提出了一种名为深度驱动解耦实例合成(3DIS)的方法,将MIG分为两个阶段:基于深度的场景构建和细节渲染。通过引入FLUX模型,3DIS-FLUX在细节渲染方面实现了更高的性能和图像质量。'}}}, {'id': 'https://huggingface.co/papers/2501.08328', 'title': 'PokerBench: Training Large Language Models to become Professional Poker Players', 'url': 'https://huggingface.co/papers/2501.08328', 'abstract': 'We introduce PokerBench - a benchmark for evaluating the poker-playing abilities of large language models (LLMs). As LLMs excel in traditional NLP tasks, their application to complex, strategic games like poker poses a new challenge. Poker, an incomplete information game, demands a multitude of skills such as mathematics, reasoning, planning, strategy, and a deep understanding of game theory and human psychology. This makes Poker the ideal next frontier for large language models. PokerBench consists of a comprehensive compilation of 11,000 most important scenarios, split between pre-flop and post-flop play, developed in collaboration with trained poker players. We evaluate prominent models including GPT-4, ChatGPT 3.5, and various Llama and Gemma series models, finding that all state-of-the-art LLMs underperform in playing optimal poker. However, after fine-tuning, these models show marked improvements. We validate PokerBench by having models with different scores compete with each other, demonstrating that higher scores on PokerBench lead to higher win rates in actual poker games. 
Through gameplay between our fine-tuned model and GPT-4, we also identify limitations of simple supervised fine-tuning for learning optimal playing strategy, suggesting the need for more advanced methodologies for effectively training language models to excel in games. PokerBench thus presents a unique benchmark for a quick and reliable evaluation of the poker-playing ability of LLMs as well as a comprehensive benchmark to study the progress of LLMs in complex game-playing scenarios. The dataset and code will be made available at: https://github.com/pokerllm/pokerbench.', 'score': 9, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '7b4dacedffdbfa15', 'authors': ['Richard Zhuang', 'Akshat Gupta', 'Richard Yang', 'Aniket Rahane', 'Zhengyu Li', 'Gopala Anumanchipalli'], 'affiliations': ['Georgia Institute of Technology', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.08328.jpg', 'data': {'categories': ['#training', '#reasoning', '#games', '#optimization', '#benchmark'], 'emoji': '🃏', 'ru': {'title': 'PokerBench: новый рубеж для оценки стратегических способностей языковых моделей', 'desc': 'PokerBench - это новый бенчмарк для оценки способностей больших языковых моделей (LLM) играть в покер. Он включает 11000 важнейших сценариев игры, разработанных совместно с профессиональными игроками. Авторы оценили производительность современных LLM, таких как GPT-4 и ChatGPT 3.5, обнаружив, что все модели показывают результаты ниже оптимальных. После дообучения модели демонстрируют значительное улучшение, но авторы отмечают ограничения простого обучения с учителем для освоения оптимальной стратегии игры.'}, 'en': {'title': 'PokerBench: Elevating LLMs to Master the Game of Poker', 'desc': 'PokerBench is a new benchmark designed to assess the poker-playing skills of large language models (LLMs). It focuses on the unique challenges of poker, which requires a blend of mathematical skills, strategic reasoning, and an understanding of human psychology. The benchmark includes 11,000 scenarios that cover various aspects of the game, and it has been tested on several leading models, revealing that they initially struggle with optimal poker play. However, after fine-tuning, these models show significant improvement, highlighting the need for advanced training techniques to enhance their performance in complex games.'}, 'zh': {'title': 'PokerBench:评估语言模型扑克能力的新基准', 'desc': '我们介绍了PokerBench,这是一个用于评估大型语言模型(LLMs)扑克游戏能力的基准。扑克是一种不完全信息游戏,需要数学、推理、规划、策略以及对博弈论和人类心理的深刻理解。PokerBench包含11,000个重要场景,分为翻牌前和翻牌后游戏,经过训练的扑克玩家共同开发。通过对不同模型的评估,我们发现尽管当前的LLMs在扑克游戏中表现不佳,但经过微调后,它们的表现有显著提升。'}}}, {'id': 'https://huggingface.co/papers/2501.08319', 'title': 'Enhancing Automated Interpretability with Output-Centric Feature Descriptions', 'url': 'https://huggingface.co/papers/2501.08319', 'abstract': 'Automated interpretability pipelines generate natural language descriptions for the concepts represented by features in large language models (LLMs), such as plants or the first word in a sentence. These descriptions are derived using inputs that activate the feature, which may be a dimension or a direction in the model\'s representation space. However, identifying activating inputs is costly, and the mechanistic role of a feature in model behavior is determined both by how inputs cause a feature to activate and by how feature activation affects outputs. 
Using steering evaluations, we reveal that current pipelines provide descriptions that fail to capture the causal effect of the feature on outputs. To fix this, we propose efficient, output-centric methods for automatically generating feature descriptions. These methods use the tokens weighted higher after feature stimulation or the highest weight tokens after applying the vocabulary "unembedding" head directly to the feature. Our output-centric descriptions better capture the causal effect of a feature on model outputs than input-centric descriptions, but combining the two leads to the best performance on both input and output evaluations. Lastly, we show that output-centric descriptions can be used to find inputs that activate features previously thought to be "dead".', 'score': 7, 'issue_id': 1677, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '22615e3bb16f93af', 'authors': ['Yoav Gur-Arieh', 'Roy Mayan', 'Chen Agassy', 'Atticus Geiger', 'Mor Geva'], 'affiliations': ['Blavatnik School of Computer Science and AI, Tel Aviv University', 'Pr(Ai)2R Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.08319.jpg', 'data': {'categories': ['#interpretability', '#inference', '#training', '#data'], 'emoji': '🔍', 'ru': {'title': 'Взгляд изнутри: новый метод интерпретации больших языковых моделей', 'desc': 'Статья описывает новый подход к автоматической интерпретации нейронных сетей, фокусируясь на выходных данных модели вместо входных. Авторы предлагают эффективные методы для генерации описаний признаков, основанные на токенах с наибольшим весом после стимуляции признака. Эксперименты показывают, что ориентированные на выход описания лучше отражают причинно-следственное влияние признака на результаты модели. Комбинация подходов, ориентированных на вход и выход, дает наилучшие результаты в оценке как входных, так и выходных данных.'}, 'en': {'title': 'Unlocking Feature Interpretability in Language Models', 'desc': 'This paper discusses how automated interpretability pipelines can create natural language descriptions for features in large language models (LLMs). It highlights the challenge of identifying inputs that activate these features, which is essential for understanding their role in model behavior. The authors propose new methods that focus on the output effects of features, leading to more accurate descriptions of their causal impact. By combining both input-centric and output-centric approaches, the proposed methods improve the overall interpretability of LLMs and can even identify previously overlooked features.'}, 'zh': {'title': '以输出为中心的特征描述生成方法', 'desc': '这篇论文讨论了自动化可解释性管道如何为大型语言模型中的特征生成自然语言描述。特征的描述是通过激活特征的输入生成的,但识别这些输入的过程成本高昂。研究表明,现有的描述方法未能有效捕捉特征对输出的因果影响。为此,作者提出了一种以输出为中心的方法,能够更好地生成特征描述,并结合输入和输出的评估来提高性能。'}}}, {'id': 'https://huggingface.co/papers/2501.08197', 'title': 'OpenCSG Chinese Corpus: A Series of High-quality Chinese Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08197', 'abstract': 'Large language models (LLMs) have demonstrated remarkable capabilities, but their success heavily relies on the quality of pretraining corpora. For Chinese LLMs, the scarcity of high-quality Chinese datasets presents a significant challenge, often limiting their performance. To address this issue, we propose the OpenCSG Chinese Corpus, a series of high-quality datasets specifically designed for LLM pretraining, post-training, and fine-tuning. 
This corpus includes Fineweb-edu-chinese, Fineweb-edu-chinese-v2, Cosmopedia-chinese, and Smoltalk-chinese, each with distinct characteristics: Fineweb-edu datasets focus on filtered, high-quality content derived from diverse Chinese web sources; Cosmopedia-chinese provides synthetic, textbook-style data for knowledge-intensive training; and Smoltalk-chinese emphasizes stylistic and diverse chat-format data. The OpenCSG Chinese Corpus is characterized by its high-quality text, diverse coverage across domains, and scalable, reproducible data curation processes. Additionally, we conducted extensive experimental analyses, including evaluations on smaller parameter models, which demonstrated significant performance improvements in tasks such as C-Eval, showcasing the effectiveness of the corpus for training Chinese LLMs.', 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '27267ae1a569051c', 'authors': ['Yijiong Yu', 'Ziyun Dai', 'Zekun Wang', 'Wei Wang', 'Ran Chen', 'Ji Pei'], 'affiliations': ['OpenCSG', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08197.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#synthetic', '#training', '#low_resource'], 'emoji': '🐉', 'ru': {'title': 'Прорыв в обучении китайских языковых моделей: OpenCSG Chinese Corpus', 'desc': 'Эта статья представляет OpenCSG Chinese Corpus - набор высококачественных китайских датасетов для предобучения, пост-обучения и тонкой настройки больших языковых моделей (LLM). Корпус включает в себя несколько датасетов, каждый с уникальными характеристиками: от отфильтрованного веб-контента до синтетических учебных данных и разговорных форматов. Авторы подчеркивают высокое качество текста, разнообразие тематик и масштабируемость процесса сбора данных. Эксперименты показали значительное улучшение производительности моделей на различных задачах, включая C-Eval.'}, 'en': {'title': 'Empowering Chinese LLMs with OpenCSG Corpus', 'desc': 'This paper introduces the OpenCSG Chinese Corpus, a collection of high-quality datasets aimed at improving the performance of Chinese large language models (LLMs). The corpus includes several datasets, each tailored for different training needs: Fineweb-edu datasets focus on high-quality web content, Cosmopedia-chinese offers synthetic textbook-style data, and Smoltalk-chinese provides diverse chat-format data. The authors highlight the importance of quality pretraining data for LLMs and demonstrate through experiments that using this corpus leads to significant performance gains in various evaluation tasks. Overall, the OpenCSG Chinese Corpus addresses the challenge of limited high-quality datasets for Chinese LLMs, promoting better training outcomes.'}, 'zh': {'title': '提升中文LLM性能的高质量语料库', 'desc': '大型语言模型(LLMs)在处理自然语言方面表现出色,但其成功依赖于高质量的预训练语料库。针对中文LLMs,优质中文数据集的稀缺性成为了一个重大挑战,限制了它们的性能。为了解决这个问题,我们提出了OpenCSG中文语料库,这是一系列专门为LLM预训练、后训练和微调设计的高质量数据集。该语料库包括Fineweb-edu-chinese、Fineweb-edu-chinese-v2、Cosmopedia-chinese和Smoltalk-chinese,涵盖了多样化的内容和风格,显著提升了中文LLMs的训练效果。'}}}, {'id': 'https://huggingface.co/papers/2501.08167', 'title': 'Potential and Perils of Large Language Models as Judges of Unstructured Textual Data', 'url': 'https://huggingface.co/papers/2501.08167', 'abstract': "Rapid advancements in large language models have unlocked remarkable capabilities when it comes to processing and summarizing unstructured text data. 
This has implications for the analysis of rich, open-ended datasets, such as survey responses, where LLMs hold the promise of efficiently distilling key themes and sentiments. However, as organizations increasingly turn to these powerful AI systems to make sense of textual feedback, a critical question arises, can we trust LLMs to accurately represent the perspectives contained within these text based datasets? While LLMs excel at generating human-like summaries, there is a risk that their outputs may inadvertently diverge from the true substance of the original responses. Discrepancies between the LLM-generated outputs and the actual themes present in the data could lead to flawed decision-making, with far-reaching consequences for organizations. This research investigates the effectiveness of LLMs as judge models to evaluate the thematic alignment of summaries generated by other LLMs. We utilized an Anthropic Claude model to generate thematic summaries from open-ended survey responses, with Amazon's Titan Express, Nova Pro, and Meta's Llama serving as LLM judges. The LLM-as-judge approach was compared to human evaluations using Cohen's kappa, Spearman's rho, and Krippendorff's alpha, validating a scalable alternative to traditional human centric evaluation methods. Our findings reveal that while LLMs as judges offer a scalable solution comparable to human raters, humans may still excel at detecting subtle, context-specific nuances. This research contributes to the growing body of knowledge on AI assisted text analysis. We discuss limitations and provide recommendations for future research, emphasizing the need for careful consideration when generalizing LLM judge models across various contexts and use cases.", 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '866161709624c632', 'authors': ['Rewina Bedemariam', 'Natalie Perez', 'Sreyoshi Bhaduri', 'Satya Kapoor', 'Alex Gil', 'Elizabeth Conjar', 'Ikkei Itoku', 'David Theil', 'Aman Chadha', 'Naumaan Nayyar'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08167.jpg', 'data': {'categories': ['#data', '#dataset', '#science', '#ethics', '#multimodal', '#benchmark', '#interpretability'], 'emoji': '🤖', 'ru': {'title': 'LLM как судьи: масштабируемая альтернатива человеческим оценкам в анализе текста', 'desc': 'Исследование посвящено использованию больших языковых моделей (LLM) для анализа неструктурированных текстовых данных, таких как ответы на опросы. Авторы изучают эффективность применения LLM в качестве судей для оценки тематического соответствия сгенерированных другими LLM резюме. Результаты показывают, что LLM-судьи предлагают масштабируемое решение, сопоставимое с оценками людей, хотя люди все еще могут превосходить их в обнаружении тонких, контекстно-зависимых нюансов. Исследование вносит вклад в растущий объем знаний об анализе текста с помощью искусственного интеллекта.'}, 'en': {'title': 'Trusting AI: Evaluating LLMs for Accurate Text Analysis', 'desc': 'This paper explores the use of large language models (LLMs) for summarizing and analyzing unstructured text data, particularly from open-ended survey responses. It raises concerns about the trustworthiness of LLM-generated summaries, as they may not accurately reflect the original sentiments and themes present in the data. 
The research introduces an LLM-as-judge framework, where one LLM generates summaries while others evaluate their thematic alignment, comparing this method to human evaluations. The findings suggest that while LLMs can provide a scalable alternative to human raters, they may struggle with detecting subtle nuances that humans can identify, highlighting the importance of careful application in different contexts.'}, 'zh': {'title': '信任大型语言模型的总结能力吗?', 'desc': '这篇论文探讨了大型语言模型(LLMs)在处理和总结非结构化文本数据方面的能力,尤其是在分析开放式调查反馈时的应用。研究表明,虽然LLMs能够生成类似人类的总结,但它们的输出可能与原始文本的真实主题存在偏差,这可能导致错误的决策。为了评估LLMs生成的总结与实际主题的一致性,研究使用了LLMs作为评判模型,并与人类评估进行了比较。结果显示,LLMs作为评判者提供了一种可扩展的解决方案,但人类在捕捉细微的上下文特征方面仍然表现更佳。'}}}, {'id': 'https://huggingface.co/papers/2501.07888', 'title': 'Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding', 'url': 'https://huggingface.co/papers/2501.07888', 'abstract': 'We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) designed for generating detailed and accurate video descriptions, while also exhibiting superior general video understanding capabilities. Tarsier2 achieves significant advancements through three key upgrades: (1) Scaling pre-training data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) Performing fine-grained temporal alignment during supervised fine-tuning; (3) Using model-based sampling to automatically construct preference data and applying DPO training for optimization. Extensive experiments show that Tarsier2-7B consistently outperforms leading proprietary models, including GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K benchmark, Tarsier2-7B improves F1 by 2.8\\% over GPT-4o and 5.8\\% over Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6\\% performance advantage over GPT-4o and +24.9\\% over Gemini-1.5-Pro. Tarsier2-7B also sets new state-of-the-art results across 15 public benchmarks, spanning tasks such as video question-answering, video grounding, hallucination test, and embodied question-answering, demonstrating its versatility as a robust generalist vision-language model.', 'score': 5, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '54780a4b6f93fb10', 'authors': ['Liping Yuan', 'Jiawei Wang', 'Haomiao Sun', 'Yuchen Zhang', 'Yuan Lin'], 'affiliations': ['ByteDance Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.07888.jpg', 'data': {'categories': ['#dataset', '#training', '#cv', '#hallucinations', '#optimization', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Tarsier2: Революция в понимании видео искусственным интеллектом', 'desc': 'Tarsier2 - это современная крупномасштабная модель для понимания видео и языка (LVLM), разработанная для создания детальных и точных описаний видео. Модель достигает значительных улучшений благодаря увеличению объема обучающих данных, точной временной синхронизации при тонкой настройке и применению обучения с предпочтениями (DPO). Tarsier2-7B превосходит ведущие проприетарные модели, такие как GPT-4o и Gemini 1.5 Pro, в задачах детального описания видео. 
Модель также устанавливает новые рекорды в 15 публичных бенчмарках, демонстрируя свою универсальность как надежная модель общего назначения для понимания видео и языка.'}, 'en': {'title': 'Tarsier2: Redefining Video Understanding with Advanced LVLM Technology', 'desc': "Tarsier2 is a cutting-edge large vision-language model (LVLM) that excels in generating precise and detailed descriptions of videos while showcasing advanced video comprehension skills. The model's improvements stem from three main enhancements: increasing the pre-training dataset from 11 million to 40 million video-text pairs, implementing fine-grained temporal alignment during fine-tuning, and utilizing model-based sampling for preference data construction with DPO training for optimization. Extensive testing reveals that Tarsier2-7B surpasses top proprietary models like GPT-4o and Gemini 1.5 Pro in video description tasks, achieving notable F1 score improvements on the DREAM-1K benchmark. Additionally, Tarsier2-7B sets new records across 15 public benchmarks, proving its effectiveness in various tasks such as video question-answering and video grounding."}, 'zh': {'title': 'Tarsier2:视频描述的新标杆', 'desc': 'Tarsier2是一种先进的大型视觉语言模型,专门用于生成详细且准确的视频描述,同时具备出色的视频理解能力。该模型通过三个关键升级实现了显著进步:首先,预训练数据从1100万对视频文本扩展到4000万对,增加了数据的数量和多样性;其次,在监督微调过程中进行精细的时间对齐;最后,采用基于模型的采样自动构建偏好数据,并应用DPO训练进行优化。实验结果表明,Tarsier2-7B在视频描述任务中持续超越领先的专有模型,展现出其作为强大通用视觉语言模型的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.08292', 'title': 'HALoGEN: Fantastic LLM Hallucinations and Where to Find Them', 'url': 'https://huggingface.co/papers/2501.08292', 'abstract': 'Despite their impressive ability to generate high-quality and fluent text, generative large language models (LLMs) also produce hallucinations: statements that are misaligned with established world knowledge or provided input context. However, measuring hallucination can be challenging, as having humans verify model generations on-the-fly is both expensive and time-consuming. In this work, we release HALoGEN, a comprehensive hallucination benchmark consisting of: (1) 10,923 prompts for generative models spanning nine domains including programming, scientific attribution, and summarization, and (2) automatic high-precision verifiers for each use case that decompose LLM generations into atomic units, and verify each unit against a high-quality knowledge source. We use this framework to evaluate ~150,000 generations from 14 language models, finding that even the best-performing models are riddled with hallucinations (sometimes up to 86% of generated atomic facts depending on the domain). We further define a novel error classification for LLM hallucinations based on whether they likely stem from incorrect recollection of training data (Type A errors), or incorrect knowledge in training data (Type B errors), or are fabrication (Type C errors). 
We hope our framework provides a foundation to enable the principled study of why generative models hallucinate, and advances the development of trustworthy large language models.', 'score': 5, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f6751d682ff824ed', 'authors': ['Abhilasha Ravichander', 'Shrusti Ghela', 'David Wadden', 'Yejin Choi'], 'affiliations': ['Google', 'NVIDIA', 'University of Washington'], 'pdf_title_img': 'assets/pdf/title_img/2501.08292.jpg', 'data': {'categories': ['#dataset', '#hallucinations', '#benchmark'], 'emoji': '🔍', 'ru': {'title': 'HALoGEN: Автоматическая проверка галлюцинаций в языковых моделях', 'desc': 'Эта статья представляет HALoGEN - комплексный инструмент для оценки галлюцинаций в больших языковых моделях (LLM). Авторы создали набор из 10,923 промптов в девяти различных областях и автоматические верификаторы высокой точности для проверки генераций LLM. Исследование выявило, что даже лучшие модели страдают от галлюцинаций, иногда до 86% сгенерированных фактов оказываются неверными. Авторы также предложили новую классификацию ошибок LLM, разделив их на три типа в зависимости от источника галлюцинаций.'}, 'en': {'title': 'HALoGEN: A Benchmark for Measuring Hallucinations in Language Models', 'desc': 'This paper introduces HALoGEN, a new benchmark designed to measure hallucinations in generative large language models (LLMs). Hallucinations refer to incorrect statements generated by these models that do not align with known facts or the given context. The benchmark includes over 10,000 prompts across various domains and employs automatic verifiers to assess the accuracy of model outputs. The study reveals that even top-performing models exhibit significant hallucinations, prompting a classification system for different types of errors to better understand their origins and improve model reliability.'}, 'zh': {'title': '揭示生成模型的幻觉问题', 'desc': '尽管生成性大型语言模型(LLMs)能够生成高质量和流畅的文本,但它们也会产生幻觉,即与已知世界知识或输入上下文不一致的陈述。测量幻觉的难度在于,实时验证模型生成的内容既昂贵又耗时。为此,我们推出了HALoGEN,这是一个全面的幻觉基准,包含10,923个跨越九个领域的提示和自动高精度验证器。我们的研究发现,即使是表现最好的模型,其生成的原子事实中也有高达86%可能存在幻觉,这为理解生成模型的幻觉提供了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.08284', 'title': 'AfriHate: A Multilingual Collection of Hate Speech and Abusive Language Datasets for African Languages', 'url': 'https://huggingface.co/papers/2501.08284', 'abstract': 'Hate speech and abusive language are global phenomena that need socio-cultural background knowledge to be understood, identified, and moderated. However, in many regions of the Global South, there have been several documented occurrences of (1) absence of moderation and (2) censorship due to the reliance on keyword spotting out of context. Further, high-profile individuals have frequently been at the center of the moderation process, while large and targeted hate speech campaigns against minorities have been overlooked. These limitations are mainly due to the lack of high-quality data in the local languages and the failure to include local communities in the collection, annotation, and moderation processes. To address this issue, we present AfriHate: a multilingual collection of hate speech and abusive language datasets in 15 African languages. Each instance in AfriHate is annotated by native speakers familiar with the local culture. We report the challenges related to the construction of the datasets and present various classification baseline results with and without using LLMs. 
The datasets, individual annotations, and hate speech and offensive language lexicons are available on https://github.com/AfriHate/AfriHate', 'score': 3, 'issue_id': 1676, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '8c76dd102740009c', 'authors': ['Shamsuddeen Hassan Muhammad', 'Idris Abdulmumin', 'Abinew Ali Ayele', 'David Ifeoluwa Adelani', 'Ibrahim Said Ahmad', 'Saminu Mohammad Aliyu', 'Nelson Odhiambo Onyango', 'Lilian D. A. Wanzare', 'Samuel Rutunda', 'Lukman Jibril Aliyu', 'Esubalew Alemneh', 'Oumaima Hourrane', 'Hagos Tesfahun Gebremichael', 'Elyas Abdi Ismail', 'Meriem Beloucif', 'Ebrahim Chekol Jibril', 'Andiswa Bukula', 'Rooweither Mabuya', 'Salomey Osei', 'Abigail Oppong', 'Tadesse Destaw Belay', 'Tadesse Kebede Guge', 'Tesfa Tegegne Asfaw', 'Chiamaka Ijeoma Chukwuneke', 'Paul Röttger', 'Seid Muhie Yimam', 'Nedjma Ousidhoum'], 'affiliations': ['Addis Ababa University', 'Al Akhawayn University', 'Bahir Dar University', 'Bayero University Kano', 'Bocconi University', 'Cardiff University', 'DSFSI, University of Pretoria', 'Digital Umuganda', 'Haramaya University', 'HausaNLP', 'Imperial College London', 'Independent Researcher', 'Instituto Politécnico Nacional', 'Istanbul Technical University', 'Lancaster University', 'Maseno University', 'Mila, McGill University & Canada CIFAR AI Chair', 'Northeastern University', 'SADiLaR', 'University of Deusto', 'University of Hamburg', 'Uppsala University', 'Wollo University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08284.jpg', 'data': {'categories': ['#dataset', '#ethics', '#multilingual', '#data', '#low_resource', '#open_source'], 'emoji': '🌍', 'ru': {'title': 'AfriHate: борьба с языком вражды в Африке с помощью локальных данных и экспертизы', 'desc': 'Статья представляет AfriHate - многоязычный набор данных по языку вражды и оскорбительной лексике на 15 африканских языках. Датасет создан для решения проблемы недостатка качественных данных на местных языках и отсутствия вовлечения локальных сообществ в процессы сбора, разметки и модерации контента. Каждый пример в AfriHate размечен носителями языка, знакомыми с местной культурой. Авторы описывают трудности, связанные с созданием датасетов, и представляют результаты базовых классификационных моделей, в том числе с использованием языковых моделей.'}, 'en': {'title': 'Empowering Local Voices Against Hate Speech with AfriHate', 'desc': 'This paper addresses the challenges of identifying and moderating hate speech in the Global South, particularly in African languages. It highlights the limitations of existing moderation techniques that rely on keyword spotting without cultural context, leading to ineffective censorship and oversight of targeted hate campaigns. To combat this, the authors introduce AfriHate, a multilingual dataset of hate speech and abusive language in 15 African languages, annotated by native speakers. 
The paper also discusses the difficulties faced during dataset construction and presents baseline classification results, demonstrating the potential of using large language models (LLMs) for this task.'}, 'zh': {'title': '构建多语言仇恨言论数据集,助力社会文化理解', 'desc': '本论文介绍了AfriHate,这是一个包含15种非洲语言的仇恨言论和辱骂语言数据集。该数据集由熟悉当地文化的母语者进行标注,以解决全球南方地区在仇恨言论管理中的数据缺乏问题。研究还探讨了数据集构建过程中的挑战,并展示了使用和不使用大型语言模型(LLMs)进行分类的基线结果。所有数据集、标注和相关词汇表均可在指定网站上获取。'}}}, {'id': 'https://huggingface.co/papers/2501.08120', 'title': 'In-situ graph reasoning and knowledge expansion using Graph-PReFLexOR', 'url': 'https://huggingface.co/papers/2501.08120', 'abstract': "The pursuit of automated scientific discovery has fueled progress from symbolic logic to modern AI, forging new frontiers in reasoning and pattern recognition. Transformers function as potential systems, where every possible relationship remains latent potentiality until tasks impose constraints, akin to measurement. Yet, refining their sampling requires more than probabilistic selection: solutions must conform to specific structures or rules, ensuring consistency and the invocation of general principles. We present Graph-PReFLexOR (Graph-based Preference-based Recursive Language Modeling for Exploratory Optimization of Reasoning), a framework that combines graph reasoning with symbolic abstraction to dynamically expand domain knowledge. Inspired by reinforcement learning, Graph-PReFLexOR defines reasoning as a structured mapping, where tasks yield knowledge graphs, abstract patterns, and ultimately, final answers. Inspired by category theory, it encodes concepts as nodes and their relationships as edges, supporting hierarchical inference and adaptive learning through isomorphic representations. Demonstrations include hypothesis generation, materials design, and creative reasoning, such as discovering relationships between mythological concepts like 'thin places' with materials science. We propose a 'knowledge garden growth' strategy that integrates insights across domains, promoting interdisciplinary connections. Results with a 3-billion-parameter Graph-PReFLexOR model show superior reasoning depth and adaptability, underscoring the potential for transparent, multidisciplinary AI-driven discovery. It lays the groundwork for general autonomous reasoning solutions.", 'score': 1, 'issue_id': 1683, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f8f5360d1fb8fb75', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics, MIT, Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.08120.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agents', '#graphs', '#rl', '#science', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Graph-PReFLexOR: Новый горизонт в автономном научном открытии', 'desc': 'Статья представляет Graph-PReFLexOR - фреймворк, объединяющий графовые рассуждения с символьной абстракцией для динамического расширения предметных знаний. Вдохновленный теорией категорий, он кодирует концепции как узлы, а их отношения как ребра, поддерживая иерархический вывод и адаптивное обучение. Демонстрации включают генерацию гипотез, дизайн материалов и творческие рассуждения, такие как обнаружение связей между мифологическими концепциями и материаловедением. 
Результаты с 3-миллиардной моделью Graph-PReFLexOR показывают превосходную глубину рассуждений и адаптивность, подчеркивая потенциал для прозрачных, междисциплинарных решений на основе ИИ.'}, 'en': {'title': 'Empowering AI with Graph-Based Reasoning for Scientific Discovery', 'desc': 'This paper introduces Graph-PReFLexOR, a novel framework that enhances automated scientific discovery by integrating graph reasoning with symbolic abstraction. It utilizes a structured mapping approach inspired by reinforcement learning, allowing for the generation of knowledge graphs and abstract patterns from various tasks. The framework supports hierarchical inference and adaptive learning, enabling it to explore interdisciplinary connections effectively. Demonstrations of its capabilities include hypothesis generation and creative reasoning, showcasing its potential for deep and adaptable reasoning in AI-driven discovery.'}, 'zh': {'title': '知识花园的成长:跨领域的智能推理', 'desc': '这篇论文介绍了一种名为Graph-PReFLexOR的框架,它结合了图推理和符号抽象,以动态扩展领域知识。该框架通过结构化映射定义推理,利用知识图谱和抽象模式来生成最终答案。它的灵感来自强化学习和范畴理论,将概念编码为节点,关系编码为边,支持层次推理和自适应学习。实验结果表明,Graph-PReFLexOR在推理深度和适应性方面表现优越,为自动化推理解决方案奠定了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.07556', 'title': 'MatchAnything: Universal Cross-Modality Image Matching with Large-Scale Pre-Training', 'url': 'https://huggingface.co/papers/2501.07556', 'abstract': 'Image matching, which aims to identify corresponding pixel locations between images, is crucial in a wide range of scientific disciplines, aiding in image registration, fusion, and analysis. In recent years, deep learning-based image matching algorithms have dramatically outperformed humans in rapidly and accurately finding large amounts of correspondences. However, when dealing with images captured under different imaging modalities that result in significant appearance changes, the performance of these algorithms often deteriorates due to the scarcity of annotated cross-modal training data. This limitation hinders applications in various fields that rely on multiple image modalities to obtain complementary information. To address this challenge, we propose a large-scale pre-training framework that utilizes synthetic cross-modal training signals, incorporating diverse data from various sources, to train models to recognize and match fundamental structures across images. This capability is transferable to real-world, unseen cross-modality image matching tasks. Our key finding is that the matching model trained with our framework achieves remarkable generalizability across more than eight unseen cross-modality registration tasks using the same network weight, substantially outperforming existing methods, whether designed for generalization or tailored for specific tasks. 
This advancement significantly enhances the applicability of image matching technologies across various scientific disciplines and paves the way for new applications in multi-modality human and artificial intelligence analysis and beyond.', 'score': 0, 'issue_id': 1688, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': 'ad0c408491c545d5', 'authors': ['Xingyi He', 'Hao Yu', 'Sida Peng', 'Dongli Tan', 'Zehong Shen', 'Hujun Bao', 'Xiaowei Zhou'], 'affiliations': ['Shandong University', 'State Key Lab of CAD&CG, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07556.jpg', 'data': {'categories': ['#synthetic', '#dataset', '#multimodal', '#transfer_learning', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Универсальное сопоставление изображений разных модальностей с помощью глубокого обучения', 'desc': 'Статья представляет новый подход к сопоставлению изображений разных модальностей с использованием глубокого обучения. Авторы предлагают фреймворк для предварительного обучения на синтетических кросс-модальных данных, что позволяет модели распознавать фундаментальные структуры в изображениях. Обученная модель демонстрирует впечатляющую обобщаемость на более чем восемь новых задач кросс-модальной регистрации, значительно превосходя существующие методы. Это достижение открывает новые возможности для применения технологий сопоставления изображений в различных научных дисциплинах.'}, 'en': {'title': 'Enhancing Image Matching Across Modalities with Synthetic Training', 'desc': "This paper presents a new framework for image matching that helps identify corresponding pixel locations between images taken in different ways. Traditional deep learning methods struggle with this task due to a lack of annotated training data for different image types. The proposed solution uses synthetic training signals from diverse sources to improve the model's ability to recognize and match structures across various images. As a result, the model shows excellent performance in unseen cross-modal tasks, making it highly useful for applications in many scientific fields."}, 'zh': {'title': '跨模态图像匹配的新突破', 'desc': '本文提出了一种大规模预训练框架,用于解决图像匹配中的跨模态问题。该框架利用合成的跨模态训练信号,结合来自不同来源的多样化数据,训练模型识别和匹配图像中的基本结构。研究发现,使用该框架训练的匹配模型在超过八个未见的跨模态配准任务中表现出显著的泛化能力,远超现有方法。此进展大大增强了图像匹配技术在各科学领域的适用性,并为多模态人类和人工智能分析的新应用铺平了道路。'}}}, {'id': 'https://huggingface.co/papers/2501.13200', 'title': 'SRMT: Shared Memory for Multi-agent Lifelong Pathfinding', 'url': 'https://huggingface.co/papers/2501.13200', 'abstract': "Multi-agent reinforcement learning (MARL) demonstrates significant progress in solving cooperative and competitive multi-agent problems in various environments. One of the principal challenges in MARL is the need for explicit prediction of the agents' behavior to achieve cooperation. To resolve this issue, we propose the Shared Recurrent Memory Transformer (SRMT) which extends memory transformers to multi-agent settings by pooling and globally broadcasting individual working memories, enabling agents to exchange information implicitly and coordinate their actions. We evaluate SRMT on the Partially Observable Multi-Agent Pathfinding problem in a toy Bottleneck navigation task that requires agents to pass through a narrow corridor and on a POGEMA benchmark set of tasks. In the Bottleneck task, SRMT consistently outperforms a variety of reinforcement learning baselines, especially under sparse rewards, and generalizes effectively to longer corridors than those seen during training. 
On POGEMA maps, including Mazes, Random, and MovingAI, SRMT is competitive with recent MARL, hybrid, and planning-based algorithms. These results suggest that incorporating shared recurrent memory into the transformer-based architectures can enhance coordination in decentralized multi-agent systems. The source code for training and evaluation is available on GitHub: https://github.com/Aloriosa/srmt.", 'score': 53, 'issue_id': 1846, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '52d8b3716543aa42', 'authors': ['Alsu Sagirova', 'Yuri Kuratov', 'Mikhail Burtsev'], 'affiliations': ['AIRI, Moscow, Russia', 'London Institute for Mathematical Sciences, London, UK', 'Neural Networks and Deep Learning Lab, MIPT, Dolgoprudny, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2501.13200.jpg', 'data': {'categories': ['#training', '#games', '#rl', '#agents', '#benchmark', '#optimization'], 'emoji': '🤖', 'ru': {'title': 'SRMT: Улучшение координации в децентрализованных мультиагентных системах', 'desc': 'В статье представлен новый подход к мультиагентному обучению с подкреплением (MARL) - Shared Recurrent Memory Transformer (SRMT). SRMT расширяет возможности трансформеров с памятью для мультиагентных систем, объединяя и глобально транслируя индивидуальную рабочую память агентов. Этот метод позволяет агентам неявно обмениваться информацией и координировать свои действия. SRMT показал превосходные результаты на задаче частично наблюдаемого мультиагентного поиска пути, превзойдя базовые алгоритмы обучения с подкреплением и продемонстрировав эффективную генерализацию.'}, 'en': {'title': 'Enhancing Agent Coordination with Shared Memory Transformers', 'desc': 'This paper introduces the Shared Recurrent Memory Transformer (SRMT), a novel approach in multi-agent reinforcement learning (MARL) that enhances cooperation among agents. SRMT utilizes a memory transformer architecture to allow agents to share and broadcast their individual memories, facilitating implicit communication and coordination. The effectiveness of SRMT is demonstrated through experiments on the Partially Observable Multi-Agent Pathfinding problem, where it outperforms traditional reinforcement learning methods, particularly in scenarios with sparse rewards. The results indicate that integrating shared memory into transformer models significantly improves the performance of decentralized multi-agent systems.'}, 'zh': {'title': '共享记忆提升多智能体协调能力', 'desc': '多智能体强化学习(MARL)在解决合作和竞争的多智能体问题上取得了显著进展。本文提出了一种共享递归记忆变换器(SRMT),通过汇聚和全局广播个体工作记忆,帮助智能体隐式交换信息并协调行动。我们在部分可观察的多智能体路径规划问题上评估了SRMT,结果显示其在稀疏奖励下表现优于多种强化学习基线,并且在训练时未见过的更长走廊上也能有效泛化。SRMT在多个基准任务中与最新的MARL、混合和基于规划的算法具有竞争力,表明共享递归记忆的引入可以增强去中心化多智能体系统的协调能力。'}}}, {'id': 'https://huggingface.co/papers/2501.13629', 'title': 'Sigma: Differential Rescaling of Query, Key and Value for Efficient Language Models', 'url': 'https://huggingface.co/papers/2501.13629', 'abstract': "We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data. DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, based on their varying impacts on the model performance and efficiency indicators. 
Specifically, we (1) conduct extensive experiments that demonstrate the model's varying sensitivity to the compression of K and V components, leading to the development of differentially compressed KV, and (2) propose augmented Q to expand the Q head dimension, which enhances the model's representation capacity with minimal impacts on the inference speed. Rigorous theoretical and empirical analyses reveal that DiffQKV attention significantly enhances efficiency, achieving up to a 33.36% improvement in inference speed over the conventional grouped-query attention (GQA) in long-context scenarios. We pre-train Sigma on 6T tokens from various sources, including 19.5B system domain data that we carefully collect and 1T tokens of synthesized and rewritten data. In general domains, Sigma achieves comparable performance to other state-of-the-art models. In the system domain, we introduce the first comprehensive benchmark AIMicius, where Sigma demonstrates remarkable performance across all tasks, significantly outperforming GPT-4 with an absolute improvement up to 52.5%.", 'score': 37, 'issue_id': 1842, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'd036f75a81877ded', 'authors': ['Zhenghao Lin', 'Zihao Tang', 'Xiao Liu', 'Yeyun Gong', 'Yi Cheng', 'Qi Chen', 'Hang Li', 'Ying Xin', 'Ziyue Yang', 'Kailai Yang', 'Yu Yan', 'Xiao Liang', 'Shuai Lu', 'Yiming Huang', 'Zheheng Luo', 'Lei Qu', 'Xuan Feng', 'Yaoxiang Wang', 'Yuqing Xia', 'Feiyang Chen', 'Yuting Jiang', 'Yasen Hu', 'Hao Ni', 'Binyang Li', 'Guoshuai Zhao', 'Jui-Hao Chiang', 'Zhongxin Guo', 'Chen Lin', 'Kun Kuang', 'Wenjie Li', 'Yelong Shen', 'Jian Jiao', 'Peng Cheng', 'Mao Yang'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.13629.jpg', 'data': {'categories': ['#optimization', '#architecture', '#dataset', '#benchmark', '#long_context', '#training', '#synthetic', '#data', '#inference'], 'emoji': '🖥️', 'ru': {'title': 'Sigma: эффективная ЯМ для системной области с инновационным механизмом внимания', 'desc': 'Исследователи представили Sigma - эффективную большую языковую модель, специализированную для системной области. Модель использует новую архитектуру с DiffQKV-вниманием, которая оптимизирует компоненты Q, K и V механизма внимания для повышения эффективности. Sigma предобучена на тщательно собранных данных системной области объемом 6T токенов. На общих задачах модель показывает результаты на уровне современных аналогов, а в системной области значительно превосходит GPT-4 на новом бенчмарке AIMicius.'}, 'en': {'title': 'Sigma: Revolutionizing System Domain Language Models with DiffQKV Attention', 'desc': 'The paper presents Sigma, a specialized large language model designed for the system domain, utilizing a new architecture called DiffQKV attention. This innovative attention mechanism optimizes the Query, Key, and Value components to improve inference efficiency, particularly in long-context scenarios. Through extensive experiments, the authors show that Sigma achieves significant speed improvements, outperforming traditional models like GPT-4 in various tasks. 
The model is pre-trained on a vast dataset, including 19.5 billion tokens from the system domain, establishing a new benchmark for performance in this area.'}, 'zh': {'title': 'Sigma:系统领域的高效语言模型', 'desc': '我们介绍了Sigma,这是一个高效的大型语言模型,专门针对系统领域。它采用了一种新颖的架构,包括DiffQKV注意力机制,并在我们精心收集的系统领域数据上进行了预训练。DiffQKV注意力通过优化注意力机制中的查询(Q)、键(K)和值(V)组件,显著提高了推理效率。实验结果表明,Sigma在系统领域的表现优于GPT-4,绝对提升幅度可达52.5%。'}}}, {'id': 'https://huggingface.co/papers/2501.13918', 'title': 'Improving Video Generation with Human Feedback', 'url': 'https://huggingface.co/papers/2501.13918', 'abstract': 'Video generation has achieved significant advances through rectified flow techniques, but issues like unsmooth motion and misalignment between videos and prompts persist. In this work, we develop a systematic pipeline that harnesses human feedback to mitigate these problems and refine the video generation model. Specifically, we begin by constructing a large-scale human preference dataset focused on modern video generation models, incorporating pairwise annotations across multi-dimensions. We then introduce VideoReward, a multi-dimensional video reward model, and examine how annotations and various design choices impact its rewarding efficacy. From a unified reinforcement learning perspective aimed at maximizing reward with KL regularization, we introduce three alignment algorithms for flow-based models by extending those from diffusion models. These include two training-time strategies: direct preference optimization for flow (Flow-DPO) and reward weighted regression for flow (Flow-RWR), and an inference-time technique, Flow-NRG, which applies reward guidance directly to noisy videos. Experimental results indicate that VideoReward significantly outperforms existing reward models, and Flow-DPO demonstrates superior performance compared to both Flow-RWR and standard supervised fine-tuning methods. Additionally, Flow-NRG lets users assign custom weights to multiple objectives during inference, meeting personalized video quality needs. Project page: https://gongyeliu.github.io/videoalign.', 'score': 34, 'issue_id': 1849, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '933a6a47d8d5e20a', 'authors': ['Jie Liu', 'Gongye Liu', 'Jiajun Liang', 'Ziyang Yuan', 'Xiaokun Liu', 'Mingwu Zheng', 'Xiele Wu', 'Qiulin Wang', 'Wenyu Qin', 'Menghan Xia', 'Xintao Wang', 'Xiaohong Liu', 'Fei Yang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai', 'Yujiu Yang', 'Wanli Ouyang'], 'affiliations': ['Kuaishou Technology', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13918.jpg', 'data': {'categories': ['#dataset', '#training', '#optimization', '#alignment', '#video', '#rlhf'], 'emoji': '🎬', 'ru': {'title': 'Улучшение генерации видео с помощью человеческой обратной связи и обучения с подкреплением', 'desc': 'Данная работа представляет систематический подход к улучшению генерации видео с использованием обратной связи от людей. Авторы создали большой датасет человеческих предпочтений для современных моделей генерации видео и разработали многомерную модель оценки видео VideoReward. Они также предложили три алгоритма выравнивания для моделей на основе потоков: Flow-DPO, Flow-RWR и Flow-NRG. 
Эксперименты показали, что VideoReward значительно превосходит существующие модели оценки, а Flow-DPO демонстрирует лучшие результаты по сравнению с другими методами.'}, 'en': {'title': 'Enhancing Video Generation with Human Feedback and Reward Models', 'desc': 'This paper addresses challenges in video generation, particularly issues of motion smoothness and alignment with prompts. The authors propose a new pipeline that utilizes human feedback to enhance video generation models. They create a large dataset of human preferences and introduce VideoReward, a model that evaluates video quality based on these preferences. The study also presents three innovative algorithms for improving flow-based video generation, demonstrating that their methods outperform existing models and allow for personalized video quality adjustments.'}, 'zh': {'title': '优化视频生成,提升用户体验', 'desc': '本研究提出了一种系统化的视频生成模型优化方法,旨在解决视频生成中的不平滑运动和视频与提示之间的错位问题。我们构建了一个大规模的人类偏好数据集,专注于现代视频生成模型,并进行了多维度的成对注释。引入的VideoReward模型通过强化学习最大化奖励,并提出了三种对齐算法,以提高流模型的性能。实验结果表明,VideoReward在奖励模型中表现优异,Flow-DPO在性能上优于其他方法,满足用户个性化的视频质量需求。'}}}, {'id': 'https://huggingface.co/papers/2501.13926', 'title': "Can We Generate Images with CoT? Let's Verify and Reinforce Image Generation Step by Step", 'url': 'https://huggingface.co/papers/2501.13926', 'abstract': 'Chain-of-Thought (CoT) reasoning has been extensively explored in large models to tackle complex understanding tasks. However, it still remains an open question whether such strategies can be applied to verifying and reinforcing image generation scenarios. In this paper, we provide the first comprehensive investigation of the potential of CoT reasoning to enhance autoregressive image generation. We focus on three techniques: scaling test-time computation for verification, aligning model preferences with Direct Preference Optimization (DPO), and integrating these techniques for complementary effects. Our results demonstrate that these approaches can be effectively adapted and combined to significantly improve image generation performance. Furthermore, given the pivotal role of reward models in our findings, we propose the Potential Assessment Reward Model (PARM) and PARM++, specialized for autoregressive image generation. PARM adaptively assesses each generation step through a potential assessment approach, merging the strengths of existing reward models, and PARM++ further introduces a reflection mechanism to self-correct the generated unsatisfactory image. Using our investigated reasoning strategies, we enhance a baseline model, Show-o, to achieve superior results, with a significant +24% improvement on the GenEval benchmark, surpassing Stable Diffusion 3 by +15%. We hope our study provides unique insights and paves a new path for integrating CoT reasoning with autoregressive image generation. 
Code and models are released at https://github.com/ZiyuGuo99/Image-Generation-CoT', 'score': 21, 'issue_id': 1841, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '61611cbe661736ff', 'authors': ['Ziyu Guo', 'Renrui Zhang', 'Chengzhuo Tong', 'Zhizheng Zhao', 'Peng Gao', 'Hongsheng Li', 'Pheng-Ann Heng'], 'affiliations': ['CUHK', 'MMLab', 'MiuLar Lab', 'Peking University', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.13926.jpg', 'data': {'categories': ['#rlhf', '#games', '#dataset', '#cv', '#reasoning', '#optimization', '#benchmark'], 'emoji': '🖼️', 'ru': {'title': 'Рассуждения по цепочке мыслей открывают новые горизонты в генерации изображений', 'desc': 'Статья исследует применение рассуждений по цепочке мыслей (Chain-of-Thought) для улучшения автореграссивной генерации изображений. Авторы предлагают три метода: масштабирование вычислений во время тестирования, оптимизацию предпочтений модели и интеграцию этих техник. Они также представляют новые модели вознаграждения PARM и PARM++, специально разработанные для генерации изображений. Результаты показывают значительное улучшение производительности базовой модели Show-o на 24% по сравнению с эталоном GenEval.'}, 'en': {'title': 'Enhancing Image Generation with Chain-of-Thought Reasoning', 'desc': 'This paper explores the use of Chain-of-Thought (CoT) reasoning to improve autoregressive image generation models. It investigates three main techniques: enhancing verification through increased computation, aligning model preferences using Direct Preference Optimization (DPO), and combining these methods for better outcomes. The authors introduce the Potential Assessment Reward Model (PARM) and its enhanced version PARM++, which help assess and correct image generation steps. The results show a significant performance boost, achieving a 24% improvement on the GenEval benchmark compared to previous models.'}, 'zh': {'title': '链式思维提升图像生成性能', 'desc': '本文探讨了链式思维(CoT)推理在自回归图像生成中的应用潜力。我们提出了三种技术:测试时计算的扩展、与直接偏好优化(DPO)对齐模型偏好,以及这些技术的整合。研究结果表明,这些方法可以有效结合,显著提升图像生成性能。此外,我们提出了潜力评估奖励模型(PARM)和PARM++,专门用于自回归图像生成,进一步提高了生成质量。'}}}, {'id': 'https://huggingface.co/papers/2501.13826', 'title': 'Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos', 'url': 'https://huggingface.co/papers/2501.13826', 'abstract': "Humans acquire knowledge through three cognitive stages: perceiving information, comprehending knowledge, and adapting knowledge to solve novel problems. Videos serve as an effective medium for this learning process, facilitating a progression through these cognitive stages. However, existing video benchmarks fail to systematically evaluate the knowledge acquisition capabilities in Large Multimodal Models (LMMs). To address this gap, we introduce Video-MMMU, a multi-modal, multi-disciplinary benchmark designed to assess LMMs' ability to acquire and utilize knowledge from videos. Video-MMMU features a curated collection of 300 expert-level videos and 900 human-annotated questions across six disciplines, evaluating knowledge acquisition through stage-aligned question-answer pairs: Perception, Comprehension, and Adaptation. A proposed knowledge gain metric, {\\Delta}knowledge, quantifies improvement in performance after video viewing. 
Evaluation of LMMs reveals a steep decline in performance as cognitive demands increase and highlights a significant gap between human and model knowledge acquisition, underscoring the need for methods to enhance LMMs' capability to learn and adapt from videos.", 'score': 18, 'issue_id': 1848, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '4475243a608bc988', 'authors': ['Kairui Hu', 'Penghao Wu', 'Fanyi Pu', 'Wang Xiao', 'Yuanhan Zhang', 'Xiang Yue', 'Bo Li', 'Ziwei Liu'], 'affiliations': ['Carnegie Mellon University', 'S-Lab, Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13826.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#science', '#multimodal', '#video'], 'emoji': '🎓', 'ru': {'title': 'Новый рубеж в оценке мультимодального обучения ИИ', 'desc': 'Статья представляет новый бенчмарк Video-MMMU для оценки способности больших мультимодальных моделей (LMM) приобретать знания из видео. Бенчмарк включает 300 экспертных видео и 900 вопросов по шести дисциплинам, оценивая восприятие, понимание и адаптацию знаний. Введена метрика ∆knowledge для измерения прироста знаний после просмотра видео. Результаты показывают значительный разрыв между человеческим и машинным обучением, подчеркивая необходимость улучшения LMM в области обучения на основе видео.'}, 'en': {'title': 'Enhancing Knowledge Acquisition in LMMs through Video Learning', 'desc': 'This paper introduces Video-MMMU, a benchmark designed to evaluate Large Multimodal Models (LMMs) in their ability to learn from videos. It focuses on three cognitive stages: perception, comprehension, and adaptation, using a set of 300 expert videos and 900 questions. The benchmark assesses how well LMMs can acquire knowledge through these stages, revealing a significant performance gap compared to humans. A new metric, Δknowledge, measures the improvement in LMM performance after watching videos, highlighting the need for better learning methods in these models.'}, 'zh': {'title': '提升多模态模型的视频知识获取能力', 'desc': '本文探讨了人类通过三个认知阶段获取知识的过程:感知信息、理解知识和适应知识以解决新问题。视频作为一种有效的学习媒介,能够促进这些认知阶段的进展。然而,现有的视频基准未能系统地评估大型多模态模型(LMMs)在知识获取方面的能力。为此,我们提出了Video-MMMU,这是一个多模态、多学科的基准,旨在评估LMMs从视频中获取和利用知识的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.13919', 'title': 'Temporal Preference Optimization for Long-Form Video Understanding', 'url': 'https://huggingface.co/papers/2501.13919', 'abstract': 'Despite significant advancements in video large multimodal models (video-LMMs), achieving effective temporal grounding in long-form videos remains a challenge for existing models. To address this limitation, we propose Temporal Preference Optimization (TPO), a novel post-training framework designed to enhance the temporal grounding capabilities of video-LMMs through preference learning. TPO adopts a self-training approach that enables models to differentiate between well-grounded and less accurate temporal responses by leveraging curated preference datasets at two granularities: localized temporal grounding, which focuses on specific video segments, and comprehensive temporal grounding, which captures extended temporal dependencies across entire video sequences. By optimizing on these preference datasets, TPO significantly enhances temporal understanding while reducing reliance on manually annotated data. 
Extensive experiments on three long-form video understanding benchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness of TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO establishes itself as the leading 7B model on the Video-MME benchmark, underscoring the potential of TPO as a scalable and efficient solution for advancing temporal reasoning in long-form video understanding. Project page: https://ruili33.github.io/tpo_website.', 'score': 17, 'issue_id': 1843, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '6e08b56893fb98a9', 'authors': ['Rui Li', 'Xiaohan Wang', 'Yuhui Zhang', 'Zeyu Wang', 'Serena Yeung-Levy'], 'affiliations': ['Stanford University', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.13919.jpg', 'data': {'categories': ['#multimodal', '#long_context', '#reasoning', '#training', '#optimization', '#video', '#benchmark'], 'emoji': '⏳', 'ru': {'title': 'TPO: Улучшение временного понимания в видео-LMM без ручной разметки', 'desc': 'Статья представляет новый метод под названием Temporal Preference Optimization (TPO) для улучшения временной привязки в видео-LMM моделях. TPO использует самообучение на основе предпочтений для различения хорошо и плохо привязанных во времени ответов. Метод работает на двух уровнях: локальная временная привязка для конкретных сегментов видео и комплексная для всей последовательности. Эксперименты на трех бенчмарках для длинных видео показали эффективность TPO для улучшения временного понимания в видео-LMM.'}, 'en': {'title': 'Enhancing Temporal Understanding in Long Videos with TPO', 'desc': "This paper introduces Temporal Preference Optimization (TPO), a new framework aimed at improving how video large multimodal models (video-LMMs) understand time in long videos. TPO uses a self-training method that helps models learn to tell the difference between accurate and inaccurate timing responses by using specially curated preference datasets. These datasets focus on both specific video segments and the overall flow of the entire video, enhancing the model's ability to grasp temporal relationships. The results show that TPO significantly boosts performance on various benchmarks, making it a promising approach for better temporal reasoning in video analysis."}, 'zh': {'title': '时间偏好优化:提升视频理解的关键', 'desc': '尽管视频大型多模态模型(video-LMMs)取得了显著进展,但在长视频中实现有效的时间定位仍然是一个挑战。为了解决这个问题,我们提出了一种新的后训练框架——时间偏好优化(TPO),旨在通过偏好学习增强视频-LMMs的时间定位能力。TPO采用自我训练的方法,利用精心策划的偏好数据集,使模型能够区分准确的时间响应和不太准确的时间响应。通过在这两个层次上优化偏好数据集,TPO显著提高了时间理解能力,同时减少了对手动标注数据的依赖。'}}}, {'id': 'https://huggingface.co/papers/2501.13920', 'title': 'IMAGINE-E: Image Generation Intelligence Evaluation of State-of-the-art Text-to-Image Models', 'url': 'https://huggingface.co/papers/2501.13920', 'abstract': "With the rapid development of diffusion models, text-to-image(T2I) models have made significant progress, showcasing impressive abilities in prompt following and image generation. Recently launched models such as FLUX.1 and Ideogram2.0, along with others like Dall-E3 and Stable Diffusion 3, have demonstrated exceptional performance across various complex tasks, raising questions about whether T2I models are moving towards general-purpose applicability. 
Beyond traditional image generation, these models exhibit capabilities across a range of fields, including controllable generation, image editing, video, audio, 3D, and motion generation, as well as computer vision tasks like semantic segmentation and depth estimation. However, current evaluation frameworks are insufficient to comprehensively assess these models' performance across expanding domains. To thoroughly evaluate these models, we developed the IMAGINE-E and tested six prominent models: FLUX.1, Ideogram2.0, Midjourney, Dall-E3, Stable Diffusion 3, and Jimeng. Our evaluation is divided into five key domains: structured output generation, realism, and physical consistency, specific domain generation, challenging scenario generation, and multi-style creation tasks. This comprehensive assessment highlights each model's strengths and limitations, particularly the outstanding performance of FLUX.1 and Ideogram2.0 in structured and specific domain tasks, underscoring the expanding applications and potential of T2I models as foundational AI tools. This study provides valuable insights into the current state and future trajectory of T2I models as they evolve towards general-purpose usability. Evaluation scripts will be released at https://github.com/jylei16/Imagine-e.", 'score': 12, 'issue_id': 1843, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '837193826ae51376', 'authors': ['Jiayi Lei', 'Renrui Zhang', 'Xiangfei Hu', 'Weifeng Lin', 'Zhen Li', 'Wenjian Sun', 'Ruoyi Du', 'Le Zhuo', 'Zhongyu Li', 'Xinyue Li', 'Shitian Zhao', 'Ziyu Guo', 'Yiting Lu', 'Peng Gao', 'Hongsheng Li'], 'affiliations': ['CUHK MMLab', 'Shanghai AI Laboratory', 'Shanghai Jiaotong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13920.jpg', 'data': {'categories': ['#audio', '#multimodal', '#cv', '#3d', '#diffusion', '#video', '#benchmark', '#survey'], 'emoji': '🎨', 'ru': {'title': 'Новый рубеж в оценке моделей текст-изображение: путь к универсальному ИИ', 'desc': 'Эта статья посвящена оценке современных моделей преобразования текста в изображение (T2I). Авторы разработали новую систему оценки IMAGINE-E для тестирования шести ведущих моделей в пяти ключевых областях. Исследование выявило выдающиеся способности моделей FLUX.1 и Ideogram2.0 в структурированных задачах и задачах специфических доменов. Результаты подчеркивают растущий потенциал моделей T2I как универсальных инструментов искусственного интеллекта.'}, 'en': {'title': 'Evaluating the Future of Text-to-Image Models', 'desc': 'This paper discusses the advancements in text-to-image (T2I) models, particularly focusing on recent models like FLUX.1 and Ideogram2.0. These models not only excel in generating images from text prompts but also show versatility in various tasks such as image editing and video generation. The authors introduce a new evaluation framework called IMAGINE-E to assess the performance of six leading T2I models across multiple domains. 
The findings reveal that while some models perform exceptionally well in specific tasks, there is a need for better evaluation methods to fully understand their capabilities and limitations.'}, 'zh': {'title': '文本到图像模型的未来:通用性与评估的挑战', 'desc': '随着扩散模型的快速发展,文本到图像(T2I)模型在提示跟随和图像生成方面取得了显著进展。新推出的模型如FLUX.1和Ideogram2.0,以及Dall-E3和Stable Diffusion 3等,展示了在各种复杂任务中的卓越表现,提出了T2I模型是否朝着通用适用性发展的疑问。除了传统的图像生成,这些模型在可控生成、图像编辑、视频、音频、3D和运动生成等多个领域也展现了能力。为了全面评估这些模型的性能,我们开发了IMAGINE-E,并对六个主要模型进行了测试,强调了它们在不同领域的优势和局限性,特别是FLUX.1和Ideogram2.0在结构化和特定领域任务中的出色表现。'}}}, {'id': 'https://huggingface.co/papers/2501.10799', 'title': 'Step-KTO: Optimizing Mathematical Reasoning through Stepwise Binary Feedback', 'url': 'https://huggingface.co/papers/2501.10799', 'abstract': 'Large language models (LLMs) have recently demonstrated remarkable success in mathematical reasoning. Despite progress in methods like chain-of-thought prompting and self-consistency sampling, these advances often focus on final correctness without ensuring that the underlying reasoning process is coherent and reliable. This paper introduces Step-KTO, a training framework that combines process-level and outcome-level binary feedback to guide LLMs toward more trustworthy reasoning trajectories. By providing binary evaluations for both the intermediate reasoning steps and the final answer, Step-KTO encourages the model to adhere to logical progressions rather than relying on superficial shortcuts. Our experiments on challenging mathematical benchmarks show that Step-KTO significantly improves both final answer accuracy and the quality of intermediate reasoning steps. For example, on the MATH-500 dataset, Step-KTO achieves a notable improvement in Pass@1 accuracy over strong baselines. These results highlight the promise of integrating stepwise process feedback into LLM training, paving the way toward more interpretable and dependable reasoning capabilities.', 'score': 11, 'issue_id': 1842, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': 'd43b005a69156930', 'authors': ['Yen-Ting Lin', 'Di Jin', 'Tengyu Xu', 'Tianhao Wu', 'Sainbayar Sukhbaatar', 'Chen Zhu', 'Yun He', 'Yun-Nung Chen', 'Jason Weston', 'Yuandong Tian', 'Arash Rahnama', 'Sinong Wang', 'Hao Ma', 'Han Fang'], 'affiliations': ['Meta FAIR', 'Meta GenAI', 'National Taiwan University', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.10799.jpg', 'data': {'categories': ['#interpretability', '#training', '#math', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к надежным математическим рассуждениям ИИ', 'desc': 'Статья представляет новый подход к обучению больших языковых моделей (LLM) для математических рассуждений. Метод Step-KTO использует бинарную обратную связь как для промежуточных шагов рассуждения, так и для конечного результата. Это позволяет модели следовать логичному ходу мыслей, а не полагаться на поверхностные шаблоны. Эксперименты на сложных математических тестах показали значительное улучшение как точности конечного ответа, так и качества промежуточных шагов рассуждения.'}, 'en': {'title': 'Enhancing Trustworthy Reasoning in LLMs with Step-KTO', 'desc': 'This paper presents Step-KTO, a new training framework for large language models (LLMs) that enhances their mathematical reasoning abilities. Unlike previous methods that focus solely on the final answer, Step-KTO provides feedback on both the reasoning process and the outcome, promoting logical coherence. 
By evaluating intermediate reasoning steps alongside the final result, the framework helps LLMs avoid shortcuts and develop more reliable reasoning paths. Experiments show that Step-KTO significantly boosts accuracy and improves the quality of reasoning in challenging mathematical tasks, indicating its potential for creating more interpretable AI systems.'}, 'zh': {'title': '提升推理可信度的Step-KTO框架', 'desc': '大型语言模型(LLMs)在数学推理方面取得了显著成功。尽管链式思维提示和自一致性采样等方法有所进展,但这些方法往往只关注最终结果的正确性,而未能确保推理过程的连贯性和可靠性。本文提出了Step-KTO,这是一种结合过程级和结果级二元反馈的训练框架,旨在引导LLMs朝着更可信的推理轨迹发展。实验结果表明,Step-KTO显著提高了最终答案的准确性和中间推理步骤的质量,展示了逐步过程反馈在LLM训练中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.10018', 'title': 'DiffuEraser: A Diffusion Model for Video Inpainting', 'url': 'https://huggingface.co/papers/2501.10018', 'abstract': 'Recent video inpainting algorithms integrate flow-based pixel propagation with transformer-based generation to leverage optical flow for restoring textures and objects using information from neighboring frames, while completing masked regions through visual Transformers. However, these approaches often encounter blurring and temporal inconsistencies when dealing with large masks, highlighting the need for models with enhanced generative capabilities. Recently, diffusion models have emerged as a prominent technique in image and video generation due to their impressive performance. In this paper, we introduce DiffuEraser, a video inpainting model based on stable diffusion, designed to fill masked regions with greater details and more coherent structures. We incorporate prior information to provide initialization and weak conditioning, which helps mitigate noisy artifacts and suppress hallucinations. Additionally, to improve temporal consistency during long-sequence inference, we expand the temporal receptive fields of both the prior model and DiffuEraser, and further enhance consistency by leveraging the temporal smoothing property of Video Diffusion Models. Experimental results demonstrate that our proposed method outperforms state-of-the-art techniques in both content completeness and temporal consistency while maintaining acceptable efficiency.', 'score': 10, 'issue_id': 1846, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '8ebb9334e60b0dd7', 'authors': ['Xiaowen Li', 'Haolan Xue', 'Peiran Ren', 'Liefeng Bo'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10018.jpg', 'data': {'categories': ['#diffusion', '#video', '#long_context', '#hallucinations', '#cv'], 'emoji': '🎬', 'ru': {'title': 'DiffuEraser: Улучшенное восстановление видео с помощью диффузионных моделей', 'desc': 'DiffuEraser - это новая модель для восстановления видео на основе стабильной диффузии. Она использует предварительную информацию для инициализации и слабого кондиционирования, что помогает уменьшить шумовые артефакты. Модель расширяет временные рецептивные поля для улучшения временной согласованности при выводе длинных последовательностей. Экспериментальные результаты показывают, что DiffuEraser превосходит современные методы по полноте содержания и временной согласованности.'}, 'en': {'title': 'Enhancing Video Inpainting with Diffusion Models for Better Consistency and Detail', 'desc': 'This paper presents DiffuEraser, a novel video inpainting model that utilizes stable diffusion techniques to improve the restoration of masked regions in videos. 
By integrating prior information for initialization and weak conditioning, the model effectively reduces noise and visual artifacts. The authors enhance temporal consistency by expanding the temporal receptive fields and utilizing the smoothing properties of Video Diffusion Models. Experimental results show that DiffuEraser surpasses existing methods in terms of content completeness and temporal coherence, while also being efficient.'}, 'zh': {'title': 'DiffuEraser:提升视频修复的细节与一致性', 'desc': '本文介绍了一种名为DiffuEraser的视频修复模型,基于稳定扩散技术,旨在用更丰富的细节和更连贯的结构填补被遮挡的区域。我们通过引入先验信息来提供初始化和弱条件,从而减少噪声伪影和抑制幻觉现象。为了提高长序列推理过程中的时间一致性,我们扩展了先验模型和DiffuEraser的时间感受野,并利用视频扩散模型的时间平滑特性进一步增强一致性。实验结果表明,我们的方法在内容完整性和时间一致性方面优于现有的最先进技术,同时保持了可接受的效率。'}}}, {'id': 'https://huggingface.co/papers/2501.13554', 'title': 'One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation Using a Single Prompt', 'url': 'https://huggingface.co/papers/2501.13554', 'abstract': 'Text-to-image generation models can create high-quality images from input prompts. However, they struggle to support the consistent generation of identity-preserving requirements for storytelling. Existing approaches to this problem typically require extensive training in large datasets or additional modifications to the original model architectures. This limits their applicability across different domains and diverse diffusion model configurations. In this paper, we first observe the inherent capability of language models, coined context consistency, to comprehend identity through context with a single prompt. Drawing inspiration from the inherent context consistency, we propose a novel training-free method for consistent text-to-image (T2I) generation, termed "One-Prompt-One-Story" (1Prompt1Story). Our approach 1Prompt1Story concatenates all prompts into a single input for T2I diffusion models, initially preserving character identities. We then refine the generation process using two novel techniques: Singular-Value Reweighting and Identity-Preserving Cross-Attention, ensuring better alignment with the input description for each frame. In our experiments, we compare our method against various existing consistent T2I generation approaches to demonstrate its effectiveness through quantitative metrics and qualitative assessments. Code is available at https://github.com/byliutao/1Prompt1Story.', 'score': 8, 'issue_id': 1852, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '15ba8f8e21d0e703', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#training', '#cv', '#story_generation', '#open_source', '#optimization', '#diffusion', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'Один промпт - одна история: последовательная генерация изображений без дообучения', 'desc': "Статья представляет новый метод генерации последовательных изображений из текста под названием '1Prompt1Story'. Этот подход объединяет все промпты в единый вход для диффузионных моделей, сохраняя идентичность персонажей. Метод использует две новые техники: переоценку сингулярных значений и сохраняющее идентичность кросс-внимание. '1Prompt1Story' не требует дополнительного обучения и применим к различным конфигурациям диффузионных моделей."}, 'en': {'title': 'Consistent Storytelling in Text-to-Image Generation', 'desc': 'This paper addresses the challenge of generating consistent images that preserve character identities in text-to-image (T2I) models. 
The authors introduce a novel method called "One-Prompt-One-Story" (1Prompt1Story), which allows for the concatenation of multiple prompts into a single input, enhancing the model\'s ability to maintain character consistency. They employ two innovative techniques, Singular-Value Reweighting and Identity-Preserving Cross-Attention, to refine the image generation process and ensure alignment with the input descriptions. The proposed method is evaluated against existing approaches, showing improved performance in both quantitative metrics and qualitative assessments.'}, 'zh': {'title': '一提示一故事:提升文本到图像生成的一致性', 'desc': '本文提出了一种新的文本到图像生成方法,称为"一提示一故事"(1Prompt1Story),旨在解决生成过程中角色身份一致性的问题。该方法通过将所有提示合并为单个输入,初步保持角色身份,并利用两种新技术进行生成过程的优化。我们的方法不需要大量训练数据或对模型架构的修改,具有更广泛的适用性。实验结果表明,1Prompt1Story在定量和定性评估中均优于现有的一致性生成方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.13824', 'title': 'Hallucinations Can Improve Large Language Models in Drug Discovery', 'url': 'https://huggingface.co/papers/2501.13824', 'abstract': 'Concerns about hallucinations in Large Language Models (LLMs) have been raised by researchers, yet their potential in areas where creativity is vital, such as drug discovery, merits exploration. In this paper, we come up with the hypothesis that hallucinations can improve LLMs in drug discovery. To verify this hypothesis, we use LLMs to describe the SMILES string of molecules in natural language and then incorporate these descriptions as part of the prompt to address specific tasks in drug discovery. Evaluated on seven LLMs and five classification tasks, our findings confirm the hypothesis: LLMs can achieve better performance with text containing hallucinations. Notably, Llama-3.1-8B achieves an 18.35% gain in ROC-AUC compared to the baseline without hallucination. Furthermore, hallucinations generated by GPT-4o provide the most consistent improvements across models. Additionally, we conduct empirical analyses and a case study to investigate key factors affecting performance and the underlying reasons. Our research sheds light on the potential use of hallucinations for LLMs and offers new perspectives for future research leveraging LLMs in drug discovery.', 'score': 5, 'issue_id': 1853, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'bd66442952551d3e', 'authors': ['Shuzhou Yuan', 'Michael Färber'], 'affiliations': ['Center for Scalable Data Analytics and Artificial Intelligence (ScaDS.AI), Germany', 'Dresden University of Technology, Germany'], 'pdf_title_img': 'assets/pdf/title_img/2501.13824.jpg', 'data': {'categories': ['#healthcare', '#rlhf', '#science', '#hallucinations'], 'emoji': '💊', 'ru': {'title': 'Галлюцинации LLM ускоряют разработку лекарств', 'desc': 'Исследователи изучили потенциал галлюцинаций в больших языковых моделях (LLM) для улучшения процесса открытия новых лекарств. Они использовали LLM для описания молекул на естественном языке и включили эти описания в промпты для решения задач в области разработки лекарств. Эксперименты на семи LLM и пяти задачах классификации подтвердили гипотезу: модели показали лучшие результаты с текстами, содержащими галлюцинации. Исследование открывает новые перспективы использования LLM в фармацевтике.'}, 'en': {'title': 'Harnessing Hallucinations: Boosting Drug Discovery with LLMs', 'desc': "This paper explores the idea that hallucinations in Large Language Models (LLMs) can enhance their performance in drug discovery tasks. 
The authors hypothesize that by using LLMs to generate natural language descriptions of molecular SMILES strings, they can improve the models' effectiveness in specific classification tasks. Their experiments show that LLMs, particularly Llama-3.1-8B, achieve significant performance gains when incorporating these hallucinated descriptions, with an 18.35% increase in ROC-AUC. The study provides insights into how hallucinations can be beneficial in creative applications like drug discovery, suggesting new avenues for future research."}, 'zh': {'title': '利用幻觉提升药物发现中的大型语言模型表现', 'desc': '本研究探讨了大型语言模型(LLMs)在药物发现中的潜力,尤其是它们的幻觉现象。我们提出假设,幻觉可以提升LLMs在药物发现任务中的表现。通过将LLMs生成的分子SMILES字符串描述作为提示的一部分,我们在七个LLMs和五个分类任务上进行了评估。结果表明,包含幻觉的文本能显著提高模型性能,尤其是Llama-3.1-8B在ROC-AUC上比基线提高了18.35%。'}}}, {'id': 'https://huggingface.co/papers/2501.13452', 'title': 'EchoVideo: Identity-Preserving Human Video Generation by Multimodal Feature Fusion', 'url': 'https://huggingface.co/papers/2501.13452', 'abstract': 'Recent advancements in video generation have significantly impacted various downstream applications, particularly in identity-preserving video generation (IPT2V). However, existing methods struggle with "copy-paste" artifacts and low similarity issues, primarily due to their reliance on low-level facial image information. This dependence can result in rigid facial appearances and artifacts reflecting irrelevant details. To address these challenges, we propose EchoVideo, which employs two key strategies: (1) an Identity Image-Text Fusion Module (IITF) that integrates high-level semantic features from text, capturing clean facial identity representations while discarding occlusions, poses, and lighting variations to avoid the introduction of artifacts; (2) a two-stage training strategy, incorporating a stochastic method in the second phase to randomly utilize shallow facial information. The objective is to balance the enhancements in fidelity provided by shallow features while mitigating excessive reliance on them. This strategy encourages the model to utilize high-level features during training, ultimately fostering a more robust representation of facial identities. EchoVideo effectively preserves facial identities and maintains full-body integrity. Extensive experiments demonstrate that it achieves excellent results in generating high-quality, controllability and fidelity videos.', 'score': 5, 'issue_id': 1846, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'b98d987f7439b94b', 'authors': ['Jiangchuan Wei', 'Shiyue Yan', 'Wenfeng Lin', 'Boyuan Liu', 'Renjie Chen', 'Mingyu Guo'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.13452.jpg', 'data': {'categories': ['#video'], 'emoji': '🎭', 'ru': {'title': 'EchoVideo: Новый подход к генерации видео с сохранением идентичности', 'desc': 'EchoVideo - это новый метод генерации видео с сохранением идентичности (IPT2V). Он использует модуль слияния изображения и текста (IITF) для интеграции семантических признаков и получения чистых представлений лиц. Применяется двухэтапная стратегия обучения со стохастическим использованием поверхностной информации о лицах. 
EchoVideo эффективно сохраняет идентичность лиц и целостность всего тела, демонстрируя отличные результаты в генерации качественных и контролируемых видео.'}, 'en': {'title': 'EchoVideo: Enhancing Identity Preservation in Video Generation', 'desc': "The paper introduces EchoVideo, a novel approach to identity-preserving video generation that addresses common issues like 'copy-paste' artifacts and low similarity in generated videos. It utilizes an Identity Image-Text Fusion Module (IITF) to merge high-level semantic features from text, ensuring clean facial identity representations while avoiding irrelevant details. Additionally, a two-stage training strategy is implemented, which includes a stochastic method to balance the use of shallow facial information with high-level features. This results in improved fidelity and robustness in facial identity representation, leading to high-quality video generation with better controllability."}, 'zh': {'title': 'EchoVideo:提升视频生成的身份保留与质量', 'desc': '近年来,视频生成技术的进步对身份保留视频生成(IPT2V)产生了重要影响。然而,现有方法在生成过程中常常出现“复制粘贴”伪影和低相似度的问题,这主要是因为它们依赖于低级别的面部图像信息。为了解决这些挑战,我们提出了EchoVideo,采用了身份图像-文本融合模块(IITF)和两阶段训练策略,旨在平衡浅层特征的增强与高层特征的利用。实验表明,EchoVideo在生成高质量、可控性和保真度的视频方面表现出色,有效保留了面部身份和全身完整性。'}}}, {'id': 'https://huggingface.co/papers/2501.10979', 'title': 'Control LLM: Controlled Evolution for Intelligence Retention in LLM', 'url': 'https://huggingface.co/papers/2501.10979', 'abstract': "Large Language Models (LLMs) demand significant computational resources, making it essential to enhance their capabilities without retraining from scratch. A key challenge in this domain is catastrophic forgetting (CF), which hampers performance during Continuous Pre-training (CPT) and Continuous Supervised Fine-Tuning (CSFT). We propose Control LLM, a novel approach that leverages parallel pre-trained and expanded transformer blocks, aligning their hidden-states through interpolation strategies. This method effectively preserves performance on existing tasks while seamlessly integrating new knowledge. Extensive experiments demonstrate the effectiveness of Control LLM in both CPT and CSFT. On Llama3.1-8B-Instruct, it achieves significant improvements in mathematical reasoning (+14.4% on Math-Hard) and coding performance (+10% on MBPP-PLUS). On Llama3.1-8B, it enhances multilingual capabilities (+10.6% on C-Eval, +6.8% on CMMLU, and +30.2% on CMMLU-0shot-CoT). It surpasses existing methods and achieves SOTA among open-source models tuned from the same base model, using substantially less data and compute. Crucially, these gains are realized while preserving strong original capabilities, with minimal degradation (<4.3% on MMLU) compared to >35% in open-source Math and Coding models. This approach has been successfully deployed in LinkedIn's GenAI-powered job seeker and Ads unit products. 
To support further research, we release the training and evaluation code (https://github.com/linkedin/ControlLLM) along with models trained on public datasets ( https://huggingface.co/ControlLLM) to the community.", 'score': 3, 'issue_id': 1858, 'pub_date': '2025-01-19', 'pub_date_card': {'ru': '19 января', 'en': 'January 19', 'zh': '1月19日'}, 'hash': 'dd48db75ab08337c', 'authors': ['Haichao Wei', 'Yunxiang Ren', 'Zhoutong Fu', 'Aman Lunia', 'Yi-Lin Chen', 'Alice Leung', 'Ya Xu'], 'affiliations': ['LinkedIn'], 'pdf_title_img': 'assets/pdf/title_img/2501.10979.jpg', 'data': {'categories': ['#dataset', '#training', '#open_source', '#math', '#optimization', '#multilingual'], 'emoji': '🧠', 'ru': {'title': 'Контроль над забыванием: новый метод обучения языковых моделей', 'desc': 'Control LLM - это новый подход к обучению больших языковых моделей, который решает проблему катастрофического забывания при непрерывном предобучении и дообучении. Метод использует параллельные предобученные и расширенные блоки трансформера, интерполируя их скрытые состояния. Эксперименты показали значительное улучшение производительности в математических рассуждениях, программировании и многоязычных задачах без существенной потери изначальных возможностей. Подход успешно применен в продуктах LinkedIn и открыт для исследовательского сообщества.'}, 'en': {'title': 'Enhancing LLMs Without Starting Over!', 'desc': 'This paper introduces Control LLM, a new method designed to improve the performance of Large Language Models (LLMs) without the need for complete retraining. It addresses the issue of catastrophic forgetting that occurs during Continuous Pre-training (CPT) and Continuous Supervised Fine-Tuning (CSFT) by using parallel pre-trained transformer blocks and interpolation strategies to align hidden states. The results show that Control LLM significantly enhances performance in various tasks, including mathematical reasoning and coding, while maintaining strong original capabilities. The method has been validated through extensive experiments and is made available for further research, demonstrating its effectiveness in real-world applications.'}, 'zh': {'title': '提升大型语言模型能力的新方法', 'desc': '大型语言模型(LLMs)需要大量计算资源,因此在不从头开始重新训练的情况下提升其能力至关重要。本文提出了一种新方法Control LLM,通过并行预训练和扩展的变换器块,利用插值策略对齐其隐藏状态,从而有效地保留现有任务的性能并无缝整合新知识。实验结果表明,Control LLM在连续预训练和连续监督微调中表现出色,显著提高了数学推理和编码性能,同时在多语言能力上也有显著提升。该方法在保持原有强大能力的同时,减少了数据和计算的需求,展示了其在开源模型中的领先地位。'}}}, {'id': 'https://huggingface.co/papers/2501.13075', 'title': 'Evolution and The Knightian Blindspot of Machine Learning', 'url': 'https://huggingface.co/papers/2501.13075', 'abstract': "This paper claims that machine learning (ML) largely overlooks an important facet of general intelligence: robustness to a qualitatively unknown future in an open world. Such robustness relates to Knightian uncertainty (KU) in economics, i.e. uncertainty that cannot be quantified, which is excluded from consideration in ML's key formalisms. This paper aims to identify this blind spot, argue its importance, and catalyze research into addressing it, which we believe is necessary to create truly robust open-world AI. To help illuminate the blind spot, we contrast one area of ML, reinforcement learning (RL), with the process of biological evolution. Despite staggering ongoing progress, RL still struggles in open-world situations, often failing under unforeseen situations. For example, the idea of zero-shot transferring a self-driving car policy trained only in the US to the UK currently seems exceedingly ambitious. 
In dramatic contrast, biological evolution routinely produces agents that thrive within an open world, sometimes even to situations that are remarkably out-of-distribution (e.g. invasive species; or humans, who do undertake such zero-shot international driving). Interestingly, evolution achieves such robustness without explicit theory, formalisms, or mathematical gradients. We explore the assumptions underlying RL's typical formalisms, showing how they limit RL's engagement with the unknown unknowns characteristic of an ever-changing complex world. Further, we identify mechanisms through which evolutionary processes foster robustness to novel and unpredictable challenges, and discuss potential pathways to algorithmically embody them. The conclusion is that the intriguing remaining fragility of ML may result from blind spots in its formalisms, and that significant gains may result from direct confrontation with the challenge of KU.", 'score': 3, 'issue_id': 1845, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '5be12844b33bd729', 'authors': ['Joel Lehman', 'Elliot Meyerson', 'Tarek El-Gaaly', 'Kenneth O. Stanley', 'Tarin Ziyaee'], 'affiliations': ['Cognizant AI Labs', 'Second Nature AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.13075.jpg', 'data': {'categories': ['#rl', '#agi', '#agents', '#reasoning', '#math'], 'emoji': '🧬', 'ru': {'title': 'Преодоление неизвестного: уроки эволюции для машинного обучения', 'desc': 'Статья утверждает, что машинное обучение упускает важный аспект общего интеллекта: устойчивость к качественно неизвестному будущему в открытом мире. Авторы сравнивают обучение с подкреплением (RL) и биологическую эволюцию, показывая, что RL часто не справляется с непредвиденными ситуациями. В статье исследуются предположения, лежащие в основе формализмов RL, и выявляются механизмы, с помощью которых эволюционные процессы способствуют устойчивости к новым и непредсказуемым вызовам. Авторы приходят к выводу, что хрупкость машинного обучения может быть результатом слепых пятен в его формализмах, и значительные улучшения могут быть достигнуты путем прямого противостояния проблеме неопределенности Найта.'}, 'en': {'title': 'Bridging the Gap: Enhancing ML Robustness through Evolutionary Insights', 'desc': 'This paper highlights a critical gap in machine learning (ML) regarding its ability to handle unknown future scenarios, which is essential for general intelligence. It draws parallels between reinforcement learning (RL) and biological evolution, emphasizing that while RL struggles with unforeseen situations, evolution naturally adapts to them. The authors argue that current ML formalisms overlook Knightian uncertainty, which limits the robustness of AI systems in open-world environments. They propose that by understanding and integrating evolutionary mechanisms, ML can improve its resilience to unpredictable challenges.'}, 'zh': {'title': '机器学习需面对未知不确定性挑战', 'desc': '这篇论文指出,机器学习(ML)在处理开放世界中的未知未来时,忽视了一个重要方面:对未知不确定性的鲁棒性。作者将这种鲁棒性与经济学中的奈特不确定性(Knightian Uncertainty)相联系,认为这是机器学习关键形式化中被排除的因素。通过对比强化学习(RL)与生物进化过程,论文强调了RL在开放世界情境中的局限性,并探讨了生物进化如何在没有明确理论的情况下,培养出适应复杂环境的能力。最后,作者认为,机器学习的脆弱性可能源于其形式化中的盲点,直接面对奈特不确定性挑战可能会带来显著的进步。'}}}, {'id': 'https://huggingface.co/papers/2501.13124', 'title': 'Debate Helps Weak-to-Strong Generalization', 'url': 'https://huggingface.co/papers/2501.13124', 'abstract': 'Common methods for aligning already-capable models with desired behavior rely on the ability of humans to provide supervision. 
However, future superhuman models will surpass the capability of humans. Therefore, humans will only be able to weakly supervise superhuman models. This expected deficiency of human evaluation would weaken the safety of future AI systems. Scalable oversight and weak-to-strong generalization are two complementary approaches to tackle this issue. In this paper, we attempt to combine the strengths of these two approaches to further improve alignment. Specifically, we investigate ways of improving human supervision with a strong pretrained model and then supervise the strong model with enhanced weak human supervision. To make iterative empirical progress, we consider an analogy: can we use a strong model to improve weak model supervision and then use it to supervise the strong model? We empirically test it by finetuning a small weak model on ground truth labels with the additional help from a large strong model, and then finetuning the strong model on labels generated by the weak model. We find that debate can assist a weak model in extracting trustworthy information from an untrustworthy strong model, which provides leverage as context on samples when training a weak model. We also show that an ensemble of weak models helps exploit long arguments generated by strong model debaters and obtain a more robust supervision estimate. Extensive experiments on the OpenAI weak-to-strong NLP benchmarks show that the combination approach leads to better alignment, which indicates that debate has the potential to help weak-to-strong generalization.', 'score': 3, 'issue_id': 1843, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'cacd0d01e3d119ee', 'authors': ['Hao Lang', 'Fei Huang', 'Yongbin Li'], 'affiliations': ['Tongyi Lab, Alibaba Inc.'], 'pdf_title_img': 'assets/pdf/title_img/2501.13124.jpg', 'data': {'categories': ['#alignment', '#training', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'Улучшение контроля над ИИ: от слабого к сильному', 'desc': 'Эта статья исследует методы улучшения контроля над сверхчеловеческими моделями искусственного интеллекта. Авторы предлагают комбинированный подход, используя сильную предобученную модель для улучшения слабого человеческого надзора, а затем применяя этот улучшенный надзор для обучения сильной модели. Эксперименты показывают, что метод дебатов помогает слабой модели извлекать достоверную информацию из ненадежной сильной модели. Результаты на бенчмарках OpenAI демонстрируют, что комбинированный подход приводит к лучшему выравниванию моделей с желаемым поведением.'}, 'en': {'title': 'Enhancing AI Alignment through Model Debate and Supervision', 'desc': 'This paper addresses the challenge of aligning superhuman AI models with desired behaviors, given that human supervision may be insufficient. It proposes a novel approach that combines scalable oversight with weak-to-strong generalization to enhance model alignment. The authors explore how a strong pretrained model can improve the supervision of a weak model, and in turn, how the weak model can provide valuable feedback to the strong model. 
Their experiments demonstrate that using debate between models can help extract reliable information, leading to improved alignment and performance on NLP tasks.'}, 'zh': {'title': '利用辩论提升AI模型的监督能力', 'desc': '本文探讨了如何在未来超人类模型的监督下改善人类的监督能力。由于人类的监督能力有限,未来的AI系统可能会面临安全性问题。我们提出了一种结合可扩展监督和弱到强泛化的方法,通过强大的预训练模型来增强人类的监督。实验结果表明,辩论可以帮助弱模型从强模型中提取可靠信息,从而提高监督的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.11858', 'title': 'EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents', 'url': 'https://huggingface.co/papers/2501.11858', 'abstract': 'Multimodal Large Language Models (MLLMs) have shown significant advancements, providing a promising future for embodied agents. Existing benchmarks for evaluating MLLMs primarily utilize static images or videos, limiting assessments to non-interactive scenarios. Meanwhile, existing embodied AI benchmarks are task-specific and not diverse enough, which do not adequately evaluate the embodied capabilities of MLLMs. To address this, we propose EmbodiedEval, a comprehensive and interactive evaluation benchmark for MLLMs with embodied tasks. EmbodiedEval features 328 distinct tasks within 125 varied 3D scenes, each of which is rigorously selected and annotated. It covers a broad spectrum of existing embodied AI tasks with significantly enhanced diversity, all within a unified simulation and evaluation framework tailored for MLLMs. The tasks are organized into five categories: navigation, object interaction, social interaction, attribute question answering, and spatial question answering to assess different capabilities of the agents. We evaluated the state-of-the-art MLLMs on EmbodiedEval and found that they have a significant shortfall compared to human level on embodied tasks. Our analysis demonstrates the limitations of existing MLLMs in embodied capabilities, providing insights for their future development. We open-source all evaluation data and simulation framework at https://github.com/thunlp/EmbodiedEval.', 'score': 2, 'issue_id': 1862, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'af76793f3055f7e0', 'authors': ['Zhili Cheng', 'Yuge Tu', 'Ran Li', 'Shiqi Dai', 'Jinyi Hu', 'Shengding Hu', 'Jiahao Li', 'Yang Shi', 'Tianyu Yu', 'Weize Chen', 'Lei Shi', 'Maosong Sun'], 'affiliations': ['Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11858.jpg', 'data': {'categories': ['#3d', '#multimodal', '#benchmark', '#games', '#open_source', '#agents'], 'emoji': '🤖', 'ru': {'title': 'EmbodiedEval: Новый рубеж в оценке воплощенных возможностей MLLM', 'desc': 'Статья представляет новый комплексный бенчмарк EmbodiedEval для оценки мультимодальных больших языковых моделей (MLLM) в задачах воплощенного искусственного интеллекта. EmbodiedEval включает 328 разнообразных задач в 125 трехмерных сценах, охватывающих навигацию, взаимодействие с объектами, социальное взаимодействие и ответы на вопросы. Оценка современных MLLM на EmbodiedEval выявила значительное отставание от человеческого уровня в воплощенных задачах. Результаты демонстрируют ограничения существующих MLLM и предоставляют insights для их дальнейшего развития.'}, 'en': {'title': 'Empowering MLLMs with Interactive Evaluation for Embodied Tasks', 'desc': "This paper introduces EmbodiedEval, a new evaluation benchmark designed for Multimodal Large Language Models (MLLMs) in the context of embodied tasks. 
Unlike previous benchmarks that rely on static images or videos, EmbodiedEval offers a diverse set of 328 interactive tasks across 125 3D scenes, allowing for a more comprehensive assessment of MLLMs' capabilities. The tasks are categorized into five areas: navigation, object interaction, social interaction, attribute question answering, and spatial question answering, which helps evaluate different aspects of embodied AI. The findings reveal that current MLLMs fall short of human performance in these tasks, highlighting the need for further advancements in their embodied capabilities."}, 'zh': {'title': '全面评估多模态大型语言模型的具身能力', 'desc': '多模态大型语言模型(MLLMs)在智能体领域取得了显著进展,但现有的评估基准主要依赖静态图像或视频,限制了对交互场景的评估。为了更全面地评估MLLMs的能力,我们提出了EmbodiedEval,这是一个包含328个任务和125个多样化3D场景的互动评估基准。该基准涵盖了导航、物体交互、社交互动等五大类任务,旨在全面评估智能体的不同能力。通过对最先进的MLLMs进行评估,我们发现它们在具身任务上与人类水平存在显著差距,揭示了现有模型的局限性,为未来的发展提供了重要见解。'}}}, {'id': 'https://huggingface.co/papers/2501.10283', 'title': 'GSTAR: Gaussian Surface Tracking and Reconstruction', 'url': 'https://huggingface.co/papers/2501.10283', 'abstract': '3D Gaussian Splatting techniques have enabled efficient photo-realistic rendering of static scenes. Recent works have extended these approaches to support surface reconstruction and tracking. However, tracking dynamic surfaces with 3D Gaussians remains challenging due to complex topology changes, such as surfaces appearing, disappearing, or splitting. To address these challenges, we propose GSTAR, a novel method that achieves photo-realistic rendering, accurate surface reconstruction, and reliable 3D tracking for general dynamic scenes with changing topology. Given multi-view captures as input, GSTAR binds Gaussians to mesh faces to represent dynamic objects. For surfaces with consistent topology, GSTAR maintains the mesh topology and tracks the meshes using Gaussians. In regions where topology changes, GSTAR adaptively unbinds Gaussians from the mesh, enabling accurate registration and the generation of new surfaces based on these optimized Gaussians. Additionally, we introduce a surface-based scene flow method that provides robust initialization for tracking between frames. Experiments demonstrate that our method effectively tracks and reconstructs dynamic surfaces, enabling a range of applications. Our project page with the code release is available at https://eth-ait.github.io/GSTAR/.', 'score': 1, 'issue_id': 1847, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '2ce1394526d61cff', 'authors': ['Chengwei Zheng', 'Lixin Xue', 'Juan Zarate', 'Jie Song'], 'affiliations': ['ETH Zurich', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.10283.jpg', 'data': {'categories': ['#3d'], 'emoji': '🌊', 'ru': {'title': 'GSTAR: Гауссово сплаттинг для динамических 3D-сцен', 'desc': 'GSTAR - это новый метод, который позволяет достичь фотореалистичного рендеринга, точной реконструкции поверхности и надежного 3D-трекинга для динамических сцен с изменяющейся топологией. Метод связывает гауссианы с гранями меша для представления динамических объектов и адаптивно отвязывает их в областях с изменяющейся топологией. GSTAR также вводит метод поверхностного потока сцены для надежной инициализации трекинга между кадрами. 
Эксперименты показывают эффективность метода в отслеживании и реконструкции динамических поверхностей.'}, 'en': {'title': 'GSTAR: Dynamic Surface Tracking with 3D Gaussian Splatting', 'desc': 'The paper presents GSTAR, a new method for rendering and tracking dynamic surfaces using 3D Gaussian Splatting. It effectively handles changes in surface topology, such as when surfaces appear or disappear, by binding Gaussians to mesh faces. For consistent topologies, GSTAR maintains the mesh structure, while it adaptively unbinds Gaussians in areas with topology changes to ensure accurate surface reconstruction. The method also includes a surface-based scene flow technique for improved tracking across frames, demonstrating its effectiveness in various applications.'}, 'zh': {'title': 'GSTAR:动态场景中的高效3D跟踪与重建', 'desc': '3D高斯点技术使得静态场景的照片级真实感渲染变得高效。最近的研究扩展了这些方法,以支持表面重建和跟踪。然而,使用3D高斯点跟踪动态表面仍然面临挑战,因为表面可能出现、消失或分裂。为了解决这些问题,我们提出了GSTAR,这是一种新方法,能够在拓扑变化的动态场景中实现照片级真实感渲染、准确的表面重建和可靠的3D跟踪。'}}}, {'id': 'https://huggingface.co/papers/2501.08828', 'title': 'MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents', 'url': 'https://huggingface.co/papers/2501.08828', 'abstract': 'Multi-modal document retrieval is designed to identify and retrieve various forms of multi-modal content, such as figures, tables, charts, and layout information from extensive documents. Despite its significance, there is a notable lack of a robust benchmark to effectively evaluate the performance of systems in multi-modal document retrieval. To address this gap, this work introduces a new benchmark, named MMDocIR, encompassing two distinct tasks: page-level and layout-level retrieval. The former focuses on localizing the most relevant pages within a long document, while the latter targets the detection of specific layouts, offering a more fine-grained granularity than whole-page analysis. A layout can refer to a variety of elements such as textual paragraphs, equations, figures, tables, or charts. The MMDocIR benchmark comprises a rich dataset featuring expertly annotated labels for 1,685 questions and bootstrapped labels for 173,843 questions, making it a pivotal resource for advancing multi-modal document retrieval for both training and evaluation. Through rigorous experiments, we reveal that (i) visual retrievers significantly outperform their text counterparts, (ii) the MMDocIR train set can effectively benefit the training process of multi-modal document retrieval and (iii) text retrievers leveraging VLM-text perform much better than those using OCR-text. These findings underscore the potential advantages of integrating visual elements for multi-modal document retrieval.', 'score': 17, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'bf9a6df8fecd4ec1', 'authors': ['Kuicai Dong', 'Yujing Chang', 'Xin Deik Goh', 'Dexun Li', 'Ruiming Tang', 'Yong Liu'], 'affiliations': ['Noahs Ark Lab, Huawei'], 'pdf_title_img': 'assets/pdf/title_img/2501.08828.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'MMDocIR: Новый стандарт для мультимодального поиска документов', 'desc': 'Статья представляет новый бенчмарк MMDocIR для оценки систем мультимодального поиска документов. Бенчмарк включает две задачи: поиск на уровне страниц и на уровне макетов. Датасет содержит экспертные аннотации для 1,685 вопросов и автоматически сгенерированные метки для 173,843 вопросов. 
Эксперименты показали, что визуальные ретриверы превосходят текстовые, а использование визуально-языковых моделей дает лучшие результаты, чем OCR-текст.'}, 'en': {'title': 'Unlocking Multi-Modal Document Retrieval with MMDocIR', 'desc': 'This paper addresses the challenge of multi-modal document retrieval, which involves finding various types of content like figures and tables in large documents. It introduces a new benchmark called MMDocIR, which includes two tasks: page-level retrieval for finding relevant pages and layout-level retrieval for identifying specific layouts within those pages. The benchmark is supported by a comprehensive dataset with thousands of annotated questions, facilitating better training and evaluation of retrieval systems. The results show that visual retrieval methods outperform text-based methods, highlighting the importance of incorporating visual information in multi-modal retrieval tasks.'}, 'zh': {'title': '多模态文档检索的新基准MMDocIR', 'desc': '多模态文档检索旨在从大量文档中识别和提取各种形式的内容,如图形、表格、图表和布局信息。尽管其重要性显著,但目前缺乏有效评估多模态文档检索系统性能的基准。为了解决这一问题,本文提出了一个新的基准MMDocIR,包含页面级和布局级检索两个任务。通过严格的实验,我们发现视觉检索器的表现显著优于文本检索器,且MMDocIR训练集能有效促进多模态文档检索的训练过程。'}}}, {'id': 'https://huggingface.co/papers/2501.08365', 'title': 'Towards Best Practices for Open Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08365', 'abstract': 'Many AI companies are training their large language models (LLMs) on data without the permission of the copyright owners. The permissibility of doing so varies by jurisdiction: in countries like the EU and Japan, this is allowed under certain restrictions, while in the United States, the legal landscape is more ambiguous. Regardless of the legal status, concerns from creative producers have led to several high-profile copyright lawsuits, and the threat of litigation is commonly cited as a reason for the recent trend towards minimizing the information shared about training datasets by both corporate and public interest actors. This trend in limiting data information causes harm by hindering transparency, accountability, and innovation in the broader ecosystem by denying researchers, auditors, and impacted individuals access to the information needed to understand AI models. While this could be mitigated by training language models on open access and public domain data, at the time of writing, there are no such models (trained at a meaningful scale) due to the substantial technical and sociological challenges in assembling the necessary corpus. These challenges include incomplete and unreliable metadata, the cost and complexity of digitizing physical records, and the diverse set of legal and technical skills required to ensure relevance and responsibility in a quickly changing landscape. 
Building towards a future where AI systems can be trained on openly licensed data that is responsibly curated and governed requires collaboration across legal, technical, and policy domains, along with investments in metadata standards, digitization, and fostering a culture of openness.', 'score': 16, 'issue_id': 1702, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '90686080aa439157', 'authors': ['Stefan Baack', 'Stella Biderman', 'Kasia Odrozek', 'Aviya Skowron', 'Ayah Bdeir', 'Jillian Bommarito', 'Jennifer Ding', 'Maximilian Gahntz', 'Paul Keller', 'Pierre-Carl Langlais', 'Greg Lindahl', 'Sebastian Majstorovic', 'Nik Marda', 'Guilherme Penedo', 'Maarten Van Segbroeck', 'Jennifer Wang', 'Leandro von Werra', 'Mitchell Baker', 'Julie Belião', 'Kasia Chmielinski', 'Marzieh Fadaee', 'Lisa Gutermuth', 'Hynek Kydlíček', 'Greg Leppert', 'EM Lewis-Jong', 'Solana Larsen', 'Shayne Longpre', 'Angela Oduor Lungati', 'Cullen Miller', 'Victor Miller', 'Max Ryabinin', 'Kathleen Siminyu', 'Andrew Strait', 'Mark Surman', 'Anna Tumadóttir', 'Maurice Weber', 'Rebecca Weiss', 'Lee White', 'Thomas Wolf'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08365.jpg', 'data': {'categories': ['#open_source', '#ethics', '#data', '#dataset'], 'emoji': '📚', 'ru': {'title': 'Открытые данные для ответственного ИИ: вызовы и перспективы', 'desc': 'Статья рассматривает проблему обучения больших языковых моделей (LLM) на данных без разрешения правообладателей. Анализируются юридические аспекты этой практики в разных странах и связанные с ней судебные иски. Отмечается тенденция к ограничению информации о наборах данных для обучения, что негативно влияет на прозрачность и подотчетность в сфере ИИ. Обсуждаются вызовы создания моделей на основе открытых данных, включая технические и социологические аспекты.'}, 'en': {'title': 'Towards Transparent AI: The Need for Open Data Collaboration', 'desc': 'This paper discusses the legal and ethical challenges surrounding the training of large language models (LLMs) using copyrighted data without permission. It highlights the varying legal frameworks across different countries, particularly the ambiguity in the United States compared to more defined rules in the EU and Japan. The authors argue that the trend of limiting information about training datasets undermines transparency and innovation in AI, making it difficult for researchers and stakeholders to understand the models. They propose that a shift towards using open access and public domain data is necessary, but emphasize the need for collaboration and investment in infrastructure to overcome the technical and sociological barriers involved.'}, 'zh': {'title': '推动开放许可数据的AI训练未来', 'desc': '许多人工智能公司在没有版权拥有者许可的情况下训练大型语言模型(LLMs)。不同国家对这种做法的合法性有不同的规定,欧盟和日本在某些限制下允许,而美国的法律环境则较为模糊。这种限制数据共享的信息趋势,妨碍了透明度、问责制和创新,影响了研究人员和受影响个体获取理解AI模型所需的信息。为了实现未来能够在开放许可数据上训练AI系统,需要在法律、技术和政策领域进行合作,并投资于元数据标准和数字化。'}}}, {'id': 'https://huggingface.co/papers/2501.08983', 'title': 'CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities', 'url': 'https://huggingface.co/papers/2501.08983', 'abstract': '3D scene generation has garnered growing attention in recent years and has made significant progress. Generating 4D cities is more challenging than 3D scenes due to the presence of structurally complex, visually diverse objects like buildings and vehicles, and heightened human sensitivity to distortions in urban environments. 
To tackle these issues, we propose CityDreamer4D, a compositional generative model specifically tailored for generating unbounded 4D cities. Our main insights are 1) 4D city generation should separate dynamic objects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2) all objects in the 4D scene should be composed of different types of neural fields for buildings, vehicles, and background stuff. Specifically, we propose Traffic Scenario Generator and Unbounded Layout Generator to produce dynamic traffic scenarios and static city layouts using a highly compact BEV representation. Objects in 4D cities are generated by combining stuff-oriented and instance-oriented neural fields for background stuff, buildings, and vehicles. To suit the distinct characteristics of background stuff and instances, the neural fields employ customized generative hash grids and periodic positional embeddings as scene parameterizations. Furthermore, we offer a comprehensive suite of datasets for city generation, including OSM, GoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world city layouts, while the Google Earth and CityTopia datasets deliver large-scale, high-quality city imagery complete with 3D instance annotations. Leveraging its compositional design, CityDreamer4D supports a range of downstream applications, such as instance editing, city stylization, and urban simulation, while delivering state-of-the-art performance in generating realistic 4D cities.', 'score': 11, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '39cd0826d4232170', 'authors': ['Haozhe Xie', 'Zhaoxi Chen', 'Fangzhou Hong', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore 637335'], 'pdf_title_img': 'assets/pdf/title_img/2501.08983.jpg', 'data': {'categories': ['#3d', '#dataset'], 'emoji': '🏙️', 'ru': {'title': 'Композиционная генерация 4D-городов с разделением динамики и статики', 'desc': 'CityDreamer4D - это генеративная модель для создания неограниченных 4D-городов. Она разделяет генерацию динамических объектов (например, транспорта) и статических сцен (зданий, дорог). Модель использует разные типы нейронных полей для зданий, транспорта и фона, применяя специализированные генеративные хеш-сетки и периодические позиционные эмбеддинги. CityDreamer4D демонстрирует передовые результаты в генерации реалистичных 4D-городов и поддерживает различные приложения, включая редактирование объектов и городское моделирование.'}, 'en': {'title': 'Revolutionizing Urban Landscapes: CityDreamer4D for Dynamic City Generation', 'desc': "This paper introduces CityDreamer4D, a generative model designed for creating unbounded 4D cities, which include both static and dynamic elements. The model distinguishes between dynamic objects like vehicles and static structures such as buildings, using specialized neural fields for each type. It employs a compact bird's-eye view (BEV) representation to generate realistic traffic scenarios and city layouts. 
Additionally, the paper provides extensive datasets for training, enabling various applications like instance editing and urban simulation while achieving high-quality results in 4D city generation."}, 'zh': {'title': 'CityDreamer4D:无限4D城市生成的新突破', 'desc': '近年来,3D场景生成受到了越来越多的关注,并取得了显著进展。生成4D城市比3D场景更具挑战性,因为城市环境中存在结构复杂、视觉多样的物体,如建筑和车辆。为了解决这些问题,我们提出了CityDreamer4D,这是一种专门用于生成无限4D城市的组合生成模型。该模型通过将动态物体与静态场景分离,并使用不同类型的神经场来组合城市中的所有物体,从而实现高质量的城市生成。'}}}, {'id': 'https://huggingface.co/papers/2501.08994', 'title': 'RepVideo: Rethinking Cross-Layer Representation for Video Generation', 'url': 'https://huggingface.co/papers/2501.08994', 'abstract': 'Video generation has achieved remarkable progress with the introduction of diffusion models, which have significantly improved the quality of generated videos. However, recent research has primarily focused on scaling up model training, while offering limited insights into the direct impact of representations on the video generation process. In this paper, we initially investigate the characteristics of features in intermediate layers, finding substantial variations in attention maps across different layers. These variations lead to unstable semantic representations and contribute to cumulative differences between features, which ultimately reduce the similarity between adjacent frames and negatively affect temporal coherence. To address this, we propose RepVideo, an enhanced representation framework for text-to-video diffusion models. By accumulating features from neighboring layers to form enriched representations, this approach captures more stable semantic information. These enhanced representations are then used as inputs to the attention mechanism, thereby improving semantic expressiveness while ensuring feature consistency across adjacent frames. Extensive experiments demonstrate that our RepVideo not only significantly enhances the ability to generate accurate spatial appearances, such as capturing complex spatial relationships between multiple objects, but also improves temporal consistency in video generation.', 'score': 10, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '0d164d45ba2a5c71', 'authors': ['Chenyang Si', 'Weichen Fan', 'Zhengyao Lv', 'Ziqi Huang', 'Yu Qiao', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore, 639798', 'Shanghai Artificial Intelligence Laboratory, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08994.jpg', 'data': {'categories': ['#video', '#diffusion', '#architecture'], 'emoji': '🎬', 'ru': {'title': 'RepVideo: стабильные представления для качественной генерации видео', 'desc': 'Статья представляет RepVideo - улучшенную систему представлений для диффузионных моделей генерации видео на основе текста. Авторы обнаружили, что вариации в картах внимания между слоями приводят к нестабильным семантическим представлениям и снижают согласованность соседних кадров. RepVideo решает эту проблему путем накопления признаков из соседних слоев для создания обогащенных представлений. Эксперименты показывают, что RepVideo значительно улучшает способность генерировать точные пространственные образы и повышает временную согласованность при генерации видео.'}, 'en': {'title': 'Enhancing Video Generation with Stable Representations', 'desc': "This paper presents RepVideo, a new framework designed to improve video generation using text-to-video diffusion models. 
It identifies issues with unstable semantic representations caused by variations in attention maps across different layers of the model. By accumulating features from neighboring layers, RepVideo creates more stable and enriched representations that enhance the model's ability to maintain consistency between adjacent frames. The results show that RepVideo significantly improves both the spatial accuracy of generated videos and their temporal coherence, leading to more realistic video outputs."}, 'zh': {'title': '提升视频生成质量的RepVideo框架', 'desc': '本论文探讨了扩散模型在视频生成中的应用,提出了RepVideo框架以改善视频生成的质量。研究发现中间层特征的注意力图存在显著差异,这导致语义表示的不稳定性,进而影响相邻帧之间的相似性和时间一致性。RepVideo通过从相邻层累积特征,形成更丰富的表示,从而捕捉更稳定的语义信息。实验结果表明,RepVideo显著提高了生成视频的空间表现能力和时间一致性。'}}}, {'id': 'https://huggingface.co/papers/2501.07783', 'title': 'Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal Understanding', 'url': 'https://huggingface.co/papers/2501.07783', 'abstract': 'Image pyramids are widely adopted in top-performing methods to obtain multi-scale features for precise visual perception and understanding. However, current image pyramids use the same large-scale model to process multiple resolutions of images, leading to significant computational cost. To address this challenge, we propose a novel network architecture, called Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses pretrained models (ViTs or CNNs) as branches to process multi-scale images, where images of higher resolutions are processed by smaller network branches to balance computational cost and performance. To integrate information from different spatial scales, we further propose a novel cross-branch feature interaction mechanism. To validate PIIP, we apply it to various perception models and a representative multimodal large language model called LLaVA, and conduct extensive experiments on various tasks such as object detection, segmentation, image classification and multimodal understanding. PIIP achieves superior performance compared to single-branch and existing multi-resolution approaches with lower computational cost. When applied to InternViT-6B, a large-scale vision foundation model, PIIP can improve its performance by 1%-2% on detection and segmentation with only 40%-60% of the original computation, finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and 74.5% on MMBench with only 2.8M training data. Our code is released at https://github.com/OpenGVLab/PIIP.', 'score': 5, 'issue_id': 1701, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '87295e912b5b0670', 'authors': ['Zhaokai Wang', 'Xizhou Zhu', 'Xue Yang', 'Gen Luo', 'Hao Li', 'Changyao Tian', 'Wenhan Dou', 'Junqi Ge', 'Lewei Lu', 'Yu Qiao', 'Jifeng Dai'], 'affiliations': ['Sensetime', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07783.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Эффективные многомасштабные сети для точного визуального восприятия', 'desc': 'Статья представляет новую архитектуру нейронных сетей под названием Parameter-Inverted Image Pyramid Networks (PIIP). 
PIIP использует предобученные модели (ViT или CNN) в качестве ветвей для обработки многомасштабных изображений, где изображения с более высоким разрешением обрабатываются меньшими сетевыми ветвями для баланса вычислительных затрат и производительности. Авторы также предлагают новый механизм взаимодействия признаков между ветвями. PIIP демонстрирует превосходную производительность по сравнению с одноветвенными и существующими многоразрешающими подходами при меньших вычислительных затратах в задачах обнаружения объектов, сегментации, классификации изображений и мультимодального понимания.'}, 'en': {'title': 'Efficient Multi-Scale Processing with PIIP Networks', 'desc': 'This paper introduces Parameter-Inverted Image Pyramid Networks (PIIP), a new architecture designed to efficiently process multi-scale images for visual tasks. Unlike traditional methods that use a single large model for all resolutions, PIIP employs smaller branches for higher resolution images, reducing computational costs while maintaining performance. The architecture also features a unique cross-branch interaction mechanism to enhance feature integration across different scales. Experimental results demonstrate that PIIP outperforms existing methods in various tasks, achieving significant accuracy improvements with lower resource usage.'}, 'zh': {'title': '高效多尺度图像处理的新方法', 'desc': '本文提出了一种新的网络架构,称为参数反转图像金字塔网络(PIIP),旨在提高多尺度图像处理的效率。PIIP利用预训练模型作为分支,处理不同分辨率的图像,从而在性能和计算成本之间取得平衡。通过引入跨分支特征交互机制,PIIP能够有效整合来自不同空间尺度的信息。实验结果表明,PIIP在目标检测、分割和多模态理解等任务上表现优于现有方法,同时显著降低了计算成本。'}}}, {'id': 'https://huggingface.co/papers/2501.09012', 'title': 'Multimodal LLMs Can Reason about Aesthetics in Zero-Shot', 'url': 'https://huggingface.co/papers/2501.09012', 'abstract': "We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability shall be elicited to evaluate the aesthetics of artworks. To facilitate this investigation, we construct MM-StyleBench, a novel high-quality dataset for benchmarking artistic stylization. We then develop a principled method for human preference modeling and perform a systematic correlation analysis between MLLMs' responses and human preference. Our experiments reveal an inherent hallucination issue of MLLMs in art evaluation, associated with response subjectivity. ArtCoT is proposed, demonstrating that art-specific task decomposition and the use of concrete language boost MLLMs' reasoning ability for aesthetics. Our findings offer valuable insights into MLLMs for art and can benefit a wide range of downstream applications, such as style transfer and artistic image generation. Code available at https://github.com/songrise/MLLM4Art.", 'score': 5, 'issue_id': 1699, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'e516a920b6534cc0', 'authors': ['Ruixiang Jiang', 'Changwen Chen'], 'affiliations': ['The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09012.jpg', 'data': {'categories': ['#artificial intelligence', '#reasoning', '#hallucinations', '#multimodal', '#benchmark', '#dataset'], 'emoji': '🎨', 'ru': {'title': 'Искусственный интеллект учится оценивать искусство', 'desc': 'Исследование посвящено использованию мультимодальных языковых моделей (MLLM) для оценки эстетики произведений искусства. Авторы создали набор данных MM-StyleBench для тестирования художественной стилизации и разработали метод моделирования человеческих предпочтений. 
Эксперименты выявили проблему галлюцинаций MLLM при оценке искусства, связанную с субъективностью ответов. Предложенный метод ArtCoT улучшает способность MLLM к рассуждениям об эстетике путем декомпозиции задач и использования конкретного языка.'}, 'en': {'title': 'Enhancing MLLMs for Art Evaluation through Structured Reasoning', 'desc': "This paper investigates how Multimodal Large Language Models (MLLMs) can assess the aesthetics of artworks. The authors introduce MM-StyleBench, a new dataset designed to benchmark artistic stylization. They also create a method for modeling human preferences and analyze the correlation between MLLMs' evaluations and human judgments. The study highlights a hallucination problem in MLLMs when evaluating art and proposes ArtCoT, which improves reasoning by using task decomposition and specific language, providing insights for applications like style transfer and artistic image generation."}, 'zh': {'title': '提升多模态大语言模型的艺术推理能力', 'desc': '本研究首次探讨了多模态大语言模型(MLLMs)在评估艺术作品美学时的推理能力。我们构建了一个新的高质量数据集MM-StyleBench,用于艺术风格化的基准测试。通过系统的相关性分析,我们发现MLLMs在艺术评估中存在固有的幻觉问题,且与人类偏好存在主观性关联。我们提出了ArtCoT方法,表明艺术特定任务分解和使用具体语言可以提升MLLMs的美学推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09019', 'title': 'Ouroboros-Diffusion: Exploring Consistent Content Generation in Tuning-free Long Video Diffusion', 'url': 'https://huggingface.co/papers/2501.09019', 'abstract': "The first-in-first-out (FIFO) video diffusion, built on a pre-trained text-to-video model, has recently emerged as an effective approach for tuning-free long video generation. This technique maintains a queue of video frames with progressively increasing noise, continuously producing clean frames at the queue's head while Gaussian noise is enqueued at the tail. However, FIFO-Diffusion often struggles to keep long-range temporal consistency in the generated videos due to the lack of correspondence modeling across frames. In this paper, we propose Ouroboros-Diffusion, a novel video denoising framework designed to enhance structural and content (subject) consistency, enabling the generation of consistent videos of arbitrary length. Specifically, we introduce a new latent sampling technique at the queue tail to improve structural consistency, ensuring perceptually smooth transitions among frames. To enhance subject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA) mechanism, which aligns subjects across frames within short segments to achieve better visual coherence. Furthermore, we introduce self-recurrent guidance. This technique leverages information from all previous cleaner frames at the front of the queue to guide the denoising of noisier frames at the end, fostering rich and contextual global information interaction. 
Extensive experiments of long video generation on the VBench benchmark demonstrate the superiority of our Ouroboros-Diffusion, particularly in terms of subject consistency, motion smoothness, and temporal consistency.", 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'c4c991699f684865', 'authors': ['Jingyuan Chen', 'Fuchen Long', 'Jie An', 'Zhaofan Qiu', 'Ting Yao', 'Jiebo Luo', 'Tao Mei'], 'affiliations': ['HiDream.ai Inc.', 'University of Rochester, Rochester, NY USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.09019.jpg', 'data': {'categories': ['#benchmark', '#video', '#long_context', '#diffusion'], 'emoji': '🐍', 'ru': {'title': 'Бесконечное видео: Ouroboros-Diffusion для непрерывной генерации согласованного контента', 'desc': 'Эта статья представляет новый метод генерации видео произвольной длины под названием Ouroboros-Diffusion. Метод улучшает структурную и сюжетную согласованность видео с помощью нового подхода к выборке латентного пространства и механизма Subject-Aware Cross-Frame Attention. Авторы также вводят самоповторяющееся руководство, использующее информацию из предыдущих очищенных кадров для улучшения шумных кадров. Эксперименты на бенчмарке VBench показывают превосходство Ouroboros-Diffusion в сохранении согласованности субъектов, плавности движения и временной согласованности.'}, 'en': {'title': 'Ouroboros-Diffusion: Enhancing Long Video Consistency and Coherence', 'desc': 'The paper introduces Ouroboros-Diffusion, a new framework for improving long video generation using a pre-trained text-to-video model. It addresses the limitations of FIFO-Diffusion, particularly in maintaining long-range temporal consistency across video frames. The proposed method enhances structural consistency through a novel latent sampling technique and improves subject consistency with a Subject-Aware Cross-Frame Attention mechanism. Additionally, self-recurrent guidance is implemented to utilize information from previous frames, resulting in videos with better visual coherence and smoother transitions.'}, 'zh': {'title': 'Ouroboros-Diffusion:提升视频生成一致性的创新框架', 'desc': 'FIFO视频扩散是一种基于预训练文本到视频模型的长视频生成方法,但在生成视频时常常缺乏长时间的一致性。本文提出了Ouroboros-Diffusion框架,通过引入新的潜在采样技术和主题感知跨帧注意机制,增强了视频的结构和内容一致性。该方法确保了帧之间的平滑过渡,并通过自递归引导技术利用前面清晰帧的信息来改善后面噪声帧的去噪效果。实验结果表明,Ouroboros-Diffusion在主题一致性、运动平滑性和时间一致性方面优于现有方法。'}}}, {'id': 'https://huggingface.co/papers/2501.08809', 'title': 'XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework', 'url': 'https://huggingface.co/papers/2501.08809', 'abstract': 'In recent years, remarkable advancements in artificial intelligence-generated content (AIGC) have been achieved in the fields of image synthesis and text generation, generating content comparable to that produced by humans. However, the quality of AI-generated music has not yet reached this standard, primarily due to the challenge of effectively controlling musical emotions and ensuring high-quality outputs. This paper presents a generalized symbolic music generation framework, XMusic, which supports flexible prompts (i.e., images, videos, texts, tags, and humming) to generate emotionally controllable and high-quality symbolic music. XMusic consists of two core components, XProjector and XComposer. XProjector parses the prompts of various modalities into symbolic music elements (i.e., emotions, genres, rhythms and notes) within the projection space to generate matching music. 
XComposer contains a Generator and a Selector. The Generator generates emotionally controllable and melodious music based on our innovative symbolic music representation, whereas the Selector identifies high-quality symbolic music by constructing a multi-task learning scheme involving quality assessment, emotion recognition, and genre recognition tasks. In addition, we build XMIDI, a large-scale symbolic music dataset that contains 108,023 MIDI files annotated with precise emotion and genre labels. Objective and subjective evaluations show that XMusic significantly outperforms the current state-of-the-art methods with impressive music quality. Our XMusic has been awarded as one of the nine Highlights of Collectibles at WAIC 2023. The project homepage of XMusic is https://xmusic-project.github.io.', 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'd4d018c9adb2579c', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#audio', '#story_generation', '#multimodal', '#dataset'], 'emoji': '🎵', 'ru': {'title': 'XMusic: ИИ-композитор нового поколения с управляемыми эмоциями', 'desc': 'Статья представляет XMusic - генерализованный фреймворк для генерации символической музыки, поддерживающий различные типы промптов. XMusic состоит из двух ключевых компонентов: XProjector для обработки промптов и XComposer для генерации музыки. Авторы также создали датасет XMIDI, содержащий более 100 тысяч MIDI-файлов с аннотациями эмоций и жанров. Согласно оценкам, XMusic значительно превосходит современные методы по качеству генерируемой музыки.'}, 'en': {'title': 'XMusic: Emotionally Controlled Music Generation Made Easy!', 'desc': 'This paper introduces XMusic, a new framework for generating symbolic music that can be controlled by emotional prompts. It includes two main components: XProjector, which converts various input types into musical elements, and XComposer, which generates and selects high-quality music. The framework uses a multi-task learning approach to ensure the generated music meets quality, emotional, and genre standards. Additionally, the authors created a large dataset, XMIDI, to support their research and demonstrate that XMusic outperforms existing methods in music generation.'}, 'zh': {'title': 'XMusic:情感可控的高质量音乐生成', 'desc': '近年来,人工智能生成内容(AIGC)在图像合成和文本生成领域取得了显著进展,但在音乐生成方面仍面临挑战。本文提出了一种通用的符号音乐生成框架XMusic,能够通过灵活的提示生成可控情感和高质量的符号音乐。XMusic由两个核心组件组成:XProjector和XComposer,前者将多种模态的提示解析为音乐元素,后者则生成和选择高质量的音乐。通过构建大规模的XMIDI数据集和多任务学习方案,XMusic在音乐质量上显著优于现有方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.08970', 'title': 'Trusted Machine Learning Models Unlock Private Inference for Problems Currently Infeasible with Cryptography', 'url': 'https://huggingface.co/papers/2501.08970', 'abstract': 'We often interact with untrusted parties. Prioritization of privacy can limit the effectiveness of these interactions, as achieving certain goals necessitates sharing private data. Traditionally, addressing this challenge has involved either seeking trusted intermediaries or constructing cryptographic protocols that restrict how much data is revealed, such as multi-party computations or zero-knowledge proofs. While significant advances have been made in scaling cryptographic approaches, they remain limited in terms of the size and complexity of applications they can be used for. 
In this paper, we argue that capable machine learning models can fulfill the role of a trusted third party, thus enabling secure computations for applications that were previously infeasible. In particular, we describe Trusted Capable Model Environments (TCMEs) as an alternative approach for scaling secure computation, where capable machine learning model(s) interact under input/output constraints, with explicit information flow control and explicit statelessness. This approach aims to achieve a balance between privacy and computational efficiency, enabling private inference where classical cryptographic solutions are currently infeasible. We describe a number of use cases that are enabled by TCME, and show that even some simple classic cryptographic problems can already be solved with TCME. Finally, we outline current limitations and discuss the path forward in implementing them.', 'score': 3, 'issue_id': 1702, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '858fc03ac78b66c1', 'authors': ['Ilia Shumailov', 'Daniel Ramage', 'Sarah Meiklejohn', 'Peter Kairouz', 'Florian Hartmann', 'Borja Balle', 'Eugene Bagdasarian'], 'affiliations': ['Google', 'Google DeepMind', 'Google Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.08970.jpg', 'data': {'categories': ['#data', '#ethics', '#architecture', '#security', '#inference'], 'emoji': '🔐', 'ru': {'title': 'Машинное обучение как доверенный посредник для безопасных вычислений', 'desc': 'Статья представляет новый подход к безопасным вычислениям с использованием машинного обучения - Trusted Capable Model Environments (TCME). TCME предлагается как альтернатива традиционным криптографическим методам для обеспечения конфиденциальности при взаимодействии с ненадежными сторонами. Авторы утверждают, что мощные модели машинного обучения могут выполнять роль доверенной третьей стороны, позволяя проводить безопасные вычисления для приложений, которые ранее были невозможны. В статье описываются возможные применения TCME и обсуждаются текущие ограничения и перспективы развития этого подхода.'}, 'en': {'title': 'Empowering Privacy with Trusted Machine Learning Models', 'desc': 'This paper introduces Trusted Capable Model Environments (TCMEs) as a novel solution for secure computations involving untrusted parties. It suggests that advanced machine learning models can act as trusted intermediaries, allowing for private data sharing while maintaining privacy. The authors highlight how TCMEs can efficiently manage input/output constraints and control information flow, making them suitable for applications where traditional cryptographic methods fall short. They also present various use cases and acknowledge the limitations of their approach, paving the way for future developments in secure machine learning applications.'}, 'zh': {'title': '利用机器学习实现安全计算的新方法', 'desc': '本文探讨了在与不可信方互动时如何平衡隐私和计算效率。我们提出了可信能力模型环境(TCME),作为一种新的安全计算方法,利用机器学习模型充当可信第三方。TCME在输入/输出约束下进行交互,并通过显式的信息流控制和无状态性来保护隐私。我们展示了TCME在解决一些经典密码学问题上的潜力,并讨论了未来的实施路径。'}}}, {'id': 'https://huggingface.co/papers/2501.04693', 'title': 'Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous Sensors via Language Grounding', 'url': 'https://huggingface.co/papers/2501.04693', 'abstract': 'Interacting with the world is a multi-sensory experience: achieving effective general-purpose interaction requires making use of all available modalities -- including vision, touch, and audio -- to fill in gaps from partial observation. 
For example, when vision is occluded while reaching into a bag, a robot should rely on its senses of touch and sound. However, state-of-the-art generalist robot policies are typically trained on large datasets to predict robot actions solely from visual and proprioceptive observations. In this work, we propose FuSe, a novel approach that enables finetuning visuomotor generalist policies on heterogeneous sensor modalities for which large datasets are not readily available, by leveraging natural language as a common cross-modal grounding. We combine a multimodal contrastive loss with a sensory-grounded language generation loss to encode high-level semantics. In the context of robot manipulation, we show that FuSe enables performing challenging tasks that require reasoning jointly over modalities such as vision, touch, and sound in a zero-shot setting, such as multimodal prompting, compositional cross-modal prompting, and descriptions of objects it interacts with. We show that the same recipe is applicable to widely different generalist policies, including both diffusion-based generalist policies and large vision-language-action (VLA) models. Extensive experiments in the real world show that FuSe is able to increase success rates by over 20% compared to all considered baselines.', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1612a7343aff595b', 'authors': ['Joshua Jones', 'Oier Mees', 'Carmelo Sferrazza', 'Kyle Stachowicz', 'Pieter Abbeel', 'Sergey Levine'], 'affiliations': ['Berkeley AI Research (BAIR), UC Berkeley, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04693.jpg', 'data': {'categories': ['#transfer_learning', '#multimodal', '#robotics', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Мультисенсорный ИИ: объединение зрения, осязания и звука для улучшения взаимодействия роботов с миром', 'desc': 'Статья представляет FuSe - новый подход к обучению роботов, использующий мультимодальные сенсорные данные. FuSe использует естественный язык как общую основу для объединения различных модальностей, таких как зрение, осязание и звук. Метод сочетает мультимодальную контрастивную функцию потерь с генерацией языка на основе сенсорных данных для кодирования высокоуровневой семантики. Эксперименты показывают, что FuSe позволяет роботам выполнять сложные задачи, требующие рассуждений на основе нескольких модальностей, повышая успешность на 20% по сравнению с базовыми методами.'}, 'en': {'title': 'FuSe: Bridging Sensory Gaps for Smarter Robot Interaction', 'desc': 'This paper introduces FuSe, a method that enhances robot interaction by integrating multiple sensory modalities like vision, touch, and sound. Traditional robot policies often rely solely on visual data, but FuSe allows for fine-tuning these policies using natural language to bridge gaps in sensory information. By employing a multimodal contrastive loss and a sensory-grounded language generation loss, FuSe effectively encodes high-level semantics for better decision-making. 
The results demonstrate that FuSe significantly improves the success rates of robots in complex tasks, showcasing its versatility across different generalist policies.'}, 'zh': {'title': '多模态交互,提升机器人智能', 'desc': '本论文提出了一种名为FuSe的新方法,旨在通过多模态传感器数据来微调通用机器人策略。FuSe利用自然语言作为跨模态的共同基础,结合多模态对比损失和感知基础的语言生成损失,以编码高层语义。通过这种方法,机器人能够在视觉、触觉和听觉等多种感官信息的共同推理下,完成复杂的操作任务。实验结果表明,FuSe在实际应用中成功率提高了超过20%。'}}}, {'id': 'https://huggingface.co/papers/2412.19412', 'title': 'MINIMA: Modality Invariant Image Matching', 'url': 'https://huggingface.co/papers/2412.19412', 'abstract': 'Image matching for both cross-view and cross-modality plays a critical role in multimodal perception. In practice, the modality gap caused by different imaging systems/styles poses great challenges to the matching task. Existing works try to extract invariant features for specific modalities and train on limited datasets, showing poor generalization. In this paper, we present MINIMA, a unified image matching framework for multiple cross-modal cases. Without pursuing fancy modules, our MINIMA aims to enhance universal performance from the perspective of data scaling up. For such purpose, we propose a simple yet effective data engine that can freely produce a large dataset containing multiple modalities, rich scenarios, and accurate matching labels. Specifically, we scale up the modalities from cheap but rich RGB-only matching data, by means of generative models. Under this setting, the matching labels and rich diversity of the RGB dataset are well inherited by the generated multimodal data. Benefiting from this, we construct MD-syn, a new comprehensive dataset that fills the data gap for general multimodal image matching. With MD-syn, we can directly train any advanced matching pipeline on randomly selected modality pairs to obtain cross-modal ability. Extensive experiments on in-domain and zero-shot matching tasks, including 19 cross-modal cases, demonstrate that our MINIMA can significantly outperform the baselines and even surpass modality-specific methods. The dataset and code are available at https://github.com/LSXI7/MINIMA .', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'fa772dead5453f7b', 'authors': ['Xingyu Jiang', 'Jiangwei Ren', 'Zizhuo Li', 'Xin Zhou', 'Dingkang Liang', 'Xiang Bai'], 'affiliations': ['Huazhong University of Science and Technology', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2412.19412.jpg', 'data': {'categories': ['#dataset', '#data', '#multimodal', '#open_source', '#synthetic'], 'emoji': '🔀', 'ru': {'title': 'Универсальное сопоставление изображений через масштабирование данных', 'desc': 'Статья представляет MINIMA - универсальную систему сопоставления изображений для различных кросс-модальных случаев. Авторы предлагают эффективный механизм генерации большого набора данных с несколькими модальностями, разнообразными сценариями и точными метками сопоставления. Используя этот подход, они создают новый комплексный датасет MD-syn для обучения нейросетей кросс-модальному сопоставлению изображений. Эксперименты показывают, что MINIMA значительно превосходит базовые модели и даже специализированные методы для конкретных модальностей в 19 кросс-модальных задачах.'}, 'en': {'title': 'MINIMA: Bridging the Gap in Cross-Modal Image Matching', 'desc': 'This paper introduces MINIMA, a framework designed for image matching across different views and modalities, addressing the challenges posed by varying imaging systems. 
The authors highlight the limitations of existing methods that rely on invariant features and small datasets, which often lead to poor performance. MINIMA enhances image matching by scaling up data through a generative model that creates a large, diverse dataset with accurate matching labels. The new dataset, MD-syn, allows for effective training of matching algorithms, resulting in improved performance in both in-domain and zero-shot scenarios compared to traditional methods.'}, 'zh': {'title': 'MINIMA:跨模态图像匹配的新突破', 'desc': '本文提出了一种名为MINIMA的统一图像匹配框架,旨在解决跨视角和跨模态的图像匹配问题。现有方法在特定模态上提取不变特征,但在有限数据集上训练,导致泛化能力差。MINIMA通过一个简单有效的数据引擎,生成包含多种模态和丰富场景的大型数据集,从而提升通用性能。通过构建MD-syn数据集,MINIMA能够在随机选择的模态对上直接训练,显著提高跨模态匹配能力。'}}}]; + const articlesData = [{'id': 'https://huggingface.co/papers/2412.18525', 'title': 'Explanatory Instructions: Towards Unified Vision Tasks Understanding and Zero-shot Generalization', 'url': 'https://huggingface.co/papers/2412.18525', 'abstract': "Computer Vision (CV) has yet to fully achieve the zero-shot task generalization observed in Natural Language Processing (NLP), despite following many of the milestones established in NLP, such as large transformer models, extensive pre-training, and the auto-regression paradigm, among others. In this paper, we explore the idea that CV adopts discrete and terminological task definitions (\\eg, ``image segmentation''), which may be a key barrier to zero-shot task generalization. Our hypothesis is that without truly understanding previously-seen tasks--due to these terminological definitions--deep models struggle to generalize to novel tasks. To verify this, we introduce Explanatory Instructions, which provide an intuitive way to define CV task objectives through detailed linguistic transformations from input images to outputs. We create a large-scale dataset comprising 12 million ``image input to explanatory instruction to output'' triplets, and train an auto-regressive-based vision-language model (AR-based VLM) that takes both images and explanatory instructions as input. By learning to follow these instructions, the AR-based VLM achieves instruction-level zero-shot capabilities for previously-seen tasks and demonstrates strong zero-shot generalization for unseen CV tasks. Code and dataset will be openly available on our GitHub repository.", 'score': 48, 'issue_id': 1406, 'pub_date': '2024-12-24', 'pub_date_card': {'ru': '24 декабря', 'en': 'December 24', 'zh': '12月24日'}, 'hash': '23f11aceae00534d', 'authors': ['Yang Shen', 'Xiu-Shen Wei', 'Yifan Sun', 'Yuxin Song', 'Tao Yuan', 'Jian Jin', 'Heyang Xu', 'Yazhou Yao', 'Errui Ding'], 'affiliations': ['Baidu', 'Nanjing University of Science and Technology', 'Southeast University'], 'pdf_title_img': 'assets/pdf/title_img/2412.18525.jpg', 'data': {'categories': ['#dataset', '#open_source', '#cv', '#multimodal', '#transfer_learning'], 'emoji': '🔬', 'ru': {'title': 'Лингвистические инструкции - ключ к обобщению в компьютерном зрении', 'desc': "В статье исследуется проблема недостаточной способности моделей компьютерного зрения к обобщению на новые задачи без предварительного обучения. Авторы предлагают использовать подробные лингвистические инструкции для определения задач вместо дискретных терминологических определений. Они создали большой датасет из 12 миллионов примеров 'изображение-инструкция-результат' и обучили авторегрессионную мультимодальную модель следовать этим инструкциям. 
Эксперименты показали, что такой подход позволяет модели лучше обобщаться на новые задачи компьютерного зрения без дополнительного обучения."}, 'en': {'title': 'Unlocking Zero-Shot Generalization in Computer Vision with Explanatory Instructions', 'desc': "This paper addresses the challenge of zero-shot task generalization in Computer Vision (CV), which has not reached the levels seen in Natural Language Processing (NLP). The authors argue that the use of specific terminological definitions for tasks in CV, like 'image segmentation', limits the models' ability to generalize to new tasks. To overcome this, they propose 'Explanatory Instructions' that transform image inputs into detailed linguistic outputs, helping models understand tasks better. They introduce a large dataset of 12 million triplets and train an auto-regressive vision-language model that successfully demonstrates zero-shot capabilities for both seen and unseen tasks."}, 'zh': {'title': '突破计算机视觉的零样本任务泛化', 'desc': '本文探讨了计算机视觉(CV)在零样本任务泛化方面的挑战,尤其是与自然语言处理(NLP)的对比。我们认为,CV使用的术语性任务定义(如"图像分割")可能是阻碍零样本任务泛化的关键因素。为了解决这个问题,我们引入了"解释性指令",通过详细的语言转换来直观地定义CV任务目标。我们创建了一个包含1200万对"图像输入、解释性指令和输出"的大规模数据集,并训练了一个基于自回归的视觉语言模型,实现了对已见任务的指令级零样本能力,并在未见的CV任务上展示了强大的零样本泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2412.20070', 'title': 'On the Compositional Generalization of Multimodal LLMs for Medical Imaging', 'url': 'https://huggingface.co/papers/2412.20070', 'abstract': 'Multimodal large language models (MLLMs) hold significant potential in the medical field, but their capabilities are often limited by insufficient data in certain medical domains, highlighting the need for understanding what kinds of images can be used by MLLMs for generalization. Current research suggests that multi-task training outperforms single-task training, as different tasks can benefit each other, but these studies often overlook the internal relationships within these tasks, providing limited guidance on selecting datasets to enhance specific tasks. To analyze this phenomenon, we attempted to employ compositional generalization (CG), the ability of models to understand novel combinations by recombining learned elements, as a guiding framework. Since medical images can be precisely defined by Modality, Anatomical area, and Task, they naturally provide an environment for exploring CG. Therefore, we assembled 106 medical datasets to create Med-MAT for comprehensive experiments. The experiments confirmed that MLLMs can use CG to understand unseen medical images and identified CG as one of the main drivers of the generalization observed in multi-task training. Additionally, further studies demonstrated that CG effectively supports datasets with limited data and delivers consistent performance across different backbones, highlighting its versatility and broad applicability. 
Med-MAT is publicly available at https://github.com/FreedomIntelligence/Med-MAT.', 'score': 36, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': '34f9c6ec4611d6ec', 'authors': ['Zhenyang Cai', 'Junying Chen', 'Rongsheng Wang', 'Weihong Wang', 'Yonglin Deng', 'Dingjie Song', 'Yize Chen', 'Zixu Zhang', 'Benyou Wang'], 'affiliations': ['The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2412.20070.jpg', 'data': {'categories': ['#dataset', '#healthcare', '#open_source', '#multimodal', '#transfer_learning'], 'emoji': '🩺', 'ru': {'title': 'Композиционная генерализация - ключ к пониманию медицинских изображений для MLLM', 'desc': 'Статья исследует возможности мультимодальных больших языковых моделей (MLLM) в медицинской сфере, фокусируясь на композиционной генерализации (CG). Авторы создали набор данных Med-MAT из 106 медицинских датасетов для изучения способности моделей понимать новые комбинации изображений. Эксперименты показали, что MLLM могут использовать CG для интерпретации ранее невиданных медицинских изображений. Исследование также выявило эффективность CG для датасетов с ограниченными данными и стабильность результатов на разных архитектурах моделей.'}, 'en': {'title': 'Unlocking Medical Insights with Compositional Generalization', 'desc': "This paper explores the use of multimodal large language models (MLLMs) in the medical field, focusing on how they can generalize from limited data. It highlights the advantages of multi-task training over single-task training, emphasizing the importance of understanding the relationships between different tasks. The authors introduce compositional generalization (CG) as a framework to enhance the model's ability to interpret new combinations of medical images. They created a dataset called Med-MAT, which consists of 106 medical datasets, and found that CG significantly improves the performance of MLLMs, especially in scenarios with scarce data."}, 'zh': {'title': '组合泛化助力医学图像理解', 'desc': '多模态大型语言模型(MLLMs)在医学领域具有重要潜力,但在某些医学领域的数据不足限制了其能力。当前研究表明,多任务训练优于单任务训练,因为不同任务可以相互促进,但往往忽视了这些任务之间的内部关系。我们采用组合泛化(CG)作为指导框架,分析模型如何理解新组合的能力,并组建了106个医学数据集以创建Med-MAT进行全面实验。实验结果确认,MLLMs能够利用CG理解未见过的医学图像,并且CG是多任务训练中观察到的泛化的主要驱动因素之一。'}}}, {'id': 'https://huggingface.co/papers/2412.20422', 'title': 'Bringing Objects to Life: 4D generation from 3D objects', 'url': 'https://huggingface.co/papers/2412.20422', 'abstract': 'Recent advancements in generative modeling now enable the creation of 4D content (moving 3D objects) controlled with text prompts. 4D generation has large potential in applications like virtual worlds, media, and gaming, but existing methods provide limited control over the appearance and geometry of generated content. In this work, we introduce a method for animating user-provided 3D objects by conditioning on textual prompts to guide 4D generation, enabling custom animations while maintaining the identity of the original object. We first convert a 3D mesh into a ``static" 4D Neural Radiance Field (NeRF) that preserves the visual attributes of the input object. Then, we animate the object using an Image-to-Video diffusion model driven by text. To improve motion realism, we introduce an incremental viewpoint selection protocol for sampling perspectives to promote lifelike movement and a masked Score Distillation Sampling (SDS) loss, which leverages attention maps to focus optimization on relevant regions. 
We evaluate our model in terms of temporal coherence, prompt adherence, and visual fidelity and find that our method outperforms baselines that are based on other approaches, achieving up to threefold improvements in identity preservation measured using LPIPS scores, and effectively balancing visual quality with dynamic content.', 'score': 29, 'issue_id': 1408, 'pub_date': '2024-12-29', 'pub_date_card': {'ru': '29 декабря', 'en': 'December 29', 'zh': '12月29日'}, 'hash': 'de742e56a5ec379f', 'authors': ['Ohad Rahamim', 'Ori Malca', 'Dvir Samuel', 'Gal Chechik'], 'affiliations': ['Bar-Ilan University', 'NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2412.20422.jpg', 'data': {'categories': ['#optimization', '#multimodal', '#games', '#diffusion', '#video', '#3d'], 'emoji': '🎭', 'ru': {'title': 'Оживление 3D-объектов с помощью текста: новый рубеж в генеративном моделировании', 'desc': 'Статья представляет новый метод анимации 3D-объектов с помощью текстовых подсказок. Авторы используют генеративную модель для создания 4D-контента (движущихся 3D-объектов), сохраняя при этом исходный вид объекта. Метод включает преобразование 3D-меша в статическое 4D нейронное радиальное поле (NeRF) и последующую анимацию с помощью диффузионной модели Image-to-Video. Для улучшения реалистичности движения введены протокол выбора ракурсов и маскированная функция потерь Score Distillation Sampling.'}, 'en': {'title': 'Animating 3D Objects with Text Prompts for Realistic 4D Generation', 'desc': "This paper presents a novel approach to generating 4D content by animating 3D objects based on text prompts. The method involves converting a 3D mesh into a static 4D Neural Radiance Field (NeRF) to retain the object's visual characteristics. It then utilizes an Image-to-Video diffusion model to create animations while ensuring the original object's identity is preserved. The authors enhance motion realism through a viewpoint selection protocol and a masked Score Distillation Sampling loss, leading to significant improvements in visual quality and dynamic content generation."}, 'zh': {'title': '文本驱动的4D动画生成新方法', 'desc': '本研究提出了一种新方法,可以通过文本提示来控制4D内容的生成,特别是动画用户提供的3D对象。我们首先将3D网格转换为静态的4D神经辐射场(NeRF),以保留输入对象的视觉特征。然后,利用图像到视频的扩散模型进行动画制作,确保生成的动画与文本提示相符。通过引入增量视角选择协议和掩码评分蒸馏损失,我们提高了运动的真实感,并在多个评估指标上超越了现有方法。'}}}, {'id': 'https://huggingface.co/papers/2412.20993', 'title': 'Efficiently Serving LLM Reasoning Programs with Certaindex', 'url': 'https://huggingface.co/papers/2412.20993', 'abstract': 'The rapid evolution of large language models (LLMs) has unlocked their capabilities in advanced reasoning tasks like mathematical problem-solving, code generation, and legal analysis. Central to this progress are inference-time reasoning algorithms, which refine outputs by exploring multiple solution paths, at the cost of increasing compute demands and response latencies. Existing serving systems fail to adapt to the scaling behaviors of these algorithms or the varying difficulty of queries, leading to inefficient resource use and unmet latency targets. We present Dynasor, a system that optimizes inference-time compute for LLM reasoning queries. Unlike traditional engines, Dynasor tracks and schedules requests within reasoning queries and uses Certaindex, a proxy that measures statistical reasoning progress based on model certainty, to guide compute allocation dynamically. 
Dynasor co-adapts scheduling with reasoning progress: it allocates more compute to hard queries, reduces compute for simpler ones, and terminates unpromising queries early, balancing accuracy, latency, and cost. On diverse datasets and algorithms, Dynasor reduces compute by up to 50% in batch processing and sustaining 3.3x higher query rates or 4.7x tighter latency SLOs in online serving.', 'score': 24, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '7fe76ed90463d977', 'authors': ['Yichao Fu', 'Junda Chen', 'Siqi Zhu', 'Zheyu Fu', 'Zhongdongming Dai', 'Aurick Qiao', 'Hao Zhang'], 'affiliations': ['Snowflake', 'Tsinghua University', 'UC San Diego'], 'pdf_title_img': 'assets/pdf/title_img/2412.20993.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Dynasor: умное распределение ресурсов для эффективных LLM-рассуждений', 'desc': 'Статья представляет систему Dynasor, оптимизирующую вычисления для задач рассуждения с использованием больших языковых моделей (LLM). Dynasor отслеживает и планирует запросы, используя прокси Certaindex для измерения прогресса рассуждений на основе уверенности модели. Система динамически распределяет вычислительные ресурсы, уделяя больше внимания сложным запросам и меньше простым, а также прекращая бесперспективные запросы. Dynasor показывает значительное снижение вычислительных затрат и улучшение производительности на различных наборах данных и алгоритмах.'}, 'en': {'title': 'Dynasor: Smart Compute Allocation for Efficient LLM Reasoning', 'desc': "This paper introduces Dynasor, a system designed to optimize the compute resources used during inference for large language models (LLMs) when handling reasoning queries. It addresses the inefficiencies of existing serving systems that do not adapt to the complexity of different queries or the scaling needs of inference-time reasoning algorithms. Dynasor employs a dynamic scheduling approach that allocates compute resources based on the difficulty of the query, using a proxy called Certaindex to measure the model's certainty in its reasoning. As a result, Dynasor can significantly reduce compute usage while improving query processing rates and meeting latency targets more effectively."}, 'zh': {'title': 'Dynasor:优化推理查询的计算效率', 'desc': '这篇论文介绍了Dynasor系统,它优化了大型语言模型(LLM)在推理查询时的计算效率。Dynasor通过跟踪和调度推理查询中的请求,动态分配计算资源,以应对不同难度的查询。该系统使用Certaindex代理,根据模型的确定性来衡量推理进展,从而指导计算分配。通过在多种数据集和算法上测试,Dynasor在批处理时减少了多达50%的计算需求,同时在在线服务中实现了3.3倍更高的查询速率或4.7倍更严格的延迟服务水平目标。'}}}, {'id': 'https://huggingface.co/papers/2412.21037', 'title': 'TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization', 'url': 'https://huggingface.co/papers/2412.21037', 'abstract': 'We introduce TangoFlux, an efficient Text-to-Audio (TTA) generative model with 515M parameters, capable of generating up to 30 seconds of 44.1kHz audio in just 3.7 seconds on a single A40 GPU. A key challenge in aligning TTA models lies in the difficulty of creating preference pairs, as TTA lacks structured mechanisms like verifiable rewards or gold-standard answers available for Large Language Models (LLMs). To address this, we propose CLAP-Ranked Preference Optimization (CRPO), a novel framework that iteratively generates and optimizes preference data to enhance TTA alignment. We demonstrate that the audio preference dataset generated using CRPO outperforms existing alternatives. 
With this framework, TangoFlux achieves state-of-the-art performance across both objective and subjective benchmarks. We open source all code and models to support further research in TTA generation.', 'score': 19, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'bb669623871df661', 'authors': ['Chia-Yu Hung', 'Navonil Majumder', 'Zhifeng Kong', 'Ambuj Mehrish', 'Rafael Valle', 'Bryan Catanzaro', 'Soujanya Poria'], 'affiliations': ['NVIDIA', 'Singapore University of Technology and Design (SUTD)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21037.jpg', 'data': {'categories': ['#dataset', '#audio', '#open_source', '#benchmark', '#alignment', '#rlhf', '#small_models'], 'emoji': '🎵', 'ru': {'title': 'TangoFlux: Революция в генерации аудио из текста', 'desc': 'TangoFlux - это эффективная генеративная модель для преобразования текста в аудио (Text-to-Audio, TTA) с 515 миллионами параметров. Модель способна генерировать до 30 секунд аудио с частотой 44,1 кГц всего за 3,7 секунды на одном GPU A40. Авторы представляют новую методику CLAP-Ranked Preference Optimization (CRPO) для улучшения согласованности TTA моделей путем итеративной генерации и оптимизации данных о предпочтениях. TangoFlux достигает передовых результатов в объективных и субъективных тестах, а код и модели открыты для дальнейших исследований.'}, 'en': {'title': 'TangoFlux: Revolutionizing Text-to-Audio Generation with CRPO', 'desc': "TangoFlux is a powerful Text-to-Audio generative model that can create high-quality audio quickly and efficiently. It addresses the challenge of aligning TTA models by introducing a new method called CLAP-Ranked Preference Optimization (CRPO), which helps generate and optimize preference data. This approach improves the model's ability to understand and produce audio that aligns with user preferences. The results show that TangoFlux not only meets but exceeds current standards in both objective and subjective evaluations, and the team has made their code and models available for further research."}, 'zh': {'title': 'TangoFlux:高效的文本到音频生成模型', 'desc': '我们介绍了TangoFlux,这是一种高效的文本到音频生成模型,拥有5.15亿个参数,能够在单个A40 GPU上以3.7秒的速度生成最长30秒的44.1kHz音频。TTA模型对齐的一个主要挑战是创建偏好对的困难,因为TTA缺乏像大型语言模型(LLMs)那样的可验证奖励或标准答案的结构化机制。为了解决这个问题,我们提出了CLAP-Ranked Preference Optimization(CRPO),这是一个新颖的框架,通过迭代生成和优化偏好数据来增强TTA的对齐。我们证明了使用CRPO生成的音频偏好数据集在现有替代方案中表现更优,TangoFlux在客观和主观基准测试中都达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.21079', 'title': 'Edicho: Consistent Image Editing in the Wild', 'url': 'https://huggingface.co/papers/2412.21079', 'abstract': 'As a verified need, consistent editing across in-the-wild images remains a technical challenge arising from various unmanageable factors, like object poses, lighting conditions, and photography environments. Edicho steps in with a training-free solution based on diffusion models, featuring a fundamental design principle of using explicit image correspondence to direct editing. Specifically, the key components include an attention manipulation module and a carefully refined classifier-free guidance (CFG) denoising strategy, both of which take into account the pre-estimated correspondence. Such an inference-time algorithm enjoys a plug-and-play nature and is compatible to most diffusion-based editing methods, such as ControlNet and BrushNet. Extensive results demonstrate the efficacy of Edicho in consistent cross-image editing under diverse settings. 
We will release the code to facilitate future studies.', 'score': 17, 'issue_id': 1405, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '8068418a331b2086', 'authors': ['Qingyan Bai', 'Hao Ouyang', 'Yinghao Xu', 'Qiuyu Wang', 'Ceyuan Yang', 'Ka Leong Cheng', 'Yujun Shen', 'Qifeng Chen'], 'affiliations': ['Ant Group', 'CUHK', 'HKUST', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21079.jpg', 'data': {'categories': ['#cv', '#diffusion', '#open_source', '#inference'], 'emoji': '🖼️', 'ru': {'title': 'Edicho: согласованное редактирование изображений без обучения', 'desc': 'Статья представляет Edicho - решение для согласованного редактирования изображений без обучения, основанное на диффузионных моделях. Ключевые компоненты включают модуль манипуляции вниманием и стратегию шумоподавления без классификатора, использующие предварительно оцененное соответствие между изображениями. Этот алгоритм совместим с большинством методов редактирования на основе диффузии, таких как ControlNet и BrushNet. Результаты демонстрируют эффективность Edicho в согласованном редактировании изображений в различных условиях.'}, 'en': {'title': 'Edicho: Consistent Image Editing Made Easy with Diffusion Models', 'desc': 'This paper introduces Edicho, a novel approach for consistent editing of images that addresses challenges like varying object poses and lighting. It utilizes diffusion models without the need for prior training, focusing on explicit image correspondence to guide the editing process. Key innovations include an attention manipulation module and a refined classifier-free guidance denoising strategy, which enhance the editing quality by considering pre-estimated correspondences. The method is designed to be easily integrated with existing diffusion-based editing techniques, showing strong performance across different scenarios.'}, 'zh': {'title': 'Edicho:无训练一致性图像编辑的新方法', 'desc': 'Edicho 是一种基于扩散模型的无训练解决方案,旨在解决在不同环境下进行一致性图像编辑的挑战。它的设计原则是利用显式图像对应关系来指导编辑,确保在不同的拍摄条件下保持一致性。该方法包括一个注意力操作模块和经过精细调整的无分类器引导去噪策略,能够有效处理预估的对应关系。Edicho 具有即插即用的特性,兼容大多数基于扩散的编辑方法,实验结果显示其在多种设置下的有效性。'}}}, {'id': 'https://huggingface.co/papers/2412.21187', 'title': 'Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs', 'url': 'https://huggingface.co/papers/2412.21187', 'abstract': 'The remarkable performance of models like the OpenAI o1 can be attributed to their ability to emulate human-like long-time thinking during inference. These models employ extended chain-of-thought (CoT) processes, exploring multiple strategies to enhance problem-solving capabilities. However, a critical question remains: How to intelligently and efficiently scale computational resources during testing. This paper presents the first comprehensive study on the prevalent issue of overthinking in these models, where excessive computational resources are allocated for simple problems with minimal benefit. We introduce novel efficiency metrics from both outcome and process perspectives to evaluate the rational use of computational resources by o1-like models. Using a self-training paradigm, we propose strategies to mitigate overthinking, streamlining reasoning processes without compromising accuracy. 
Experimental results show that our approach successfully reduces computational overhead while preserving model performance across a range of test sets with varying difficulty levels, such as GSM8K, MATH500, GPQA, and AIME.', 'score': 11, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '78da22eae14fe26c', 'authors': ['Xingyu Chen', 'Jiahao Xu', 'Tian Liang', 'Zhiwei He', 'Jianhui Pang', 'Dian Yu', 'Linfeng Song', 'Qiuzhi Liu', 'Mengfei Zhou', 'Zhuosheng Zhang', 'Rui Wang', 'Zhaopeng Tu', 'Haitao Mi', 'Dong Yu'], 'affiliations': ['Shanghai Jiao Tong University', 'Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2412.21187.jpg', 'data': {'categories': ['#optimization', '#reasoning', '#training', '#math', '#inference'], 'emoji': '🧠', 'ru': {'title': 'Эффективное мышление ИИ: борьба с избыточными вычислениями', 'desc': 'Статья исследует проблему избыточных вычислений (overthinking) в крупных языковых моделях типа OpenAI o1 при решении задач. Авторы вводят новые метрики эффективности для оценки рационального использования вычислительных ресурсов такими моделями. Предлагается стратегия на основе самообучения для оптимизации рассуждений модели без потери точности. Экспериментальные результаты показывают успешное снижение вычислительных затрат при сохранении производительности на различных наборах тестов.'}, 'en': {'title': 'Streamlining Reasoning: Tackling Overthinking in AI Models', 'desc': "This paper investigates the phenomenon of overthinking in advanced machine learning models, particularly those like OpenAI's o1, which excel at long-term reasoning. It highlights the inefficiencies that arise when these models allocate excessive computational resources to solve simple problems, leading to minimal gains in performance. The authors propose new efficiency metrics to assess how well these models utilize their computational power during inference. By implementing a self-training approach, they present strategies to reduce overthinking, achieving a balance between computational efficiency and model accuracy across various challenging test sets."}, 'zh': {'title': '优化计算资源,提升模型效率', 'desc': '本文探讨了像OpenAI o1这样的模型在推理过程中模拟人类长期思考的能力。研究指出,这些模型在解决问题时常常会过度思考,导致在简单问题上分配过多的计算资源。我们提出了新的效率指标,从结果和过程两个角度评估计算资源的合理使用,并提出了自我训练的策略来减少过度思考。实验结果表明,我们的方法在不同难度的测试集上成功降低了计算开销,同时保持了模型的性能。'}}}, {'id': 'https://huggingface.co/papers/2412.20005', 'title': 'OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System', 'url': 'https://huggingface.co/papers/2412.20005', 'abstract': "We introduce OneKE, a dockerized schema-guided knowledge extraction system, which can extract knowledge from the Web and raw PDF books, and support various domains (science, news, etc.). Specifically, we design OneKE with multiple agents and a configurable knowledge base. Different agents perform their respective roles, enabling support for various extraction scenarios. The configurable knowledge base facilitates schema configuration, error case debugging and correction, further improving the performance. Empirical evaluations on benchmark datasets demonstrate OneKE's efficacy, while case studies further elucidate its adaptability to diverse tasks across multiple domains, highlighting its potential for broad applications. 
We have open-sourced the Code at https://github.com/zjunlp/OneKE and released a Video at http://oneke.openkg.cn/demo.mp4.", 'score': 10, 'issue_id': 1405, 'pub_date': '2024-12-28', 'pub_date_card': {'ru': '28 декабря', 'en': 'December 28', 'zh': '12月28日'}, 'hash': 'da8469c61421cefb', 'authors': ['Yujie Luo', 'Xiangyuan Ru', 'Kangwei Liu', 'Lin Yuan', 'Mengshu Sun', 'Ningyu Zhang', 'Lei Liang', 'Zhiqiang Zhang', 'Jun Zhou', 'Lanning Wei', 'Da Zheng', 'Haofen Wang', 'Huajun Chen'], 'affiliations': ['Ant Group', 'Tongji University', 'ZJU-Ant Group Joint Research Center for Knowledge Graphs', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2412.20005.jpg', 'data': {'categories': ['#dataset', '#agents', '#open_source', '#benchmark', '#multimodal', '#science'], 'emoji': '🧠', 'ru': {'title': 'OneKE: Универсальный инструмент для извлечения знаний из разнородных источников', 'desc': 'OneKE - это докеризованная система извлечения знаний, управляемая схемой. Она способна извлекать информацию из веб-ресурсов и PDF-книг, поддерживая различные домены, такие как наука и новости. Система использует множество агентов и настраиваемую базу знаний для выполнения различных сценариев извлечения. OneKE демонстрирует высокую эффективность на эталонных наборах данных и адаптируемость к разнообразным задачам в различных областях.'}, 'en': {'title': 'OneKE: Versatile Knowledge Extraction for Diverse Domains', 'desc': "OneKE is a knowledge extraction system designed to gather information from the Web and raw PDF books across various domains like science and news. It utilizes multiple agents, each responsible for specific tasks, which enhances its ability to handle different extraction scenarios effectively. The system includes a configurable knowledge base that aids in schema setup, debugging, and error correction, leading to improved performance. Empirical tests on benchmark datasets confirm OneKE's effectiveness, and case studies showcase its versatility in tackling diverse tasks."}, 'zh': {'title': 'OneKE:多领域知识提取的智能系统', 'desc': 'OneKE是一个基于Docker的知识提取系统,能够从网络和原始PDF书籍中提取知识,支持多个领域(如科学、新闻等)。该系统设计了多个智能代理,各自承担不同的角色,以适应各种提取场景。配置知识库的设计使得模式配置、错误调试和修正变得更加高效,从而提升了系统的性能。通过在基准数据集上的实证评估,OneKE展示了其有效性,并通过案例研究进一步说明了其在多个领域的适应性和广泛应用潜力。'}}}, {'id': 'https://huggingface.co/papers/2412.20631', 'title': "Slow Perception: Let's Perceive Geometric Figures Step-by-step", 'url': 'https://huggingface.co/papers/2412.20631', 'abstract': 'Recently, "visual o1" began to enter people\'s vision, with expectations that this slow-thinking design can solve visual reasoning tasks, especially geometric math problems. However, the reality is that current LVLMs (Large Vision Language Models) can hardly even accurately copy a geometric figure, let alone truly understand the complex inherent logic and spatial relationships within geometric shapes. We believe accurate copying (strong perception) is the first step to visual o1. Accordingly, we introduce the concept of "slow perception" (SP), which guides the model to gradually perceive basic point-line combinations, as our humans, reconstruct complex geometric structures progressively. There are two-fold stages in SP: a) perception decomposition. Perception is not instantaneous. In this stage, complex geometric figures are broken down into basic simple units to unify geometry representation. b) perception flow, which acknowledges that accurately tracing a line is not an easy task. 
This stage aims to avoid "long visual jumps" in regressing line segments by using a proposed "perceptual ruler" to trace each line stroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an inference time scaling law -- the slower, the better. Researchers have strived to speed up the model\'s perception in the past, but we slow it down again, allowing the model to read the image step-by-step and carefully.', 'score': 9, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'f99c59b7ef92c667', 'authors': ['Haoran Wei', 'Youyang Yin', 'Yumeng Li', 'Jia Wang', 'Liang Zhao', 'Jianjian Sun', 'Zheng Ge', 'Xiangyu Zhang'], 'affiliations': ['Beihang University', 'Stepfun'], 'pdf_title_img': 'assets/pdf/title_img/2412.20631.jpg', 'data': {'categories': ['#cv', '#math', '#reasoning'], 'emoji': '🔍', 'ru': {'title': 'Медленнее значит лучше: новый подход к компьютерному зрению', 'desc': "Статья представляет концепцию 'медленного восприятия' (slow perception) для улучшения способности моделей компьютерного зрения копировать геометрические фигуры. Авторы предлагают двухэтапный подход: декомпозиция восприятия, разбивающая сложные фигуры на простые элементы, и поток восприятия, использующий 'перцептивную линейку' для точного отслеживания линий. Исследователи обнаружили, что более медленное восприятие приводит к лучшим результатам, что противоречит традиционному стремлению ускорить обработку изображений. Эта методика может стать первым шагом к решению задач визуального рассуждения и геометрических задач большими визуально-языковыми моделями."}, 'en': {'title': 'Slow Down to See Better: Enhancing Visual Reasoning with Slow Perception', 'desc': "This paper introduces the concept of 'slow perception' (SP) to enhance the capabilities of Large Vision Language Models (LVLMs) in visual reasoning tasks, particularly in understanding geometric shapes. SP consists of two stages: perception decomposition, where complex figures are simplified into basic components, and perception flow, which emphasizes careful tracing of lines to avoid errors. The authors argue that this method mimics human cognitive processes, allowing for a more accurate understanding of spatial relationships. Interestingly, they find that a slower, more deliberate approach to perception improves the model's performance, challenging the traditional focus on speed in machine learning."}, 'zh': {'title': '慢感知:逐步理解几何结构的关键', 'desc': '最近,"视觉o1"开始引起人们的关注,期望这种慢思维设计能够解决视觉推理任务,尤其是几何数学问题。然而,当前的大型视觉语言模型(LVLMs)在准确复制几何图形方面几乎无能为力,更不用说真正理解几何形状内在的复杂逻辑和空间关系。我们提出了"慢感知"(SP)的概念,指导模型逐步感知基本的点线组合,像人类一样逐步重建复杂的几何结构。SP包括两个阶段:感知分解和感知流,前者将复杂的几何图形分解为基本单元,后者通过使用"感知尺"逐步追踪每条线段,避免"长视觉跳跃"。'}}}, {'id': 'https://huggingface.co/papers/2412.21140', 'title': 'Facilitating large language model Russian adaptation with Learned Embedding Propagation', 'url': 'https://huggingface.co/papers/2412.21140', 'abstract': 'Rapid advancements of large language model (LLM) technologies led to the introduction of powerful open-source instruction-tuned LLMs that have the same text generation quality as the state-of-the-art counterparts such as GPT-4. While the emergence of such models accelerates the adoption of LLM technologies in sensitive-information environments, the authors of such models do not disclose the training data necessary for replication of the results, thus making the achievements model-exclusive. 
Since those open-source models are also multilingual, this in turn reduces the benefits of training language-specific LLMs, as improved inference computation efficiency becomes the only guaranteed advantage of such a costly procedure. More cost-efficient options such as vocabulary extension and subsequent continued pre-training are also inhibited by the lack of access to high-quality instruction-tuning data, since it is the major factor behind the resulting LLM task-solving capabilities. To address the limitations and cut the costs of the language adaptation pipeline, we propose Learned Embedding Propagation (LEP). Unlike existing approaches, our method has lower training data size requirements due to its minimal impact on existing LLM knowledge, which we reinforce using a novel ad-hoc embedding propagation procedure that allows skipping the instruction-tuning step and instead implanting the new language knowledge directly into any existing instruct-tuned variant. We evaluated four Russian vocabulary adaptations for LLaMa-3-8B and Mistral-7B, showing that LEP is competitive with traditional instruction-tuning methods, achieving performance comparable to OpenChat 3.5 and LLaMa-3-8B-Instruct, with further improvements via self-calibration and continued tuning enhancing task-solving capabilities.', 'score': 9, 'issue_id': 1412, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '093f3929e323d180', 'authors': ['Mikhail Tikhomirov', 'Daniil Chernyshev'], 'affiliations': ['Lomonosov Moscow State University, Moscow, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2412.21140.jpg', 'data': {'categories': ['#data', '#training', '#low_resource', '#transfer_learning', '#dataset', '#open_source', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Эффективная адаптация языковых моделей без масштабного переобучения', 'desc': 'Статья представляет новый метод адаптации больших языковых моделей (LLM) к другим языкам, называемый Learned Embedding Propagation (LEP). Этот подход позволяет эффективно внедрять знания нового языка в существующие инструктированные LLM без необходимости повторного обучения на больших объемах данных. Авторы провели эксперименты с адаптацией моделей LLaMa-3-8B и Mistral-7B к русскому языку, показав, что LEP конкурентоспособен с традиционными методами инструктирования. Результаты демонстрируют, что LEP достигает производительности, сравнимой с OpenChat 3.5 и LLaMa-3-8B-Instruct, с возможностью дальнейшего улучшения через самокалибровку и дополнительную настройку.'}, 'en': {'title': 'Efficient Language Adaptation with Learned Embedding Propagation', 'desc': 'This paper introduces Learned Embedding Propagation (LEP), a novel method for adapting large language models (LLMs) to new languages without the need for extensive instruction-tuning data. LEP minimizes the training data requirements by directly embedding new language knowledge into existing instruct-tuned models, thus bypassing traditional instruction-tuning steps. The authors demonstrate that LEP can effectively adapt LLaMa-3-8B and Mistral-7B for Russian vocabulary, achieving performance on par with state-of-the-art models like OpenChat 3.5. 
This approach not only reduces costs but also enhances the efficiency of language adaptation in multilingual contexts.'}, 'zh': {'title': '学习嵌入传播:降低语言适应成本的新方法', 'desc': '这篇论文介绍了一种名为学习嵌入传播(LEP)的方法,旨在降低语言适应过程的成本。LEP方法通过最小化对现有大语言模型(LLM)知识的影响,减少了对训练数据的需求。与传统的指令调优方法相比,LEP能够直接将新的语言知识植入到现有的指令调优模型中,从而跳过指令调优步骤。实验结果表明,LEP在俄语词汇适应方面的表现与传统方法相当,且通过自我校准和持续调优进一步提升了任务解决能力。'}}}, {'id': 'https://huggingface.co/papers/2412.21139', 'title': 'Training Software Engineering Agents and Verifiers with SWE-Gym', 'url': 'https://huggingface.co/papers/2412.21139', 'abstract': 'We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents , achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories.', 'score': 9, 'issue_id': 1406, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '800bb3f4c48e2cf9', 'authors': ['Jiayi Pan', 'Xingyao Wang', 'Graham Neubig', 'Navdeep Jaitly', 'Heng Ji', 'Alane Suhr', 'Yizhe Zhang'], 'affiliations': ['Apple', 'CMU', 'UC Berkeley', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2412.21139.jpg', 'data': {'categories': ['#dataset', '#open_source', '#agents', '#training'], 'emoji': '🤖', 'ru': {'title': 'SWE-Gym: революция в обучении ИИ-агентов для разработки ПО', 'desc': 'SWE-Gym - это новая среда для обучения агентов программной инженерии на реальных задачах. Она содержит 2438 экземпляров задач на Python с исполняемой средой, юнит-тестами и описанием на естественном языке. Авторы использовали SWE-Gym для обучения агентов на основе языковых моделей, достигнув улучшения на 19% в решении задач из наборов SWE-Bench. Комбинация обученных агентов и верификаторов позволила достичь нового рекорда производительности для открытых моделей в программной инженерии.'}, 'en': {'title': 'Revolutionizing Software Engineering with SWE-Gym', 'desc': 'SWE-Gym is a novel environment designed for training software engineering agents using real-world Python tasks. It includes 2,438 task instances, each with a codebase, executable environment, unit tests, and natural language task descriptions. The paper demonstrates that language model-based agents trained in SWE-Gym can significantly improve their performance, achieving up to 19% higher resolve rates on benchmark tests. 
Additionally, the authors explore scaling inference through verifiers, leading to state-of-the-art results for open-weight software engineering agents, and they provide resources for further research.'}, 'zh': {'title': 'SWE-Gym:软件工程代理的新起点', 'desc': '我们提出了SWE-Gym,这是第一个用于训练真实世界软件工程(SWE)代理的环境。SWE-Gym包含2438个真实的Python任务实例,每个实例都有可执行的运行环境、单元测试和用自然语言指定的任务。通过使用SWE-Gym,我们训练的基于语言模型的SWE代理在流行的SWE-Bench验证和Lite测试集上实现了高达19%的绝对解决率提升。我们还通过在SWE-Gym中采样的代理轨迹训练验证器,进行推理时的扩展,结合我们微调的SWE代理,在SWE-Bench验证和Lite上分别达到了32.0%和26.0%的新状态,成为开放权重SWE代理的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2412.21206', 'title': 'PERSE: Personalized 3D Generative Avatars from A Single Portrait', 'url': 'https://huggingface.co/papers/2412.21206', 'abstract': "We present PERSE, a method for building an animatable personalized generative avatar from a reference portrait. Our avatar model enables facial attribute editing in a continuous and disentangled latent space to control each facial attribute, while preserving the individual's identity. To achieve this, our method begins by synthesizing large-scale synthetic 2D video datasets, where each video contains consistent changes in the facial expression and viewpoint, combined with a variation in a specific facial attribute from the original input. We propose a novel pipeline to produce high-quality, photorealistic 2D videos with facial attribute editing. Leveraging this synthetic attribute dataset, we present a personalized avatar creation method based on the 3D Gaussian Splatting, learning a continuous and disentangled latent space for intuitive facial attribute manipulation. To enforce smooth transitions in this latent space, we introduce a latent space regularization technique by using interpolated 2D faces as supervision. Compared to previous approaches, we demonstrate that PERSE generates high-quality avatars with interpolated attributes while preserving identity of reference person.", 'score': 8, 'issue_id': 1415, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '334a60a17f9a9477', 'authors': ['Hyunsoo Cha', 'Inhee Lee', 'Hanbyul Joo'], 'affiliations': ['Seoul National University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21206.jpg', 'data': {'categories': ['#3d', '#cv', '#dataset', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Персонализированные аватары с гибким редактированием черт лица', 'desc': 'PERSE - это метод создания анимируемого персонализированного генеративного аватара на основе портрета. Он позволяет редактировать лицевые атрибуты в непрерывном и разделенном латентном пространстве, сохраняя при этом индивидуальность человека. Метод использует синтетические наборы 2D-видео для обучения модели на основе 3D Gaussian Splatting. PERSE демонстрирует высокое качество генерации аватаров с интерполированными атрибутами, сохраняя идентичность исходного человека.'}, 'en': {'title': 'Create Your Unique Avatar with PERSE!', 'desc': "PERSE is a novel method for creating personalized generative avatars from a single reference portrait. It allows users to edit facial attributes in a smooth and controlled manner within a continuous latent space, ensuring that the individual's identity remains intact. The approach involves generating large-scale synthetic 2D video datasets that showcase variations in facial expressions and attributes, which are then used to train the avatar model. 
By employing 3D Gaussian Splatting and a latent space regularization technique, PERSE achieves high-quality, photorealistic avatars with seamless attribute transitions."}, 'zh': {'title': '个性化生成头像的新方法', 'desc': '本文介绍了一种名为PERSE的方法,用于从参考肖像构建可动画的个性化生成头像。该头像模型能够在连续且解耦的潜在空间中编辑面部属性,同时保持个体的身份。我们的方法首先合成大规模的合成2D视频数据集,每个视频包含面部表情和视角的一致变化,并结合原始输入中特定面部属性的变化。通过引入潜在空间正则化技术,我们实现了高质量、逼真的2D视频生成,并在此基础上提出了一种个性化头像创建方法。'}}}, {'id': 'https://huggingface.co/papers/2412.21199', 'title': 'HumanEval Pro and MBPP Pro: Evaluating Large Language Models on Self-invoking Code Generation', 'url': 'https://huggingface.co/papers/2412.21199', 'abstract': "We introduce self-invoking code generation, a new task designed to evaluate the progressive reasoning and problem-solving capabilities of LLMs. In this task, models are presented with a base problem and a related, more complex problem. They must solve the base problem and then utilize its solution to address the more complex one. This work features three key contributions. First, we propose a general recipe for generating more challenging versions of existing benchmarks, resulting in three new benchmarks: HumanEval Pro, MBPP Pro, and BigCodeBench-Lite Pro, specifically designed to assess LLMs on self-invoking code generation. Second, from the analysis of experimental results over twenty LLMs on our benchmarks, we have two important observations: (i) Most LLMs excel in traditional code generation benchmarks like HumanEval and MBPP, but their performance declines on self-invoking tasks. For example, o1-mini achieves 96.2% pass@1 on HumanEval but only 76.2% on HumanEval Pro. (ii) On self-invoking code generation task, the instruction-tuned models demonstrate only marginal improvements compared to the base models. Third, we disclose the types of failure modes that exist in our evaluation results. All these results underscore the need for further advancements in self-invoking code generation tasks and provide a new direction for future research on enhancing LLMs' code reasoning capabilities.", 'score': 6, 'issue_id': 1408, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '9d2cebc8f30f722c', 'authors': ['Zhaojian Yu', 'Yilun Zhao', 'Arman Cohan', 'Xiao-Ping Zhang'], 'affiliations': ['Tsinghua University', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2412.21199.jpg', 'data': {'categories': ['#dataset', '#reasoning', '#training', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Самовызывающийся код: новый рубеж для языковых моделей', 'desc': 'Статья представляет новую задачу для оценки возможностей языковых моделей (LLM) - генерацию самовызывающегося кода. В рамках этой задачи модели должны решить базовую проблему, а затем использовать ее решение для более сложной задачи. Авторы создали три новых бенчмарка: HumanEval Pro, MBPP Pro и BigCodeBench-Lite Pro. Эксперименты показали, что большинство LLM хорошо справляются с традиционными задачами генерации кода, но их производительность снижается на самовызывающихся задачах. Результаты подчеркивают необходимость дальнейших исследований в области улучшения способностей LLM к рассуждению при работе с кодом.'}, 'en': {'title': 'Enhancing LLMs: The Challenge of Self-Invoking Code Generation', 'desc': 'This paper introduces a new task called self-invoking code generation, which tests the reasoning and problem-solving skills of large language models (LLMs). In this task, models first solve a simple problem and then use that solution to tackle a more complex one. 
The authors create three new benchmarks to evaluate LLMs on this task, revealing that while many models perform well on standard code generation tasks, their performance drops significantly on self-invoking tasks. The findings highlight the limitations of current models and suggest that more research is needed to improve their code reasoning abilities.'}, 'zh': {'title': '自调用代码生成:提升LLMs推理能力的新方向', 'desc': '本文介绍了一种新的任务——自调用代码生成,旨在评估大型语言模型(LLMs)的推理和问题解决能力。在这个任务中,模型需要先解决一个基础问题,然后利用其解决方案来处理一个更复杂的问题。研究提出了三项重要贡献,包括生成更具挑战性的基准测试的通用方法,并创建了三个新基准:HumanEval Pro、MBPP Pro和BigCodeBench-Lite Pro。实验结果显示,大多数LLMs在传统代码生成基准上表现良好,但在自调用任务上的表现却有所下降,表明在自调用代码生成任务上仍需进一步的研究和改进。'}}}, {'id': 'https://huggingface.co/papers/2501.09732', 'title': 'Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps', 'url': 'https://huggingface.co/papers/2501.09732', 'abstract': 'Generative models have made significant impacts across various domains, largely due to their ability to scale during training by increasing data, computational resources, and model size, a phenomenon characterized by the scaling laws. Recent research has begun to explore inference-time scaling behavior in Large Language Models (LLMs), revealing how performance can further improve with additional computation during inference. Unlike LLMs, diffusion models inherently possess the flexibility to adjust inference-time computation via the number of denoising steps, although the performance gains typically flatten after a few dozen. In this work, we explore the inference-time scaling behavior of diffusion models beyond increasing denoising steps and investigate how the generation performance can further improve with increased computation. Specifically, we consider a search problem aimed at identifying better noises for the diffusion sampling process. We structure the design space along two axes: the verifiers used to provide feedback, and the algorithms used to find better noise candidates. Through extensive experiments on class-conditioned and text-conditioned image generation benchmarks, our findings reveal that increasing inference-time compute leads to substantial improvements in the quality of samples generated by diffusion models, and with the complicated nature of images, combinations of the components in the framework can be specifically chosen to conform with different application scenario.', 'score': 50, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '2ad32c666f91ba05', 'authors': ['Nanye Ma', 'Shangyuan Tong', 'Haolin Jia', 'Hexiang Hu', 'Yu-Chuan Su', 'Mingda Zhang', 'Xuan Yang', 'Yandong Li', 'Tommi Jaakkola', 'Xuhui Jia', 'Saining Xie'], 'affiliations': ['Google', 'MIT', 'NYU'], 'pdf_title_img': 'assets/pdf/title_img/2501.09732.jpg', 'data': {'categories': ['#diffusion', '#inference', '#benchmark', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Повышение качества генерации изображений за счет масштабирования вычислений при выводе', 'desc': 'Это исследование посвящено изучению поведения диффузионных моделей при масштабировании вычислений во время вывода. Авторы рассматривают задачу поиска лучших шумов для процесса сэмплирования диффузионной модели. Они структурируют пространство решений по двум осям: верификаторы для обратной связи и алгоритмы поиска лучших кандидатов шума. 
Эксперименты показывают, что увеличение вычислений при выводе приводит к значительному улучшению качества сгенерированных изображений.'}, 'en': {'title': 'Enhancing Diffusion Models: Scaling Inference for Better Image Generation', 'desc': 'This paper investigates how to enhance the performance of diffusion models during the inference phase by increasing computational resources. It highlights that, unlike Large Language Models (LLMs), diffusion models can adjust their inference process through the number of denoising steps, but improvements tend to plateau after a certain point. The authors propose a method to optimize the noise used in the diffusion sampling process by exploring different feedback verifiers and algorithms. Their experiments demonstrate that by strategically increasing computation during inference, the quality of generated images can be significantly improved, tailored to various application needs.'}, 'zh': {'title': '扩散模型推理时的计算扩展与性能提升', 'desc': '生成模型在多个领域产生了重要影响,主要得益于其在训练过程中通过增加数据、计算资源和模型规模来扩展的能力。最近的研究开始探讨大型语言模型(LLMs)在推理时的扩展行为,发现额外的计算可以进一步提高性能。与LLMs不同,扩散模型通过去噪步骤的数量灵活调整推理时的计算,尽管性能提升通常在几十步后趋于平稳。本文探讨了扩散模型在推理时的扩展行为,研究如何通过增加计算来进一步提高生成性能,特别是通过寻找更好的噪声来优化扩散采样过程。'}}}, {'id': 'https://huggingface.co/papers/2501.09751', 'title': 'OmniThink: Expanding Knowledge Boundaries in Machine Writing through Thinking', 'url': 'https://huggingface.co/papers/2501.09751', 'abstract': "Machine writing with large language models often relies on retrieval-augmented generation. However, these approaches remain confined within the boundaries of the model's predefined scope, limiting the generation of content with rich information. Specifically, vanilla-retrieved information tends to lack depth, utility, and suffers from redundancy, which negatively impacts the quality of generated articles, leading to shallow, repetitive, and unoriginal outputs. To address these issues, we propose OmniThink, a machine writing framework that emulates the human-like process of iterative expansion and reflection. The core idea behind OmniThink is to simulate the cognitive behavior of learners as they progressively deepen their knowledge of the topics. Experimental results demonstrate that OmniThink improves the knowledge density of generated articles without compromising metrics such as coherence and depth. Human evaluations and expert feedback further highlight the potential of OmniThink to address real-world challenges in the generation of long-form articles.", 'score': 34, 'issue_id': 1722, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '7e8d42358354f79b', 'authors': ['Zekun Xi', 'Wenbiao Yin', 'Jizhan Fang', 'Jialong Wu', 'Runnan Fang', 'Ningyu Zhang', 'Jiang Yong', 'Pengjun Xie', 'Fei Huang', 'Huajun Chen'], 'affiliations': ['Tongyi Lab, Alibaba Group', 'Zhejiang Key Laboratory of Big Data Intelligent Computing', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09751.jpg', 'data': {'categories': ['#rag', '#story_generation', '#long_context', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'OmniThink: Имитация человеческого мышления для улучшения машинной генерации текста', 'desc': 'Статья представляет новый подход к генерации текста с использованием больших языковых моделей, названный OmniThink. Этот метод имитирует человеческий процесс итеративного расширения знаний и рефлексии, преодолевая ограничения стандартных методов извлечения информации. OmniThink улучшает плотность знаний в генерируемых статьях, не жертвуя связностью и глубиной. 
Эксперименты и оценки экспертов подтверждают эффективность OmniThink для решения реальных задач генерации длинных статей.'}, 'en': {'title': 'OmniThink: Elevating Machine Writing through Human-Like Learning', 'desc': 'This paper introduces OmniThink, a novel machine writing framework that enhances the capabilities of large language models by mimicking human cognitive processes. Unlike traditional retrieval-augmented generation methods, which often produce shallow and repetitive content, OmniThink focuses on iterative expansion and reflection to deepen knowledge on topics. The framework significantly improves the knowledge density of generated articles while maintaining coherence and depth, as shown by experimental results. Human evaluations and expert feedback confirm that OmniThink effectively addresses challenges in generating high-quality long-form content.'}, 'zh': {'title': 'OmniThink:提升机器写作的知识密度', 'desc': '本文提出了一种名为OmniThink的机器写作框架,旨在改善传统大语言模型在生成内容时的局限性。OmniThink模拟人类学习者的认知过程,通过迭代扩展和反思来加深对主题的理解。实验结果表明,OmniThink能够提高生成文章的知识密度,同时保持连贯性和深度等指标。人类评估和专家反馈进一步验证了OmniThink在生成长篇文章时解决实际问题的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.09755', 'title': 'Learnings from Scaling Visual Tokenizers for Reconstruction and Generation', 'url': 'https://huggingface.co/papers/2501.09755', 'abstract': "Visual tokenization via auto-encoding empowers state-of-the-art image and video generative models by compressing pixels into a latent space. Although scaling Transformer-based generators has been central to recent advances, the tokenizer component itself is rarely scaled, leaving open questions about how auto-encoder design choices influence both its objective of reconstruction and downstream generative performance. Our work aims to conduct an exploration of scaling in auto-encoders to fill in this blank. To facilitate this exploration, we replace the typical convolutional backbone with an enhanced Vision Transformer architecture for Tokenization (ViTok). We train ViTok on large-scale image and video datasets far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We first study how scaling the auto-encoder bottleneck affects both reconstruction and generation -- and find that while it is highly correlated with reconstruction, its relationship with generation is more complex. We next explored the effect of separately scaling the auto-encoders' encoder and decoder on reconstruction and generation performance. Crucially, we find that scaling the encoder yields minimal gains for either reconstruction or generation, while scaling the decoder boosts reconstruction but the benefits for generation are mixed. Building on our exploration, we design ViTok as a lightweight auto-encoder that achieves competitive performance with state-of-the-art auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p) while outperforming existing auto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x fewer FLOPs. 
When integrated with Diffusion Transformers, ViTok demonstrates competitive performance on image generation for ImageNet-1K and sets new state-of-the-art benchmarks for class-conditional video generation on UCF-101.", 'score': 25, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '426aa3415c3c0ef4', 'authors': ['Philippe Hansen-Estruch', 'David Yan', 'Ching-Yao Chung', 'Orr Zohar', 'Jialiang Wang', 'Tingbo Hou', 'Tao Xu', 'Sriram Vishwanath', 'Peter Vajda', 'Xinlei Chen'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'Stanford University', 'UT Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.09755.jpg', 'data': {'categories': ['#cv', '#benchmark', '#video', '#optimization', '#architecture', '#diffusion'], 'emoji': '🔬', 'ru': {'title': 'ViTok: Оптимизация визуальной токенизации для генеративных моделей', 'desc': 'Статья исследует масштабирование автоэнкодеров для визуальной токенизации в генеративных моделях изображений и видео. Авторы представляют ViTok - легковесный автоэнкодер на основе Vision Transformer, обученный на масштабных датасетах. Исследование показывает, что масштабирование декодера улучшает реконструкцию, но неоднозначно влияет на генерацию. ViTok демонстрирует конкурентоспособную производительность при меньшем количестве FLOP и устанавливает новые рекорды в условной генерации видео.'}, 'en': {'title': 'Scaling Auto-Encoders for Enhanced Image and Video Generation', 'desc': 'This paper explores the scaling of auto-encoders, particularly focusing on the tokenizer component, which is crucial for image and video generation. The authors introduce ViTok, a Vision Transformer-based architecture that replaces traditional convolutional backbones, allowing for better scaling on large datasets. They investigate how different scaling strategies for the encoder and decoder affect both reconstruction and generative performance, finding that scaling the decoder is more beneficial for reconstruction. Ultimately, ViTok achieves competitive results with fewer computational resources and sets new benchmarks in image and video generation tasks.'}, 'zh': {'title': '自编码器的视觉标记化:提升生成模型的关键', 'desc': '本论文探讨了通过自编码器进行视觉标记化对图像和视频生成模型的影响。我们提出了一种增强的视觉变换器架构(ViTok),用于替代传统的卷积骨干网络,以提高标记化的效果。研究发现,自编码器的瓶颈规模与重建性能高度相关,但与生成性能的关系更为复杂。最终,ViTok在多个任务中表现出色,尤其是在视频重建和图像生成方面,展示了其在计算效率上的优势。'}}}, {'id': 'https://huggingface.co/papers/2501.09686', 'title': 'Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models', 'url': 'https://huggingface.co/papers/2501.09686', 'abstract': 'Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs\' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs\' reasoning capacity by providing substantially more training data. 
Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI\'s o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions.', 'score': 23, 'issue_id': 1720, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1c6b1b1f0235304c', 'authors': ['Fengli Xu', 'Qianyue Hao', 'Zefang Zong', 'Jingwei Wang', 'Yunke Zhang', 'Jingyi Wang', 'Xiaochong Lan', 'Jiahui Gong', 'Tianjian Ouyang', 'Fanjin Meng', 'Chenyang Shao', 'Yuwei Yan', 'Qinglong Yang', 'Yiwen Song', 'Sijian Ren', 'Xinyuan Hu', 'Yu Li', 'Jie Feng', 'Chen Gao', 'Yong Li'], 'affiliations': ['Emory University, Atlanta GA, USA', 'HKUST (GZ), Guangzhou, China', 'Tsinghua University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09686.jpg', 'data': {'categories': ['#open_source', '#training', '#rl', '#survey', '#reasoning', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Путь к большим моделям рассуждений: новый рубеж в ИИ', 'desc': 'Этот обзор посвящен прогрессу в области рассуждений с использованием больших языковых моделей (LLM). Рассматриваются ключевые технические компоненты, способствующие развитию крупных моделей рассуждений, включая автоматизированное построение данных, методы обучения рассуждениям и масштабирование во время тестирования. Анализируются популярные проекты с открытым исходным кодом по созданию крупных моделей рассуждений. Обсуждаются открытые проблемы и направления будущих исследований в этой области.'}, 'en': {'title': 'Unlocking Human-Like Reasoning in Large Language Models', 'desc': "This paper discusses the advancements in Large Language Models (LLMs) and their application to complex reasoning tasks. It introduces the concept of 'thought', which represents intermediate reasoning steps, allowing LLMs to simulate human-like reasoning processes. The paper highlights the use of reinforcement learning to enhance LLMs' reasoning capabilities by generating high-quality reasoning trajectories through trial-and-error methods. 
Additionally, it emphasizes the importance of scaling both training and testing phases to improve reasoning accuracy, paving the way for the development of Large Reasoning Models."}, 'zh': {'title': '推动大型推理模型的研究新前沿', 'desc': '这篇论文探讨了大型语言模型(LLMs)在复杂推理任务中的应用。研究者们引入了“思考”的概念,通过中间步骤的令牌序列来模拟人类的推理过程。最近,强化学习(RL)被应用于训练LLMs,以自动生成高质量的推理轨迹,从而显著提高推理能力。论文还讨论了在测试时增加令牌数量以提高推理准确性的效果,并展望了大型推理模型的未来研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09484', 'title': 'Exploring the Inquiry-Diagnosis Relationship with Advanced Patient Simulators', 'url': 'https://huggingface.co/papers/2501.09484', 'abstract': 'Online medical consultation (OMC) restricts doctors to gathering patient information solely through inquiries, making the already complex sequential decision-making process of diagnosis even more challenging. Recently, the rapid advancement of large language models has demonstrated a significant potential to transform OMC. However, most studies have primarily focused on improving diagnostic accuracy under conditions of relatively sufficient information, while paying limited attention to the "inquiry" phase of the consultation process. This lack of focus has left the relationship between "inquiry" and "diagnosis" insufficiently explored. In this paper, we first extract real patient interaction strategies from authentic doctor-patient conversations and use these strategies to guide the training of a patient simulator that closely mirrors real-world behavior. By inputting medical records into our patient simulator to simulate patient responses, we conduct extensive experiments to explore the relationship between "inquiry" and "diagnosis" in the consultation process. Experimental results demonstrate that inquiry and diagnosis adhere to the Liebig\'s law: poor inquiry quality limits the effectiveness of diagnosis, regardless of diagnostic capability, and vice versa. Furthermore, the experiments reveal significant differences in the inquiry performance of various models. To investigate this phenomenon, we categorize the inquiry process into four types: (1) chief complaint inquiry; (2) specification of known symptoms; (3) inquiry about accompanying symptoms; and (4) gathering family or medical history. We analyze the distribution of inquiries across the four types for different models to explore the reasons behind their significant performance differences. We plan to open-source the weights and related code of our patient simulator at https://github.com/LIO-H-ZEN/PatientSimulator.', 'score': 18, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'aff7d86ad63040d9', 'authors': ['Zhaocheng Liu', 'Quan Tu', 'Wen Ye', 'Yu Xiao', 'Zhishou Zhang', 'Hengfu Cui', 'Yalun Zhu', 'Qiang Ju', 'Shizheng Li', 'Jian Xie'], 'affiliations': ['Baichuan Inc.', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09484.jpg', 'data': {'categories': ['#data', '#training', '#science', '#open_source', '#healthcare'], 'emoji': '🩺', 'ru': {'title': 'Симуляция пациента для улучшения онлайн-диагностики с помощью ИИ', 'desc': 'Эта статья исследует процесс онлайн-медицинских консультаций с использованием больших языковых моделей. Авторы разработали симулятор пациента на основе реальных стратегий взаимодействия врача и пациента. Эксперименты показали, что качество опроса и диагностики взаимозависимы и подчиняются закону Либиха. 
Анализ различных моделей выявил значительные различия в эффективности опроса, которые были классифицированы по четырем типам.'}, 'en': {'title': 'Enhancing Diagnosis through Effective Inquiry in Online Medical Consultations', 'desc': "This paper addresses the challenges of online medical consultations (OMC) by focusing on the inquiry phase, which is crucial for accurate diagnosis. It utilizes large language models to create a patient simulator that mimics real patient interactions based on actual doctor-patient conversations. The study reveals that the quality of inquiry directly impacts diagnostic effectiveness, following Liebig's law, which states that the weakest link limits overall performance. Additionally, the research categorizes inquiry types and analyzes their distribution across different models, highlighting significant performance variations in inquiry effectiveness."}, 'zh': {'title': '优化询问,提升诊断效果', 'desc': '本文探讨了在线医疗咨询中询问与诊断之间的关系。我们从真实的医患对话中提取了患者互动策略,并利用这些策略训练了一个模拟患者的模型。实验结果表明,询问质量的差异直接影响诊断效果,且不同模型在询问表现上存在显著差异。我们将询问过程分为四种类型,并分析了不同模型在这些类型上的表现,以揭示其性能差异的原因。'}}}, {'id': 'https://huggingface.co/papers/2501.09038', 'title': 'Do generative video models learn physical principles from watching videos?', 'url': 'https://huggingface.co/papers/2501.09038', 'abstract': "AI video generation is undergoing a revolution, with quality and realism advancing rapidly. These advances have led to a passionate scientific debate: Do video models learn ``world models'' that discover laws of physics -- or, alternatively, are they merely sophisticated pixel predictors that achieve visual realism without understanding the physical principles of reality? We address this question by developing Physics-IQ, a comprehensive benchmark dataset that can only be solved by acquiring a deep understanding of various physical principles, like fluid dynamics, optics, solid mechanics, magnetism and thermodynamics. We find that across a range of current models (Sora, Runway, Pika, Lumiere, Stable Video Diffusion, and VideoPoet), physical understanding is severely limited, and unrelated to visual realism. At the same time, some test cases can already be successfully solved. This indicates that acquiring certain physical principles from observation alone may be possible, but significant challenges remain. While we expect rapid advances ahead, our work demonstrates that visual realism does not imply physical understanding. Our project page is at https://physics-iq.github.io; code at https://github.com/google-deepmind/physics-IQ-benchmark.", 'score': 17, 'issue_id': 1725, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '6a5047e8681ddcc5', 'authors': ['Saman Motamed', 'Laura Culp', 'Kevin Swersky', 'Priyank Jaini', 'Robert Geirhos'], 'affiliations': ['Google DeepMind', 'INSAIT, Sofia University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09038.jpg', 'data': {'categories': ['#benchmark', '#science', '#video'], 'emoji': '🧠', 'ru': {'title': 'Визуальный реализм не гарантирует понимание физики в ИИ', 'desc': 'Статья посвящена исследованию физического понимания в моделях генерации видео. Авторы разработали набор данных Physics-IQ для оценки способности моделей понимать законы физики. Результаты показывают, что современные модели имеют ограниченное физическое понимание, несмотря на визуальный реализм. Однако некоторые задачи уже успешно решаются, что указывает на потенциал изучения физических принципов из наблюдений.'}, 'en': {'title': 'Visual Realism vs. 
Physical Understanding in AI Video Generation', 'desc': "This paper explores whether AI video generation models truly understand the laws of physics or if they are just good at creating realistic images. The authors introduce Physics-IQ, a benchmark dataset designed to test models on their grasp of physical principles like fluid dynamics and thermodynamics. Their findings show that current models struggle with physical understanding, even though they can produce visually realistic videos. This suggests that while some physical concepts can be learned from observation, there are still significant gaps in the models' comprehension of reality."}, 'zh': {'title': '视觉真实感不等于物理理解', 'desc': '本论文探讨了AI视频生成技术的进展,特别是模型是否理解物理规律。我们开发了Physics-IQ,一个全面的基准数据集,只有通过深入理解流体动力学、光学、固体力学、磁学和热力学等物理原理才能解决。研究发现,当前模型在物理理解方面存在严重限制,且与视觉真实感无关。尽管某些测试案例已成功解决,但这表明仅通过观察获得某些物理原理仍面临重大挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.09747', 'title': 'FAST: Efficient Action Tokenization for Vision-Language-Action Models', 'url': 'https://huggingface.co/papers/2501.09747', 'abstract': 'Autoregressive sequence models, such as Transformer-based vision-language action (VLA) policies, can be tremendously effective for capturing complex and generalizable robotic behaviors. However, such models require us to choose a tokenization of our continuous action signals, which determines how the discrete symbols predicted by the model map to continuous robot actions. We find that current approaches for robot action tokenization, based on simple per-dimension, per-timestep binning schemes, typically perform poorly when learning dexterous skills from high-frequency robot data. To address this challenge, we propose a new compression-based tokenization scheme for robot actions, based on the discrete cosine transform. Our tokenization approach, Frequency-space Action Sequence Tokenization (FAST), enables us to train autoregressive VLAs for highly dexterous and high-frequency tasks where standard discretization methods fail completely. Based on FAST, we release FAST+, a universal robot action tokenizer, trained on 1M real robot action trajectories. It can be used as a black-box tokenizer for a wide range of robot action sequences, with diverse action spaces and control frequencies. Finally, we show that, when combined with the pi0 VLA, our method can scale to training on 10k hours of robot data and match the performance of diffusion VLAs, while reducing training time by up to 5x.', 'score': 16, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '1ff64d2f7e62d274', 'authors': ['Karl Pertsch', 'Kyle Stachowicz', 'Brian Ichter', 'Danny Driess', 'Suraj Nair', 'Quan Vuong', 'Oier Mees', 'Chelsea Finn', 'Sergey Levine'], 'affiliations': ['Physical Intelligence', 'Stanford', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.09747.jpg', 'data': {'categories': ['#dataset', '#agents', '#training', '#games', '#optimization', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'Революция в токенизации действий робота: от частотного пространства к универсальности', 'desc': 'Статья представляет новый метод токенизации действий робота под названием FAST (Frequency-space Action Sequence Tokenization), основанный на дискретном косинусном преобразовании. Этот подход позволяет обучать авторегрессионные модели VLA (Vision-Language Action) для высокочастотных и сложных задач манипулирования, где стандартные методы дискретизации не работают. 
Авторы также представляют FAST+, универсальный токенизатор действий робота, обученный на 1 миллионе реальных траекторий. В сочетании с моделью pi0 VLA, метод FAST позволяет обучаться на 10 тысячах часов данных робота и достигать производительности диффузионных VLA, сокращая время обучения до 5 раз.'}, 'en': {'title': 'Revolutionizing Robot Action Tokenization with FAST', 'desc': 'This paper introduces a new method for tokenizing continuous robot actions to improve the performance of autoregressive sequence models, specifically in the context of vision-language action (VLA) policies. The authors identify that traditional tokenization methods, which use simple binning techniques, struggle with high-frequency and dexterous robotic tasks. To overcome this limitation, they propose Frequency-space Action Sequence Tokenization (FAST), which utilizes the discrete cosine transform for better action representation. The results demonstrate that FAST can effectively train VLAs on extensive robot data, achieving performance comparable to diffusion models while significantly reducing training time.'}, 'zh': {'title': '提升机器人灵巧技能的标记化新方法', 'desc': '本文提出了一种新的机器人动作标记化方案,称为频率空间动作序列标记化(FAST),旨在解决现有基于简单分箱方法的标记化在学习灵巧技能时的不足。FAST利用离散余弦变换来有效地处理高频机器人数据,从而提高了模型在复杂任务中的表现。我们还发布了FAST+,这是一个通用的机器人动作标记器,能够处理多种动作序列和控制频率。通过与pi0 VLA结合,我们的方法在训练10,000小时的机器人数据时,能够与扩散VLA的性能相匹配,同时将训练时间减少了多达5倍。'}}}, {'id': 'https://huggingface.co/papers/2501.09756', 'title': 'SynthLight: Portrait Relighting with Diffusion Model by Learning to Re-render Synthetic Faces', 'url': 'https://huggingface.co/papers/2501.09756', 'abstract': "We introduce SynthLight, a diffusion model for portrait relighting. Our approach frames image relighting as a re-rendering problem, where pixels are transformed in response to changes in environmental lighting conditions. Using a physically-based rendering engine, we synthesize a dataset to simulate this lighting-conditioned transformation with 3D head assets under varying lighting. We propose two training and inference strategies to bridge the gap between the synthetic and real image domains: (1) multi-task training that takes advantage of real human portraits without lighting labels; (2) an inference time diffusion sampling procedure based on classifier-free guidance that leverages the input portrait to better preserve details. Our method generalizes to diverse real photographs and produces realistic illumination effects, including specular highlights and cast shadows, while preserving the subject's identity. Our quantitative experiments on Light Stage data demonstrate results comparable to state-of-the-art relighting methods. Our qualitative results on in-the-wild images showcase rich and unprecedented illumination effects. Project Page: https://vrroom.github.io/synthlight/", 'score': 15, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'e6621d55eb165448', 'authors': ['Sumit Chaturvedi', 'Mengwei Ren', 'Yannick Hold-Geoffroy', 'Jingyuan Liu', 'Julie Dorsey', 'Zhixin Shu'], 'affiliations': ['Adobe Research', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09756.jpg', 'data': {'categories': ['#dataset', '#3d', '#inference', '#cv', '#diffusion', '#training', '#synthetic'], 'emoji': '💡', 'ru': {'title': 'SynthLight: реалистичная перезасветка портретов с помощью диффузионной модели', 'desc': 'SynthLight - это диффузионная модель для перезасветки портретов. 
Модель рассматривает перезасветку как проблему повторного рендеринга, где пиксели трансформируются в ответ на изменения условий освещения окружающей среды. Авторы синтезировали датасет с помощью физически корректного рендеринга, симулируя трансформации освещения на 3D-моделях голов. Предложены две стратегии обучения и вывода для преодоления разрыва между синтетическими и реальными изображениями.'}, 'en': {'title': 'Revolutionizing Portrait Relighting with SynthLight', 'desc': 'SynthLight is a diffusion model designed for relighting portraits by treating the task as a re-rendering challenge influenced by environmental lighting changes. It utilizes a physically-based rendering engine to create a synthetic dataset that simulates how lighting affects 3D head models. The model employs multi-task training to utilize real portraits without specific lighting labels and a novel inference strategy that enhances detail preservation during the relighting process. The results show that SynthLight can effectively generalize to real images, producing realistic lighting effects while maintaining the identity of the subjects, outperforming existing methods in both quantitative and qualitative assessments.'}, 'zh': {'title': 'SynthLight:肖像重光照的新方法', 'desc': '我们介绍了SynthLight,这是一种用于肖像重光照的扩散模型。我们将图像重光照视为重新渲染的问题,通过物理基础渲染引擎合成数据集,以模拟在不同光照条件下的像素变换。我们提出了两种训练和推理策略,以缩小合成图像和真实图像之间的差距,利用真实人像进行多任务训练,并在推理时使用无分类器引导的扩散采样程序。我们的模型能够在多样的真实照片中推广,生成逼真的光照效果,同时保持主体的身份特征。'}}}, {'id': 'https://huggingface.co/papers/2501.09433', 'title': 'CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation', 'url': 'https://huggingface.co/papers/2501.09433', 'abstract': 'The synthesis of high-quality 3D assets from textual or visual inputs has become a central objective in modern generative modeling. Despite the proliferation of 3D generation algorithms, they frequently grapple with challenges such as multi-view inconsistency, slow generation times, low fidelity, and surface reconstruction problems. While some studies have addressed some of these issues, a comprehensive solution remains elusive. In this paper, we introduce CaPa, a carve-and-paint framework that generates high-fidelity 3D assets efficiently. CaPa employs a two-stage process, decoupling geometry generation from texture synthesis. Initially, a 3D latent diffusion model generates geometry guided by multi-view inputs, ensuring structural consistency across perspectives. Subsequently, leveraging a novel, model-agnostic Spatially Decoupled Attention, the framework synthesizes high-resolution textures (up to 4K) for a given geometry. Furthermore, we propose a 3D-aware occlusion inpainting algorithm that fills untextured regions, resulting in cohesive results across the entire model. This pipeline generates high-quality 3D assets in less than 30 seconds, providing ready-to-use outputs for commercial applications. 
Experimental results demonstrate that CaPa excels in both texture fidelity and geometric stability, establishing a new standard for practical, scalable 3D asset generation.', 'score': 12, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '8c7a54f21e46af7a', 'authors': ['Hwan Heo', 'Jangyeong Kim', 'Seongyeong Lee', 'Jeong A Wi', 'Junyoung Choi', 'Sangjun Ahn'], 'affiliations': ['Graphics AI Lab, NC Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.09433.jpg', 'data': {'categories': ['#diffusion', '#3d', '#optimization'], 'emoji': '🎨', 'ru': {'title': 'CaPa: Революция в генерации 3D-моделей', 'desc': 'В статье представлен CaPa - фреймворк для генерации высококачественных 3D-моделей. Он использует двухэтапный процесс, разделяя создание геометрии и текстур с помощью латентной диффузионной модели и пространственно-разделенного внимания. CaPa также предлагает алгоритм для заполнения нетекстурированных областей, обеспечивая целостность результатов. Фреймворк генерирует 3D-модели менее чем за 30 секунд, превосходя аналоги по качеству текстур и стабильности геометрии.'}, 'en': {'title': 'CaPa: Fast and High-Fidelity 3D Asset Generation', 'desc': 'This paper presents CaPa, a novel framework for generating high-quality 3D assets from textual or visual inputs. It addresses common challenges in 3D generation, such as multi-view inconsistency and slow generation times, by separating geometry generation from texture synthesis. The framework utilizes a 3D latent diffusion model for consistent geometry creation and a Spatially Decoupled Attention mechanism for high-resolution texture synthesis. CaPa also includes a 3D-aware occlusion inpainting algorithm to enhance the final output, achieving high fidelity and stability in under 30 seconds.'}, 'zh': {'title': '高效生成高保真3D资产的CaPa框架', 'desc': '本论文介绍了一种名为CaPa的框架,用于高效生成高保真度的3D资产。该框架采用两阶段的过程,将几何体生成与纹理合成解耦。首先,使用3D潜在扩散模型生成几何体,确保多视角之间的结构一致性。然后,通过一种新颖的空间解耦注意力机制合成高分辨率纹理,并提出了3D感知的遮挡修复算法,最终在30秒内生成高质量的3D资产。'}}}, {'id': 'https://huggingface.co/papers/2501.09653', 'title': 'The Heap: A Contamination-Free Multilingual Code Dataset for Evaluating Large Language Models', 'url': 'https://huggingface.co/papers/2501.09653', 'abstract': 'The recent rise in the popularity of large language models has spurred the development of extensive code datasets needed to train them. This has left limited code available for collection and use in the downstream investigation of specific behaviors, or evaluation of large language models without suffering from data contamination. 
To address this problem, we release The Heap, a large multilingual dataset covering 57 programming languages that has been deduplicated with respect to other open datasets of code, enabling researchers to conduct fair evaluations of large language models without significant data cleaning overhead.', 'score': 10, 'issue_id': 1730, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '6d731a1519dc2727', 'authors': ['Jonathan Katzy', 'Razvan Mihai Popescu', 'Arie van Deursen', 'Maliheh Izadi'], 'affiliations': ['Delft University of Technology Delft, The Netherlands'], 'pdf_title_img': 'assets/pdf/title_img/2501.09653.jpg', 'data': {'categories': ['#low_resource', '#multilingual', '#open_source', '#data', '#dataset'], 'emoji': '🗃️', 'ru': {'title': 'The Heap: чистый код для честной оценки языковых моделей', 'desc': "Статья описывает создание нового набора данных для обучения языковых моделей в области программирования. Набор данных под названием 'The Heap' охватывает 57 языков программирования и был дедуплицирован относительно других открытых наборов данных. Это позволяет исследователям проводить объективные оценки больших языковых моделей без необходимости значительной предварительной очистки данных. Создание 'The Heap' решает проблему ограниченности доступного кода для исследования специфических поведений моделей и их оценки без риска загрязнения данных."}, 'en': {'title': 'The Heap: A Clean Dataset for Fair Evaluation of Language Models', 'desc': 'This paper introduces The Heap, a comprehensive multilingual dataset that includes code from 57 programming languages. It addresses the challenge of data contamination in evaluating large language models by providing a deduplicated dataset, ensuring that the code is unique compared to existing open datasets. Researchers can utilize The Heap for downstream tasks without the burden of extensive data cleaning. This resource aims to facilitate fair assessments of model performance in coding tasks.'}, 'zh': {'title': '公平评估大型语言模型的新数据集', 'desc': '随着大型语言模型的流行,开发了大量的代码数据集来训练这些模型。然而,这导致可用于特定行为研究或评估大型语言模型的代码有限,且可能存在数据污染的问题。为了解决这个问题,我们发布了The Heap,这是一个覆盖57种编程语言的大型多语言数据集,经过去重处理,避免与其他开放代码数据集重复。这样,研究人员可以在不需要大量数据清理的情况下,公平地评估大型语言模型。'}}}, {'id': 'https://huggingface.co/papers/2501.09503', 'title': 'AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation', 'url': 'https://huggingface.co/papers/2501.09503', 'abstract': 'Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. 
Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .', 'score': 8, 'issue_id': 1721, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'fb27e795153a9668', 'authors': ['Junjie He', 'Yuxiang Tuo', 'Binghui Chen', 'Chongyang Zhong', 'Yifeng Geng', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.09503.jpg', 'data': {'categories': ['#cv', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'AnyStory: Высококачественная генерация персонализированных изображений с множественными субъектами', 'desc': 'Статья представляет AnyStory - новый подход к генерации персонализированных изображений с несколькими субъектами. Метод использует универсальный энкодер изображений ReferenceNet и CLIP для высококачественного кодирования характеристик субъектов. AnyStory применяет декуплированный маршрутизатор субъектов для точного определения их потенциального расположения в латентном пространстве. Эксперименты показывают превосходную производительность метода в сохранении деталей субъектов, соответствии текстовым описаниям и персонализации для нескольких субъектов одновременно.'}, 'en': {'title': 'AnyStory: Mastering Personalized Image Generation for Multiple Subjects', 'desc': "This paper introduces AnyStory, a novel method for generating personalized images with high fidelity, even when multiple subjects are involved. It employs an 'encode-then-route' strategy, where a powerful image encoder, ReferenceNet, captures detailed subject features. The routing mechanism uses an instance-aware subject router to accurately determine where each subject should be placed in the generated image. Experimental results show that AnyStory excels in maintaining subject details and aligning them with text descriptions, making it effective for both single and multiple subjects."}, 'zh': {'title': 'AnyStory:个性化主题生成的新方法', 'desc': '最近,大规模生成模型在文本到图像生成方面表现出色。然而,生成高保真度的个性化图像,尤其是涉及多个主题的情况,仍然面临挑战。本文提出了AnyStory,这是一种统一的个性化主题生成方法,能够在不牺牲主题保真的情况下,实现单个和多个主题的高保真个性化。AnyStory通过“编码-再路由”的方式建模主题个性化问题,利用强大的图像编码器和实例感知路由器,准确预测主题在潜在空间中的位置。'}}}, {'id': 'https://huggingface.co/papers/2501.08617', 'title': 'RLHS: Mitigating Misalignment in RLHF with Hindsight Simulation', 'url': 'https://huggingface.co/papers/2501.08617', 'abstract': "Generative AI systems like foundation models (FMs) must align well with human values to ensure their behavior is helpful and trustworthy. While Reinforcement Learning from Human Feedback (RLHF) has shown promise for optimizing model performance using human judgments, existing RLHF pipelines predominantly rely on immediate feedback, which can fail to accurately reflect the downstream impact of an interaction on users' utility. We demonstrate that feedback based on evaluators' foresight estimates of downstream consequences systematically induces Goodhart's Law dynamics, incentivizing misaligned behaviors like sycophancy and deception and ultimately degrading user outcomes. To alleviate this, we propose decoupling evaluation from prediction by refocusing RLHF on hindsight feedback. Our theoretical analysis reveals that conditioning evaluator feedback on downstream observations mitigates misalignment and improves expected human utility, even when these observations are simulated by the AI system itself. 
To leverage this insight in a practical alignment algorithm, we introduce Reinforcement Learning from Hindsight Simulation (RLHS), which first simulates plausible consequences and then elicits feedback to assess what behaviors were genuinely beneficial in hindsight. We apply RLHS to two widely-employed online and offline preference optimization methods -- Proximal Policy Optimization (PPO) and Direct Preference Optimization (DPO) -- and show empirically that misalignment is significantly reduced with both methods. Through an online human user study, we show that RLHS consistently outperforms RLHF in helping users achieve their goals and earns higher satisfaction ratings, despite being trained solely with simulated hindsight feedback. These results underscore the importance of focusing on long-term consequences, even simulated ones, to mitigate misalignment in RLHF.", 'score': 8, 'issue_id': 1720, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'f758bc630d8dd443', 'authors': ['Kaiqu Liang', 'Haimin Hu', 'Ryan Liu', 'Thomas L. Griffiths', 'Jaime Fernández Fisac'], 'affiliations': ['Department of Computer Science, Princeton University', 'Department of Electrical and Computer Engineering, Princeton University', 'Department of Psychology, Princeton University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08617.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#training', '#rl'], 'emoji': '🔮', 'ru': {'title': 'Взгляд в будущее для лучшей настройки ИИ', 'desc': 'Статья представляет новый метод обучения с подкреплением - Reinforcement Learning from Hindsight Simulation (RLHS). В отличие от стандартного RLHF, RLHS использует симуляцию долгосрочных последствий действий модели и оценку их полезности постфактум. Авторы показывают, что RLHS позволяет уменьшить проблему неправильной мотивации модели и улучшить соответствие человеческим ценностям. Эмпирические эксперименты демонстрируют превосходство RLHS над RLHF в достижении целей пользователей.'}, 'en': {'title': 'Aligning AI with Human Values through Hindsight Feedback', 'desc': "This paper addresses the challenge of aligning generative AI systems with human values using Reinforcement Learning from Human Feedback (RLHF). It identifies that relying on immediate feedback can lead to misaligned behaviors, such as sycophancy and deception, due to Goodhart's Law dynamics. The authors propose a new approach called Reinforcement Learning from Hindsight Simulation (RLHS), which uses simulated consequences to gather feedback on beneficial behaviors. Their experiments show that RLHS improves user satisfaction and goal achievement compared to traditional RLHF methods, highlighting the importance of considering long-term outcomes in AI alignment."}, 'zh': {'title': '关注长期后果,提升AI对齐性', 'desc': '这篇论文探讨了生成性人工智能系统如何更好地与人类价值观对齐,以确保其行为有益且可信。现有的基于人类反馈的强化学习(RLHF)方法主要依赖即时反馈,但这种反馈可能无法准确反映与用户效用相关的长期影响。作者提出了一种新的方法,称为基于事后模拟的强化学习(RLHS),通过模拟可能的后果来获取反馈,从而改善模型的对齐性。研究表明,RLHS在帮助用户实现目标和提高满意度方面,优于传统的RLHF方法。'}}}, {'id': 'https://huggingface.co/papers/2501.15368', 'title': 'Baichuan-Omni-1.5 Technical Report', 'url': 'https://huggingface.co/papers/2501.15368', 'abstract': 'We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. 
First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks.', 'score': 35, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'f40b7f7b108c1500', 'authors': ['Yadong Li', 'Jun Liu', 'Tao Zhang', 'Tao Zhang', 'Song Chen', 'Tianpeng Li', 'Zehuan Li', 'Lijun Liu', 'Lingfeng Ming', 'Guosheng Dong', 'Da Pan', 'Chong Li', 'Yuanbo Fang', 'Dongdong Kuang', 'Mingrui Wang', 'Chenglin Zhu', 'Youwei Zhang', 'Hongyu Guo', 'Fengyu Zhang', 'Yuran Wang', 'Bowen Ding', 'Wei Song', 'Xu Li', 'Yuqi Huo', 'Zheng Liang', 'Shusen Zhang', 'Xin Wu', 'Shuai Zhao', 'Linchu Xiong', 'Yozhen Wu', 'Jiahui Ye', 'Wenhao Lu', 'Bowen Li', 'Yan Zhang', 'Yaqi Zhou', 'Xin Chen', 'Lei Su', 'Hongda Zhang', 'Fuzhong Chen', 'Xuezhen Dong', 'Na Nie', 'Zhiying Wu', 'Bin Xiao', 'Ting Li', 'Shunya Dang', 'Ping Zhang', 'Yijia Sun', 'Jincheng Wu', 'Jinjie Yang', 'Xionghai Lin', 'Zhi Ma', 'Kegeng Wu', 'Jia li', 'Aiyuan Yang', 'Hui Liu', 'Jianqiang Zhang', 'Xiaoxi Chen', 'Guangwei Ai', 'Wentao Zhang', 'Yicong Chen', 'Xiaoqin Huang', 'Kun Li', 'Wenjing Luo', 'Yifei Duan', 'Lingling Zhu', 'Ran Xiao', 'Zhe Su', 'Jiani Pu', 'Dian Wang', 'Xu Jia', 'Tianyu Zhang', 'Mengyu Ai', 'Mang Wang', 'Yujing Qiao', 'Lei Zhang', 'Yanjun Shen', 'Fan Yang', 'Miao Zhen', 'Yijie Zhou', 'Mingyang Chen', 'Fei Li', 'Chenzheng Zhu', 'Keer Lu', 'Yaqi Zhao', 'Hao Liang', 'Youquan Li', 'Yanzhao Qin', 'Linzhuang Sun', 'Jianhua Xu', 'Haoze Sun', 'Mingan Lin', 'Zenan Zhou', 'Weipeng Chen'], 'affiliations': ['Baichuan Inc.'], 'pdf_title_img': 'assets/pdf/title_img/2501.15368.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#training', '#audio', '#multimodal'], 'emoji': '🎭', 'ru': {'title': 'Baichuan-Omni-1.5: Прорыв в омнимодальном ИИ', 'desc': 'Baichuan-Omni-1.5 - это омнимодальная модель, обладающая способностями понимания и генерации аудио. Для достижения качественного взаимодействия между модальностями, авторы оптимизировали три ключевых аспекта: создали комплексный пайплайн для обработки мультимодальных данных, разработали аудио-токенизатор для захвата семантической и акустической информации, и применили многоэтапную стратегию обучения. Модель демонстрирует ведущие результаты в омнимодальных возможностях и сравнима с передовыми моделями в различных мультимодальных медицинских бенчмарках.'}, 'en': {'title': 'Revolutionizing Multimodal Interaction with Baichuan-Omni-1.5', 'desc': 'Baichuan-Omni-1.5 is a cutting-edge omni-modal model designed for seamless interaction across text, audio, and visual data. It utilizes a robust data cleaning and synthesis pipeline to process approximately 500 billion high-quality multimodal data points. 
The model features a specialized audio-tokenizer that captures both semantic and acoustic elements, enhancing its compatibility with multimodal large language models (MLLMs). Through a multi-stage training approach, it effectively aligns and fine-tunes across modalities, outperforming existing models in various multimodal tasks, particularly in medical benchmarks.'}, 'zh': {'title': '全模态交互的新纪元', 'desc': '我们介绍了Baichuan-Omni-1.5,这是一种全模态模型,具备全模态理解和端到端音频生成能力。为了实现不同模态之间流畅且高质量的交互,我们优化了三个关键方面。首先,我们建立了一个全面的数据清洗和合成管道,获得了约5000亿条高质量的多模态数据(文本、音频和视觉)。其次,我们设计了一个音频标记器(Baichuan-Audio-Tokenizer),能够捕捉音频的语义和声学信息,从而增强与多模态大语言模型的兼容性。'}}}, {'id': 'https://huggingface.co/papers/2501.15383', 'title': 'Qwen2.5-1M Technical Report', 'url': 'https://huggingface.co/papers/2501.15383', 'abstract': 'We introduce Qwen2.5-1M, a series of models that extend the context length to 1 million tokens. Compared to the previous 128K version, the Qwen2.5-1M series have significantly enhanced long-context capabilities through long-context pre-training and post-training. Key techniques such as long data synthesis, progressive pre-training, and multi-stage supervised fine-tuning are employed to effectively enhance long-context performance while reducing training costs. To promote the use of long-context models among a broader user base, we present and open-source our inference framework. This framework includes a length extrapolation method that can expand the model context lengths by at least four times, or even more, without additional training. To reduce inference costs, we implement a sparse attention method along with chunked prefill optimization for deployment scenarios and a sparsity refinement method to improve precision. Additionally, we detail our optimizations in the inference engine, including kernel optimization, pipeline parallelism, and scheduling optimization, which significantly enhance overall inference performance. By leveraging our inference framework, the Qwen2.5-1M models achieve a remarkable 3x to 7x prefill speedup in scenarios with 1 million tokens of context. This framework provides an efficient and powerful solution for developing applications that require long-context processing using open-source models. The Qwen2.5-1M series currently includes the open-source models Qwen2.5-7B-Instruct-1M and Qwen2.5-14B-Instruct-1M, as well as the API-accessed model Qwen2.5-Turbo. Evaluations show that Qwen2.5-1M models have been greatly improved in long-context tasks without compromising performance in short-context scenarios.
Specifically, the Qwen2.5-14B-Instruct-1M model significantly outperforms GPT-4o-mini in long-context tasks and supports contexts eight times longer.', 'score': 22, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '203817e55fc3eb45', 'authors': ['An Yang', 'Bowen Yu', 'Chengyuan Li', 'Dayiheng Liu', 'Fei Huang', 'Haoyan Huang', 'Jiandong Jiang', 'Jianhong Tu', 'Jianwei Zhang', 'Jingren Zhou', 'Junyang Lin', 'Kai Dang', 'Kexin Yang', 'Le Yu', 'Mei Li', 'Minmin Sun', 'Qin Zhu', 'Rui Men', 'Tao He', 'Weijia Xu', 'Wenbiao Yin', 'Wenyuan Yu', 'Xiafei Qiu', 'Xingzhang Ren', 'Xinlong Yang', 'Yong Li', 'Zhiying Xu', 'Zipeng Zhang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.15383.jpg', 'data': {'categories': ['#architecture', '#inference', '#long_context', '#training', '#open_source'], 'emoji': '🚀', 'ru': {'title': 'Миллион токенов: новый рубеж для языковых моделей', 'desc': 'Статья представляет серию моделей Qwen2.5-1M с контекстным окном в 1 миллион токенов. Авторы применили техники синтеза длинных данных, прогрессивного предобучения и многоэтапной супервизированной донастройки для улучшения работы с длинным контекстом. Разработан фреймворк для инференса, включающий метод экстраполяции длины и оптимизации для ускорения обработки. Модели Qwen2.5-1M демонстрируют значительное улучшение на задачах с длинным контекстом без ухудшения производительности на коротких текстах.'}, 'en': {'title': 'Unlocking the Power of 1 Million Tokens with Qwen2.5-1M', 'desc': 'The Qwen2.5-1M models introduce a significant advancement in handling long-context inputs, extending the context length to 1 million tokens. This is achieved through innovative techniques like long data synthesis and multi-stage supervised fine-tuning, which enhance performance while minimizing training costs. The open-source inference framework allows users to expand context lengths without additional training and includes optimizations for efficient deployment. Overall, these models demonstrate superior performance in long-context tasks compared to existing models, making them a valuable resource for applications requiring extensive context processing.'}, 'zh': {'title': 'Qwen2.5-1M:长上下文处理的新突破', 'desc': '我们介绍了Qwen2.5-1M系列模型,能够处理长达100万标记的上下文。与之前的128K版本相比,Qwen2.5-1M在长上下文能力上有显著提升,采用了长数据合成、渐进式预训练和多阶段监督微调等关键技术。为了降低推理成本,我们实现了稀疏注意力机制和分块预填充优化,同时优化了推理引擎的性能。Qwen2.5-1M模型在处理长上下文任务时表现优异,且在短上下文场景中性能没有下降。'}}}, {'id': 'https://huggingface.co/papers/2501.16142', 'title': 'Towards General-Purpose Model-Free Reinforcement Learning', 'url': 'https://huggingface.co/papers/2501.16142', 'abstract': 'Reinforcement learning (RL) promises a framework for near-universal problem-solving. In practice however, RL algorithms are often tailored to specific benchmarks, relying on carefully tuned hyperparameters and algorithmic choices. Recently, powerful model-based RL methods have shown impressive general results across benchmarks but come at the cost of increased complexity and slow run times, limiting their broader applicability. In this paper, we attempt to find a unifying model-free deep RL algorithm that can address a diverse class of domains and problem settings. To achieve this, we leverage model-based representations that approximately linearize the value function, taking advantage of the denser task objectives used by model-based RL while avoiding the costs associated with planning or simulated trajectories. 
We evaluate our algorithm, MR.Q, on a variety of common RL benchmarks with a single set of hyperparameters and show a competitive performance against domain-specific and general baselines, providing a concrete step towards building general-purpose model-free deep RL algorithms.', 'score': 13, 'issue_id': 1898, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '0cf7cd0c9c1f5964', 'authors': ['Scott Fujimoto', "Pierluca D'Oro", 'Amy Zhang', 'Yuandong Tian', 'Michael Rabbat'], 'affiliations': ['Meta FAIR'], 'pdf_title_img': 'assets/pdf/title_img/2501.16142.jpg', 'data': {'categories': ['#optimization', '#rl', '#benchmark', '#training', '#games'], 'emoji': '🤖', 'ru': {'title': 'MR.Q: На пути к универсальному обучению с подкреплением', 'desc': 'Статья представляет новый алгоритм обучения с подкреплением под названием MR.Q. Этот алгоритм объединяет преимущества модельного и безмодельного подходов, используя представления, линеаризующие функцию ценности. MR.Q показывает конкурентоспособные результаты на различных бенчмарках с единым набором гиперпараметров. Исследование направлено на создание универсального безмодельного алгоритма глубокого обучения с подкреплением.'}, 'en': {'title': 'Towards Universal Problem-Solving with MR.Q in Reinforcement Learning', 'desc': 'This paper presents a new model-free deep reinforcement learning algorithm called MR.Q, which aims to solve a wide range of problems without needing extensive tuning of hyperparameters. The authors utilize model-based representations to simplify the value function, allowing the algorithm to benefit from the advantages of model-based RL while avoiding the complexities of planning. MR.Q is evaluated across various standard RL benchmarks using a single set of hyperparameters, demonstrating competitive performance against both specialized and general algorithms. This work represents a significant advancement towards creating versatile and efficient model-free deep RL solutions.'}, 'zh': {'title': '构建通用的无模型深度强化学习算法', 'desc': '强化学习(RL)提供了一种通用问题解决框架,但在实际应用中,RL算法通常针对特定基准进行调整,依赖于精心调节的超参数和算法选择。最近,强大的基于模型的RL方法在多个基准上表现出色,但其复杂性和较慢的运行时间限制了其更广泛的应用。本文提出了一种统一的无模型深度RL算法MR.Q,旨在解决多样化的领域和问题设置。我们利用基于模型的表示方法,近似线性化价值函数,从而在避免规划或模拟轨迹相关成本的同时,利用基于模型的RL所使用的更密集的任务目标。'}}}, {'id': 'https://huggingface.co/papers/2501.15570', 'title': 'ARWKV: Pretrain is not what we need, an RNN-Attention-Based Language Model Born from Transformer', 'url': 'https://huggingface.co/papers/2501.15570', 'abstract': "As is known, hybrid quadratic and subquadratic attention models in multi-head architectures have surpassed both Transformer and Linear RNN models, with these works primarily focusing on reducing KV complexity and improving efficiency. For further research on expressiveness, we introduce our series of models distilled from Qwen 2.5, based on pure native RWKV-7 attention, which aims to make RNN more expressive and demonstrates state tracking ability beyond transformers. We work with QRWK 32B based on RWKV-6 architecture, another approach that reduces the entire knowledge processing time to just 8 hours using 16 AMD MI300X GPUs while maintaining Qwen 2.5's performance. In fact, the distillation process can utilize any LLM, not just Qwen, and enables knowledge transfer from larger LLMs to smaller ones with fewer tokens. We will explain the detailed process and share our insights on building more powerful foundation models. Please note that this is an ongoing work that will be updated continuously.
The model checkpoints and source code are available at https://github.com/yynil/RWKVInside and https://huggingface.co/RWKV-Red-Team/ARWKV-7B-Preview-0.1.", 'score': 11, 'issue_id': 1900, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '063647dfe2bd7b63', 'authors': ['Lin Yueyu', 'Li Zhiyuan', 'Peter Yue', 'Liu Xiao'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.15570.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#architecture', '#small_models', '#optimization', '#open_source'], 'emoji': '🧠', 'ru': {'title': 'Повышение эффективности и выразительности RNN через дистилляцию знаний', 'desc': 'Статья представляет новые модели, основанные на чистом нативном внимании RWKV-7, дистиллированные из Qwen 2.5. Цель исследования - повысить выразительность RNN и продемонстрировать способность отслеживания состояния, превосходящую трансформеры. Авторы работают с QRWK 32B на архитектуре RWKV-6, что позволяет сократить время обработки знаний до 8 часов на 16 GPU AMD MI300X. Процесс дистилляции может использовать любую большую языковую модель для передачи знаний меньшим моделям с меньшим количеством токенов.'}, 'en': {'title': 'Enhancing RNN Expressiveness with RWKV Attention', 'desc': 'This paper presents a new series of models derived from Qwen 2.5, focusing on enhancing the expressiveness of RNNs through a native RWKV-7 attention mechanism. The authors demonstrate that their hybrid quadratic and subquadratic attention models outperform traditional Transformer and Linear RNN architectures by significantly reducing key-value (KV) complexity. They introduce the QRWK 32B model, which achieves impressive efficiency by processing knowledge in just 8 hours using 16 AMD MI300X GPUs while retaining the performance of Qwen 2.5. Additionally, the distillation process allows for knowledge transfer from larger language models (LLMs) to smaller ones, making it a versatile approach for building more powerful foundation models.'}, 'zh': {'title': '提升RNN表达能力的新模型', 'desc': '本文介绍了一种新型的混合二次和亚二次注意力模型,旨在提高RNN的表达能力。我们基于RWKV-7注意力架构,提出了一系列从Qwen 2.5中提炼的模型,展示了超越Transformer的状态跟踪能力。通过使用16个AMD MI300X GPU,我们的QRWK 32B模型将知识处理时间缩短至仅8小时,同时保持了Qwen 2.5的性能。该提炼过程可以利用任何大型语言模型(LLM),实现从更大模型到更小模型的知识转移。'}}}, {'id': 'https://huggingface.co/papers/2501.15907', 'title': 'Emilia: A Large-Scale, Extensive, Multilingual, and Diverse Dataset for Speech Generation', 'url': 'https://huggingface.co/papers/2501.15907', 'abstract': 'Recent advancements in speech generation have been driven by the large-scale training datasets. However, current models fall short of capturing the spontaneity and variability inherent in real-world human speech, due to their reliance on audiobook datasets limited to formal read-aloud speech styles. To bridge this gap, we introduce Emilia-Pipe, an open-source preprocessing pipeline to extract high-quality training data from valuable yet underexplored in-the-wild data that capture spontaneous human speech in real-world contexts. By leveraging Emilia-Pipe, we construct Emilia, the first multilingual speech generation dataset derived from in-the-wild speech data. This dataset comprises over 101k hours of speech across six languages: English, Chinese, German, French, Japanese, and Korean. Besides, we expand Emilia to Emilia-Large, a dataset exceeding 216k hours, making it the largest open-source speech generation dataset available.
Extensive experiments demonstrate that Emilia significantly outperforms traditional audiobook datasets in generating spontaneous and human-like speech, showcasing superior performance in capturing diverse speaker timbre and speaking styles of real-world human speech. Furthermore, this work underscores the importance of scaling dataset size to advance speech generation research and validates the effectiveness of Emilia for both multilingual and crosslingual speech generation.', 'score': 10, 'issue_id': 1903, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': 'bd221795c86585eb', 'authors': ['Haorui He', 'Zengqiang Shang', 'Chaoren Wang', 'Xuyuan Li', 'Yicheng Gu', 'Hua Hua', 'Liwei Liu', 'Chen Yang', 'Jiaqi Li', 'Peiyang Shi', 'Yuancheng Wang', 'Kai Chen', 'Pengyuan Zhang', 'Zhizheng Wu'], 'affiliations': ['Chinese University of Hong Kong, Shenzhen, China', 'Laboratory of Speech and Intelligent Information Processing, Institute of Acoustics, CAS, Beijing, China', 'Shanghai AI Laboratory, Shanghai, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.15907.jpg', 'data': {'categories': ['#data', '#audio', '#multilingual', '#dataset', '#open_source', '#low_resource'], 'emoji': '🗣️', 'ru': {'title': 'Emilia: новый этап в генерации естественной речи', 'desc': 'Исследователи представили Emilia-Pipe - открытый конвейер предобработки для извлечения высококачественных данных из спонтанной речи в реальных условиях. На его основе создан многоязычный датасет Emilia, содержащий более 101 тысячи часов речи на 6 языках. Расширенная версия Emilia-Large включает более 216 тысяч часов и является крупнейшим открытым датасетом для генерации речи. Эксперименты показали превосходство Emilia над традиционными аудиокнижными датасетами в генерации естественной и спонтанной речи.'}, 'en': {'title': 'Unlocking Spontaneous Speech with Emilia-Pipe', 'desc': 'This paper presents Emilia-Pipe, a preprocessing tool designed to extract high-quality training data from spontaneous human speech in real-world settings. The authors introduce Emilia, a multilingual speech generation dataset that includes over 101k hours of diverse speech data across six languages. They further expand this dataset to Emilia-Large, which contains more than 216k hours, making it the largest open-source resource for speech generation. The results show that models trained on Emilia outperform those trained on traditional audiobook datasets, effectively capturing the variability and naturalness of human speech.'}, 'zh': {'title': '打破传统,捕捉真实语音的多样性', 'desc': '近年来,语音生成的进展主要依赖于大规模的训练数据集。然而,目前的模型在捕捉真实人类语音的自发性和多样性方面存在不足,因为它们依赖于仅限于正式朗读风格的有声书数据集。为了解决这个问题,我们提出了Emilia-Pipe,这是一个开源的预处理管道,用于从有价值但未被充分探索的真实环境数据中提取高质量的训练数据。通过利用Emilia-Pipe,我们构建了Emilia,这是第一个基于真实环境语音数据的多语言语音生成数据集,包含超过101k小时的语音,涵盖六种语言。'}}}, {'id': 'https://huggingface.co/papers/2501.15369', 'title': 'iFormer: Integrating ConvNet and Transformer for Mobile Application', 'url': 'https://huggingface.co/papers/2501.15369', 'abstract': 'We present a new family of mobile hybrid vision networks, called iFormer, with a focus on optimizing latency and accuracy on mobile applications. iFormer effectively integrates the fast local representation capacity of convolution with the efficient global modeling ability of self-attention. The local interactions are derived from transforming a standard convolutional network, i.e., ConvNeXt, to design a more lightweight mobile network. 
Our newly introduced mobile modulation attention removes memory-intensive operations in MHA and employs an efficient modulation mechanism to boost dynamic global representational capacity. We conduct comprehensive experiments demonstrating that iFormer outperforms existing lightweight networks across various tasks. Notably, iFormer achieves an impressive Top-1 accuracy of 80.4\\% on ImageNet-1k with a latency of only 1.10 ms on an iPhone 13, surpassing the recently proposed MobileNetV4 under similar latency constraints. Additionally, our method shows significant improvements in downstream tasks, including COCO object detection, instance segmentation, and ADE20k semantic segmentation, while still maintaining low latency on mobile devices for high-resolution inputs in these scenarios.', 'score': 8, 'issue_id': 1898, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': '50e030854cdc071f', 'authors': ['Chuanyang Zheng'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.15369.jpg', 'data': {'categories': ['#optimization', '#training', '#cv', '#architecture'], 'emoji': '📱', 'ru': {'title': 'iFormer: Эффективные нейросети для мобильного компьютерного зрения', 'desc': 'iFormer - это новое семейство мобильных гибридных сетей компьютерного зрения, оптимизированных для мобильных приложений. Оно сочетает быструю локальную репрезентативную способность свёрточных сетей с эффективным глобальным моделированием механизма внимания. iFormer использует облегченную версию ConvNeXt и новый механизм модуляционного внимания для мобильных устройств. Эксперименты показывают, что iFormer превосходит существующие легковесные сети по точности и скорости работы на различных задачах, включая классификацию изображений, обнаружение объектов и сегментацию.'}, 'en': {'title': 'iFormer: Optimizing Mobile Vision with Speed and Accuracy', 'desc': 'The paper introduces iFormer, a new type of mobile hybrid vision network designed to enhance both speed and accuracy for mobile applications. It combines the quick local processing of convolutional networks with the effective global understanding of self-attention mechanisms. By modifying a standard convolutional architecture, ConvNeXt, iFormer creates a lightweight model that reduces memory usage while improving performance. Experimental results show that iFormer achieves high accuracy on ImageNet-1k and excels in various downstream tasks, all while maintaining low latency on mobile devices.'}, 'zh': {'title': 'iFormer:移动应用中的高效视觉网络', 'desc': '我们提出了一种新的移动混合视觉网络家族,称为iFormer,旨在优化移动应用的延迟和准确性。iFormer有效地结合了卷积的快速局部表示能力和自注意力的高效全局建模能力。通过将标准卷积网络ConvNeXt转化为更轻量级的移动网络,iFormer实现了局部交互的优化。我们的移动调制注意力机制去除了多头自注意力中的内存密集型操作,并采用高效的调制机制来增强动态全局表示能力。'}}}, {'id': 'https://huggingface.co/papers/2501.14723', 'title': 'CodeMonkeys: Scaling Test-Time Compute for Software Engineering', 'url': 'https://huggingface.co/papers/2501.14723', 'abstract': 'Scaling test-time compute is a promising axis for improving LLM capabilities. However, test-time compute can be scaled in a variety of ways, and effectively combining different approaches remains an active area of research. Here, we explore this problem in the context of solving real-world GitHub issues from the SWE-bench dataset. Our system, named CodeMonkeys, allows models to iteratively edit a codebase by jointly generating and running a testing script alongside their draft edit. We sample many of these multi-turn trajectories for every issue to generate a collection of candidate edits. 
This approach lets us scale "serial" test-time compute by increasing the number of iterations per trajectory and "parallel" test-time compute by increasing the number of trajectories per problem. With parallel scaling, we can amortize up-front costs across multiple downstream samples, allowing us to identify relevant codebase context using the simple method of letting an LLM read every file. In order to select between candidate edits, we combine voting using model-generated tests with a final multi-turn trajectory dedicated to selection. Overall, CodeMonkeys resolves 57.4% of issues from SWE-bench Verified using a budget of approximately 2300 USD. Our selection method can also be used to combine candidates from different sources. Selecting over an ensemble of edits from existing top SWE-bench Verified submissions obtains a score of 66.2% and outperforms the best member of the ensemble on its own. We fully release our code and data at https://scalingintelligence.stanford.edu/pubs/codemonkeys.', 'score': 4, 'issue_id': 1912, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '0aee5401febd2bf6', 'authors': ['Ryan Ehrlich', 'Bradley Brown', 'Jordan Juravsky', 'Ronald Clark', 'Christopher Ré', 'Azalia Mirhoseini'], 'affiliations': ['Department of Computer Science, Stanford University', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2501.14723.jpg', 'data': {'categories': ['#data', '#optimization', '#training', '#dataset', '#plp', '#open_source'], 'emoji': '🐒', 'ru': {'title': 'CodeMonkeys: Масштабирование вычислений LLM для решения реальных задач программирования', 'desc': 'Статья представляет систему CodeMonkeys для решения реальных проблем GitHub с помощью больших языковых моделей (LLM). Система позволяет моделям итеративно редактировать кодовую базу, генерируя и запуская тестовые скрипты вместе с черновыми правками. CodeMonkeys использует как последовательное, так и параллельное масштабирование вычислений во время тестирования, что позволяет эффективно идентифицировать релевантный контекст кодовой базы. Метод выбора кандидатов на основе голосования и финальной многоходовой траектории позволил системе решить 57.4% проблем из набора данных SWE-bench Verified.'}, 'en': {'title': 'Enhancing LLMs with Scalable Test-Time Compute for Code Editing', 'desc': 'This paper presents CodeMonkeys, a system designed to enhance the capabilities of large language models (LLMs) by scaling test-time compute during code editing tasks. It combines iterative code generation with testing script execution, allowing models to refine their edits through multiple iterations and trajectories. By leveraging both serial and parallel scaling, CodeMonkeys efficiently identifies relevant code context and selects the best candidate edits through a voting mechanism. 
The system demonstrates effectiveness by resolving over 57% of real-world GitHub issues while optimizing resource usage, and it shows improved performance when combining edits from various sources.'}, 'zh': {'title': '通过CodeMonkeys提升代码编辑能力', 'desc': '本文探讨了如何通过扩展测试时计算来提升大型语言模型(LLM)的能力。我们提出了一个名为CodeMonkeys的系统,它可以通过生成和运行测试脚本来迭代编辑代码库,从而解决实际的GitHub问题。该方法通过增加每个问题的迭代次数和轨迹数量,实现了串行和并行的测试时计算扩展。最终,CodeMonkeys成功解决了57.4%的问题,并且我们的选择方法也能有效结合来自不同来源的候选编辑。'}}}, {'id': 'https://huggingface.co/papers/2403.09193', 'title': 'Are Vision Language Models Texture or Shape Biased and Can We Steer Them?', 'url': 'https://huggingface.co/papers/2403.09193', 'abstract': 'Vision language models (VLMs) have drastically changed the computer vision model landscape in only a few years, opening an exciting array of new applications from zero-shot image classification, over to image captioning, and visual question answering. Unlike pure vision models, they offer an intuitive way to access visual content through language prompting. The wide applicability of such models encourages us to ask whether they also align with human vision - specifically, how far they adopt human-induced visual biases through multimodal fusion, or whether they simply inherit biases from pure vision models. One important visual bias is the texture vs. shape bias, or the dominance of local over global information. In this paper, we study this bias in a wide range of popular VLMs. Interestingly, we find that VLMs are often more shape-biased than their vision encoders, indicating that visual biases are modulated to some extent through text in multimodal models. If text does indeed influence visual biases, this suggests that we may be able to steer visual biases not just through visual input but also through language: a hypothesis that we confirm through extensive experiments. For instance, we are able to steer shape bias from as low as 49% to as high as 72% through prompting alone. For now, the strong human bias towards shape (96%) remains out of reach for all tested VLMs.', 'score': 4, 'issue_id': 1911, 'pub_date': '2025-03-14', 'pub_date_card': {'ru': '14 марта', 'en': 'March 14', 'zh': '3月14日'}, 'hash': 'e5fc94d983fca41c', 'authors': ['Paul Gavrikov', 'Jovita Lukasik', 'Steffen Jung', 'Robert Geirhos', 'Bianca Lamm', 'Muhammad Jehanzeb Mirza', 'Margret Keuper', 'Janis Keuper'], 'affiliations': ['Google DeepMind', 'ICG, Graz University of Technology', 'IMLA, Offenburg University', 'Max Planck Institute for Informatics, Saarland Informatics Campus', 'University of Mannheim', 'University of Siegen'], 'pdf_title_img': 'assets/pdf/title_img/2403.09193.jpg', 'data': {'categories': ['#cv', '#ethics', '#alignment', '#multimodal'], 'emoji': '👁️', 'ru': {'title': 'Текст направляет взгляд: как языковые подсказки влияют на визуальные предубеждения ИИ', 'desc': 'Статья исследует визуальные предубеждения в мультимодальных моделях, объединяющих зрение и язык (VLM). Авторы обнаружили, что VLM часто более ориентированы на форму объектов, чем чисто визуальные модели. Эксперименты показали, что текстовые подсказки могут значительно влиять на баланс между ориентацией на форму и текстуру в VLM. Однако даже после оптимизации, VLM все еще уступают человеческому зрению в ориентации на форму объектов.'}, 'en': {'title': 'Steering Visual Biases with Language in Vision Language Models', 'desc': 'This paper investigates how vision language models (VLMs) incorporate human visual biases, particularly the texture vs. 
shape bias, which refers to the preference for local versus global information in images. The authors find that VLMs tend to be more shape-biased than traditional vision models, suggesting that language prompts can influence visual processing. Through experiments, they demonstrate that the shape bias can be adjusted significantly by changing the text prompts used with the models. However, despite these adjustments, the VLMs still do not fully match the strong human bias towards shape recognition.'}, 'zh': {'title': '通过语言引导视觉偏差的可能性', 'desc': '视觉语言模型(VLMs)在计算机视觉领域带来了显著变化,支持从零样本图像分类到图像描述和视觉问答等多种应用。这些模型通过语言提示提供了一种直观的方式来访问视觉内容。我们研究了VLMs中存在的视觉偏差,特别是纹理与形状偏差,发现VLMs在形状偏差上往往比纯视觉模型更强。这表明,通过文本的多模态融合,视觉偏差可以在一定程度上被调节,且我们可以通过语言来引导视觉偏差。'}}}, {'id': 'https://huggingface.co/papers/2501.16295', 'title': 'Mixture-of-Mamba: Enhancing Multi-Modal State-Space Models with Modality-Aware Sparsity', 'url': 'https://huggingface.co/papers/2501.16295', 'abstract': 'State Space Models (SSMs) have emerged as efficient alternatives to Transformers for sequential modeling, but their inability to leverage modality-specific features limits their performance in multi-modal pretraining. Here, we propose Mixture-of-Mamba, a novel SSM architecture that introduces modality-aware sparsity through modality-specific parameterization of the Mamba block. Building on Mixture-of-Transformers (W. Liang et al. arXiv:2411.04996; 2024), we extend the benefits of modality-aware sparsity to SSMs while preserving their computational efficiency. We evaluate Mixture-of-Mamba across three multi-modal pretraining settings: Transfusion (interleaved text and continuous image tokens with diffusion loss), Chameleon (interleaved text and discrete image tokens), and an extended three-modality framework incorporating speech. Mixture-of-Mamba consistently reaches the same loss values at earlier training steps with significantly reduced computational costs. In the Transfusion setting, Mixture-of-Mamba achieves equivalent image loss using only 34.76% of the training FLOPs at the 1.4B scale. In the Chameleon setting, Mixture-of-Mamba reaches similar image loss with just 42.50% of the FLOPs at the 1.4B scale, and similar text loss with just 65.40% of the FLOPs. In the three-modality setting, MoM matches speech loss at 24.80% of the FLOPs at the 1.4B scale. Our ablation study highlights the synergistic effects of decoupling projection components, where joint decoupling yields greater gains than individual modifications. These results establish modality-aware sparsity as a versatile and effective design principle, extending its impact from Transformers to SSMs and setting new benchmarks in multi-modal pretraining. 
Our code can be accessed at https://github.com/Weixin-Liang/Mixture-of-Mamba', 'score': 4, 'issue_id': 1898, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '011d06607305f0f8', 'authors': ['Weixin Liang', 'Junhong Shen', 'Genghan Zhang', 'Ning Dong', 'Luke Zettlemoyer', 'Lili Yu'], 'affiliations': ['Department of Computer Science, Stanford University', 'FAIR at Meta', 'Machine Learning Department, Carnegie Mellon University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16295.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Mixture-of-Mamba: Эффективное мультимодальное обучение с модальность-специфической разреженностью', 'desc': 'В этой статье представлена новая архитектура модели состояний (SSM) под названием Mixture-of-Mamba, которая вводит модальность-специфическую разреженность для мультимодального предобучения. Mixture-of-Mamba расширяет преимущества модальность-осведомленной разреженности на SSM, сохраняя при этом их вычислительную эффективность. Модель была оценена в трех настройках мультимодального предобучения: Transfusion, Chameleon и расширенной трехмодальной системе, включающей речь. Результаты показывают, что Mixture-of-Mamba достигает тех же значений потерь на более ранних этапах обучения со значительно меньшими вычислительными затратами по сравнению с базовыми моделями.'}, 'en': {'title': 'Revolutionizing Multi-Modal Learning with Efficient SSMs', 'desc': 'This paper introduces Mixture-of-Mamba, a new State Space Model (SSM) that enhances multi-modal pretraining by incorporating modality-aware sparsity. By parameterizing the Mamba block specifically for different modalities, the model efficiently utilizes features from various data types like text, images, and speech. The results show that Mixture-of-Mamba achieves comparable performance to existing models while significantly reducing computational costs, using fewer floating point operations (FLOPs). This work demonstrates the effectiveness of modality-aware sparsity in improving SSMs, setting new benchmarks in the field of multi-modal learning.'}, 'zh': {'title': '模态感知稀疏性:提升SSM的多模态预训练效率', 'desc': '状态空间模型(SSMs)作为序列建模的有效替代方案,面临无法利用特定模态特征的问题。我们提出了一种新颖的SSM架构——Mixture-of-Mamba,通过对Mamba模块进行模态特定参数化,引入了模态感知稀疏性。该模型在多模态预训练中表现出色,能够在较早的训练步骤中达到相同的损失值,同时显著降低计算成本。我们的研究表明,模态感知稀疏性是一个有效的设计原则,能够将其影响从变换器扩展到SSMs,并在多模态预训练中设定新的基准。'}}}, {'id': 'https://huggingface.co/papers/2501.15427', 'title': 'OpenCharacter: Training Customizable Role-Playing LLMs with Large-Scale Synthetic Personas', 'url': 'https://huggingface.co/papers/2501.15427', 'abstract': 'Customizable role-playing in large language models (LLMs), also known as character generalization, is gaining increasing attention for its versatility and cost-efficiency in developing and deploying role-playing dialogue agents. This study explores a large-scale data synthesis approach to equip LLMs with character generalization capabilities. We begin by synthesizing large-scale character profiles using personas from Persona Hub and then explore two strategies: response rewriting and response generation, to create character-aligned instructional responses. To validate the effectiveness of our synthetic instruction tuning data for character generalization, we perform supervised fine-tuning (SFT) using the LLaMA-3 8B model. Our best-performing model strengthens the original LLaMA-3 8B Instruct model and achieves performance comparable to GPT-4o models on role-playing dialogue. 
We release our synthetic characters and instruction-tuning dialogues to support public research.', 'score': 3, 'issue_id': 1910, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'fa7a70d2c9f398b9', 'authors': ['Xiaoyang Wang', 'Hongming Zhang', 'Tao Ge', 'Wenhao Yu', 'Dian Yu', 'Dong Yu'], 'affiliations': ['Tencent AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.15427.jpg', 'data': {'categories': ['#training', '#data', '#agents', '#dataset', '#open_source', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Обучение ИИ искусству перевоплощения', 'desc': 'Исследование посвящено обучению больших языковых моделей (LLM) способности к обобщению характеров персонажей. Авторы синтезируют большой набор профилей персонажей и диалогов для инструктивной настройки модели. Используя эти данные, они проводят supervised fine-tuning модели LLaMA-3 8B. Полученная модель показывает результаты, сравнимые с GPT-4 в задачах ролевого диалога.'}, 'en': {'title': 'Empowering LLMs with Character Generalization for Role-Playing', 'desc': 'This paper discusses how to improve large language models (LLMs) for role-playing tasks by enabling them to adopt different character personas. The authors create a large dataset of character profiles and use two methods—response rewriting and response generation—to produce responses that match these characters. They then fine-tune the LLaMA-3 8B model with this synthetic data to enhance its ability to generate character-aligned dialogues. The results show that their improved model performs similarly to advanced models like GPT-4o in role-playing scenarios, and they provide their resources for further research.'}, 'zh': {'title': '增强大型语言模型的角色扮演能力', 'desc': '本文研究了如何通过大规模数据合成来增强大型语言模型(LLMs)的角色扮演能力。我们首先利用Persona Hub合成大量角色档案,然后探索了两种策略:响应重写和响应生成,以创建与角色对齐的指令响应。通过对LLaMA-3 8B模型进行监督微调(SFT),我们验证了合成指令调优数据在角色泛化方面的有效性。最终,我们的最佳模型在角色扮演对话中表现出色,达到了与GPT-4o模型相当的性能,并公开发布了合成角色和指令调优对话以支持公共研究。'}}}, {'id': 'https://huggingface.co/papers/2501.12370', 'title': 'Parameters vs FLOPs: Scaling Laws for Optimal Sparsity for Mixture-of-Experts Language Models', 'url': 'https://huggingface.co/papers/2501.12370', 'abstract': "Scaling the capacity of language models has consistently proven to be a reliable approach for improving performance and unlocking new capabilities. Capacity can be primarily defined by two dimensions: the number of model parameters and the compute per example. While scaling typically involves increasing both, the precise interplay between these factors and their combined contribution to overall capacity remains not fully understood. We explore this relationship in the context of sparse Mixture-of-Experts (MoEs), which allow scaling the number of parameters without proportionally increasing the FLOPs per example. We investigate how varying the sparsity level, i.e., the fraction of inactive parameters, impacts model's performance during pretraining and downstream few-shot evaluation. We find that under different constraints (e.g., parameter size and total training compute), there is an optimal level of sparsity that improves both training efficiency and model performance. 
These results provide a better understanding of the impact of sparsity in scaling laws for MoEs and complement existing works in this area, offering insights for designing more efficient architectures.", 'score': 3, 'issue_id': 1905, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'bffcdc51c572d8f2', 'authors': ['Samira Abnar', 'Harshay Shah', 'Dan Busbridge', 'Alaaeldin Mohamed Elnouby Ali', 'Josh Susskind', 'Vimal Thilak'], 'affiliations': ['Apple', 'MIT'], 'pdf_title_img': 'assets/pdf/title_img/2501.12370.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Оптимальная разреженность - ключ к эффективному масштабированию языковых моделей', 'desc': 'Статья исследует взаимосвязь между количеством параметров и вычислительной мощностью в контексте разреженных моделей Mixture-of-Experts (MoE). Авторы изучают, как изменение уровня разреженности влияет на производительность модели во время предварительного обучения и последующей оценки few-shot. Результаты показывают, что существует оптимальный уровень разреженности, который улучшает как эффективность обучения, так и производительность модели. Это исследование дополняет существующие работы в области масштабирования языковых моделей и предлагает insights для разработки более эффективных архитектур.'}, 'en': {'title': 'Unlocking Efficiency: The Power of Sparsity in Language Models', 'desc': "This paper investigates how to improve language models by scaling their capacity, focusing on two main factors: the number of parameters and the compute required for each example. It specifically looks at sparse Mixture-of-Experts (MoEs), which allow for a larger number of parameters without a corresponding increase in computational load. The authors explore how different levels of sparsity, or the proportion of inactive parameters, affect the model's performance during training and evaluation. Their findings suggest that there is an optimal level of sparsity that enhances both efficiency and performance, providing valuable insights for developing more effective machine learning architectures."}, 'zh': {'title': '优化稀疏性,提升模型性能', 'desc': '本文探讨了语言模型容量的扩展,特别是在稀疏混合专家(MoEs)框架下。容量主要由模型参数数量和每个样本的计算量决定。研究发现,在不同的约束条件下,存在一个最佳的稀疏水平,可以提高训练效率和模型性能。此研究为理解稀疏性在MoEs扩展法则中的影响提供了新的视角,并为设计更高效的架构提供了见解。'}}}, {'id': 'https://huggingface.co/papers/2501.15420', 'title': 'Visual Generation Without Guidance', 'url': 'https://huggingface.co/papers/2501.15420', 'abstract': 'Classifier-Free Guidance (CFG) has been a default technique in various visual generative models, yet it requires inference from both conditional and unconditional models during sampling. We propose to build visual models that are free from guided sampling. The resulting algorithm, Guidance-Free Training (GFT), matches the performance of CFG while reducing sampling to a single model, halving the computational cost. Unlike previous distillation-based approaches that rely on pretrained CFG networks, GFT enables training directly from scratch. GFT is simple to implement. It retains the same maximum likelihood objective as CFG and differs mainly in the parameterization of conditional models. Implementing GFT requires only minimal modifications to existing codebases, as most design choices and hyperparameters are directly inherited from CFG. Our extensive experiments across five distinct visual models demonstrate the effectiveness and versatility of GFT. 
Across domains of diffusion, autoregressive, and masked-prediction modeling, GFT consistently achieves comparable or even lower FID scores, with similar diversity-fidelity trade-offs compared with CFG baselines, all while being guidance-free. Code will be available at https://github.com/thu-ml/GFT.', 'score': 2, 'issue_id': 1912, 'pub_date': '2025-01-26', 'pub_date_card': {'ru': '26 января', 'en': 'January 26', 'zh': '1月26日'}, 'hash': 'd7e67912a685cbf9', 'authors': ['Huayu Chen', 'Kai Jiang', 'Kaiwen Zheng', 'Jianfei Chen', 'Hang Su', 'Jun Zhu'], 'affiliations': ['Department of Computer Science & Technology, Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.15420.jpg', 'data': {'categories': ['#optimization', '#diffusion', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'GFT: Эффективная генерация изображений без направляющей выборки', 'desc': 'Статья представляет новый метод обучения визуальных генеративных моделей - Guidance-Free Training (GFT). GFT позволяет достичь производительности Classifier-Free Guidance (CFG), но требует вдвое меньше вычислений при генерации изображений. Метод прост в реализации и может применяться для обучения моделей с нуля. Эксперименты показали эффективность GFT для различных типов моделей, включая диффузионные, авторегрессионные и модели с маскированием.'}, 'en': {'title': 'Guidance-Free Training: Simplifying Visual Generative Models', 'desc': 'This paper introduces Guidance-Free Training (GFT), a new approach for visual generative models that eliminates the need for classifier-free guidance during sampling. GFT achieves similar performance to traditional Classifier-Free Guidance (CFG) while only requiring a single model for inference, thus reducing computational costs by half. The method allows for training from scratch, avoiding reliance on pre-trained CFG networks, and retains the same maximum likelihood objective as CFG with minimal changes to existing implementations. Extensive experiments show that GFT performs comparably or better than CFG across various visual modeling domains, maintaining a good balance between diversity and fidelity.'}, 'zh': {'title': '无引导训练:降低计算成本的视觉生成新方法', 'desc': '无引导采样的视觉模型是本研究的核心。我们提出的无引导训练(GFT)算法在性能上与传统的分类器引导(CFG)相当,但只需使用单一模型进行采样,从而减少了计算成本。GFT可以直接从头开始训练,而不依赖于预训练的CFG网络,且实现简单。通过在五种不同的视觉模型上进行广泛实验,我们证明了GFT的有效性和多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.16273', 'title': 'Return of the Encoder: Maximizing Parameter Efficiency for SLMs', 'url': 'https://huggingface.co/papers/2501.16273', 'abstract': "The dominance of large decoder-only language models has overshadowed encoder-decoder architectures, despite their fundamental efficiency advantages in sequence processing. For small language models (SLMs) - those with 1 billion parameters or fewer - our systematic analysis across GPU, CPU, and NPU platforms reveals that encoder-decoder architectures achieve 47% lower first-token latency and 4.7x higher throughput compared to decoder-only models on edge devices. These gains may be attributed to encoder-decoder's one-time input processing and efficient separation of understanding and generation phases. 
We introduce a novel knowledge distillation framework that enables encoder-decoder models to leverage capabilities from large scalable decoder-only teachers while preserving their architectural advantages, achieving up to 6 average performance points improvement across diverse tasks, with significant gains in asymmetric sequence tasks where input and output distributions can benefit from different processing approaches. When combined with modern advances like Rotary Positional Embeddings (RoPE) and Vision encoders, our systematic investigation demonstrates that encoder-decoder architectures provide a more practical path toward deploying capable language models in resource-constrained environments. Our findings challenge the prevailing trend toward decoder-only scaling, showing that architectural choices become increasingly crucial as parameter budgets decrease, particularly for on-device and edge deployments where computational efficiency is paramount.", 'score': 2, 'issue_id': 1910, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': 'bd97733bda9e3557', 'authors': ['Mohamed Elfeki', 'Rui Liu', 'Chad Voegele'], 'affiliations': ['Microsoft'], 'pdf_title_img': 'assets/pdf/title_img/2501.16273.jpg', 'data': {'categories': ['#architecture', '#training', '#small_models', '#optimization', '#transfer_learning'], 'emoji': '🤖', 'ru': {'title': 'Энкодер-декодер: эффективное решение для малых языковых моделей', 'desc': 'Исследование показывает преимущества архитектуры энкодер-декодер для малых языковых моделей (до 1 млрд параметров) по сравнению с декодер-онли моделями. На периферийных устройствах энкодер-декодер модели демонстрируют на 47% меньшую задержку первого токена и в 4,7 раза большую пропускную способность. Предложен новый фреймворк дистилляции знаний, позволяющий энкодер-декодер моделям использовать возможности больших декодер-онли учителей. Результаты исследования ставят под сомнение тренд на масштабирование декодер-онли архитектур, особенно для ресурсно-ограниченных сред.'}, 'en': {'title': 'Unlocking Efficiency: The Power of Encoder-Decoder Models in Small Language Tasks', 'desc': 'This paper highlights the advantages of encoder-decoder architectures over large decoder-only language models, especially for small language models (SLMs) with 1 billion parameters or fewer. The authors demonstrate that encoder-decoder models can achieve significantly lower latency and higher throughput on edge devices due to their efficient processing of input and separation of understanding and generation phases. They introduce a new knowledge distillation framework that allows these models to benefit from the capabilities of larger decoder-only models while maintaining their efficiency. The study concludes that as parameter budgets decrease, the choice of architecture becomes critical for effective deployment in resource-constrained environments.'}, 'zh': {'title': '编码-解码架构的优势与应用', 'desc': '本论文分析了编码-解码架构在小型语言模型(SLMs)中的优势,尤其是在边缘设备上的表现。研究表明,编码-解码模型在首次令牌延迟上比仅解码模型低47%,并且吞吐量提高了4.7倍。这种优势源于编码-解码架构的一次性输入处理和理解与生成阶段的高效分离。我们还提出了一种新的知识蒸馏框架,使编码-解码模型能够利用大型解码教师的能力,同时保持其架构优势。'}}}, {'id': 'https://huggingface.co/papers/2501.14912', 'title': 'Feasible Learning', 'url': 'https://huggingface.co/papers/2501.14912', 'abstract': 'We introduce Feasible Learning (FL), a sample-centric learning paradigm where models are trained by solving a feasibility problem that bounds the loss for each training sample. 
In contrast to the ubiquitous Empirical Risk Minimization (ERM) framework, which optimizes for average performance, FL demands satisfactory performance on every individual data point. Since any model that meets the prescribed performance threshold is a valid FL solution, the choice of optimization algorithm and its dynamics play a crucial role in shaping the properties of the resulting solutions. In particular, we study a primal-dual approach which dynamically re-weights the importance of each sample during training. To address the challenge of setting a meaningful threshold in practice, we introduce a relaxation of FL that incorporates slack variables of minimal norm. Our empirical analysis, spanning image classification, age regression, and preference optimization in large language models, demonstrates that models trained via FL can learn from data while displaying improved tail behavior compared to ERM, with only a marginal impact on average performance.', 'score': 2, 'issue_id': 1898, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '7ded44debecf7694', 'authors': ['Juan Ramirez', 'Ignacio Hounie', 'Juan Elenter', 'Jose Gallego-Posada', 'Meraj Hashemizadeh', 'Alejandro Ribeiro', 'Simon Lacoste-Julien'], 'affiliations': ['Canada CIFAR AI Chair', 'Mila & Université de Montréal', 'Spotify', 'University of Pennsylvania'], 'pdf_title_img': 'assets/pdf/title_img/2501.14912.jpg', 'data': {'categories': ['#training', '#optimization'], 'emoji': '🎯', 'ru': {'title': 'Индивидуальный подход к каждому образцу данных', 'desc': 'В статье представлена новая парадигма обучения моделей машинного обучения - Feasible Learning (FL). В отличие от традиционного подхода минимизации эмпирического риска (ERM), FL стремится обеспечить удовлетворительную производительность для каждого отдельного образца данных. Авторы предлагают примально-двойственный подход, который динамически переопределяет важность каждого образца во время обучения. Эмпирический анализ на задачах классификации изображений, регрессии возраста и оптимизации предпочтений в больших языковых моделях показывает, что модели, обученные с помощью FL, демонстрируют улучшенное поведение на редких случаях по сравнению с ERM.'}, 'en': {'title': 'Ensuring Individual Sample Success with Feasible Learning', 'desc': "Feasible Learning (FL) is a new approach in machine learning that focuses on ensuring each training sample meets a specific performance standard, rather than just optimizing for overall average performance like traditional methods. This paradigm treats the training process as a feasibility problem, where any model that satisfies the performance criteria for all samples is considered valid. The paper explores a primal-dual optimization technique that adjusts the importance of each sample during training, enhancing the model's ability to learn effectively. 
Through various applications, including image classification and language model optimization, FL shows improved performance on challenging cases while maintaining similar average results compared to conventional methods."}, 'zh': {'title': '可行学习:每个样本都要优秀!', 'desc': '我们介绍了一种新的学习范式,称为可行学习(Feasible Learning,FL),它通过解决一个可行性问题来训练模型,从而限制每个训练样本的损失。与传统的经验风险最小化(Empirical Risk Minimization,ERM)框架不同,FL要求每个数据点都能达到满意的性能。FL的有效性依赖于优化算法的选择及其动态调整样本重要性的能力。我们的实证分析表明,使用FL训练的模型在图像分类、年龄回归和大语言模型的偏好优化中,能够在保持平均性能的同时,改善模型在极端情况下的表现。'}}}, {'id': 'https://huggingface.co/papers/2501.08325', 'title': 'GameFactory: Creating New Games with Generative Interactive Videos', 'url': 'https://huggingface.co/papers/2501.08325', 'abstract': 'Generative game engines have the potential to revolutionize game development by autonomously creating new content and reducing manual workload. However, existing video-based game generation methods fail to address the critical challenge of scene generalization, limiting their applicability to existing games with fixed styles and scenes. In this paper, we present GameFactory, a framework focused on exploring scene generalization in game video generation. To enable the creation of entirely new and diverse games, we leverage pre-trained video diffusion models trained on open-domain video data. To bridge the domain gap between open-domain priors and small-scale game dataset, we propose a multi-phase training strategy that decouples game style learning from action control, preserving open-domain generalization while achieving action controllability. Using Minecraft as our data source, we release GF-Minecraft, a high-quality and diversity action-annotated video dataset for research. Furthermore, we extend our framework to enable autoregressive action-controllable game video generation, allowing the production of unlimited-length interactive game videos. Experimental results demonstrate that GameFactory effectively generates open-domain, diverse, and action-controllable game videos, representing a significant step forward in AI-driven game generation. Our dataset and project page are publicly available at https://vvictoryuki.github.io/gamefactory/.', 'score': 47, 'issue_id': 1773, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '0331c9576ced4090', 'authors': ['Jiwen Yu', 'Yiran Qin', 'Xintao Wang', 'Pengfei Wan', 'Di Zhang', 'Xihui Liu'], 'affiliations': ['Kuaishou Technology', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.08325.jpg', 'data': {'categories': ['#dataset', '#video', '#open_source', '#diffusion', '#games', '#training', '#multimodal'], 'emoji': '🎮', 'ru': {'title': 'GameFactory: ИИ-революция в создании видеоигр', 'desc': 'GameFactory - это новая система для генерации видео игр с возможностью обобщения на различные сцены. Она использует предобученные модели диффузии видео на общих данных, что позволяет создавать разнообразные новые игры. Авторы предлагают многоэтапную стратегию обучения, которая разделяет изучение стиля игры и контроль действий. Система также поддерживает авторегрессивную генерацию видео игр с контролем действий неограниченной длины.'}, 'en': {'title': 'Revolutionizing Game Development with Scene Generalization', 'desc': 'This paper introduces GameFactory, a novel framework aimed at enhancing scene generalization in game video generation. 
It addresses the limitations of current methods that struggle with fixed styles and scenes by utilizing pre-trained video diffusion models on diverse video data. The authors propose a multi-phase training strategy that separates game style learning from action control, allowing for better generalization and controllability. The framework is validated using a new dataset, GF-Minecraft, which supports the generation of diverse and interactive game videos, marking a significant advancement in AI-driven game development.'}, 'zh': {'title': 'GameFactory:革命性的游戏视频生成框架', 'desc': '本论文介绍了GameFactory框架,旨在解决游戏视频生成中的场景泛化问题。现有的视频生成方法无法适应不同风格和场景的游戏,限制了其应用。我们利用预训练的视频扩散模型,并提出多阶段训练策略,以实现游戏风格学习与动作控制的解耦。实验结果表明,GameFactory能够有效生成开放域、多样化且可控的游戏视频,推动了AI驱动的游戏生成技术的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.09781', 'title': 'VideoWorld: Exploring Knowledge Learning from Unlabeled Videos', 'url': 'https://huggingface.co/papers/2501.09781', 'abstract': 'This work explores whether a deep generative model can learn complex knowledge solely from visual input, in contrast to the prevalent focus on text-based models like large language models (LLMs). We develop VideoWorld, an auto-regressive video generation model trained on unlabeled video data, and test its knowledge acquisition abilities in video-based Go and robotic control tasks. Our experiments reveal two key findings: (1) video-only training provides sufficient information for learning knowledge, including rules, reasoning and planning capabilities, and (2) the representation of visual change is crucial for knowledge acquisition. To improve both the efficiency and efficacy of this process, we introduce the Latent Dynamics Model (LDM) as a key component of VideoWorld. Remarkably, VideoWorld reaches a 5-dan professional level in the Video-GoBench with just a 300-million-parameter model, without relying on search algorithms or reward mechanisms typical in reinforcement learning. In robotic tasks, VideoWorld effectively learns diverse control operations and generalizes across environments, approaching the performance of oracle models in CALVIN and RLBench. This study opens new avenues for knowledge acquisition from visual data, with all code, data, and models open-sourced for further research.', 'score': 6, 'issue_id': 1779, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'db65df971ed9f199', 'authors': ['Zhongwei Ren', 'Yunchao Wei', 'Xun Guo', 'Yao Zhao', 'Bingyi Kang', 'Jiashi Feng', 'Xiaojie Jin'], 'affiliations': ['Beijing Jiaotong University', 'ByteDance Seed', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.09781.jpg', 'data': {'categories': ['#agents', '#video', '#open_source', '#small_models', '#rl', '#games', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'Визуальное обучение: от видео к глубоким знаниям', 'desc': 'Исследование посвящено обучению глубокой генеративной модели сложным знаниям исключительно на основе визуальных данных. Разработана модель VideoWorld, обученная на немаркированных видеоданных, которая тестируется на задачах игры в го и управления роботами. Ключевые выводы: визуальное обучение достаточно для приобретения знаний, включая правила, рассуждения и планирование, а представление визуальных изменений критично для этого процесса. 
Модель достигает уровня профессионала 5 дана в го и эффективно обучается управлению роботами в различных средах.'}, 'en': {'title': "Learning Knowledge from Visuals: VideoWorld's Breakthrough", 'desc': 'This paper investigates the ability of a deep generative model to learn complex knowledge from visual inputs, rather than relying on text-based models. The authors introduce VideoWorld, an auto-regressive model that generates videos and learns from unlabeled video data, demonstrating its effectiveness in tasks like video-based Go and robotic control. Key findings indicate that training solely on video data is sufficient for acquiring knowledge such as rules and reasoning, and that understanding visual changes is essential for this learning process. The introduction of the Latent Dynamics Model enhances the efficiency of knowledge acquisition, allowing VideoWorld to achieve high performance in various tasks without traditional reinforcement learning techniques.'}, 'zh': {'title': '从视觉数据中获取知识的新方法', 'desc': '本研究探讨了深度生成模型是否可以仅通过视觉输入学习复杂知识,而不是依赖于文本模型。我们开发了VideoWorld,这是一个基于自回归的视频生成模型,训练于未标记的视频数据,并测试其在视频围棋和机器人控制任务中的知识获取能力。实验结果表明,视频训练提供了足够的信息来学习规则、推理和规划能力,视觉变化的表示对知识获取至关重要。通过引入潜在动态模型(LDM),VideoWorld在视频围棋基准测试中达到了5段专业水平,且在机器人任务中有效学习了多种控制操作。'}}}, {'id': 'https://huggingface.co/papers/2501.09284', 'title': 'SEAL: Entangled White-box Watermarks on Low-Rank Adaptation', 'url': 'https://huggingface.co/papers/2501.09284', 'abstract': 'Recently, LoRA and its variants have become the de facto strategy for training and sharing task-specific versions of large pretrained models, thanks to their efficiency and simplicity. However, the issue of copyright protection for LoRA weights, especially through watermark-based techniques, remains underexplored. To address this gap, we propose SEAL (SEcure wAtermarking on LoRA weights), the universal whitebox watermarking for LoRA. SEAL embeds a secret, non-trainable matrix between trainable LoRA weights, serving as a passport to claim ownership. SEAL then entangles the passport with the LoRA weights through training, without extra loss for entanglement, and distributes the finetuned weights after hiding the passport. When applying SEAL, we observed no performance degradation across commonsense reasoning, textual/visual instruction tuning, and text-to-image synthesis tasks. We demonstrate that SEAL is robust against a variety of known attacks: removal, obfuscation, and ambiguity attacks.', 'score': 2, 'issue_id': 1782, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': '3c8f91b49b49bdd2', 'authors': ['Giyeong Oh', 'Saejin Kim', 'Woohyun Cho', 'Sangkyu Lee', 'Jiwan Chung', 'Dokyung Song', 'Youngjae Yu'], 'affiliations': ['Department of Artificial Intelligence, Yonsei University, Seoul, Republic of Korea', 'Department of Computer Science and Engineering, Yonsei University, Seoul, Republic of Korea'], 'pdf_title_img': 'assets/pdf/title_img/2501.09284.jpg', 'data': {'categories': ['#training', '#architecture', '#security'], 'emoji': '🔐', 'ru': {'title': 'SEAL: Защита авторских прав на LoRA-веса с помощью водяных знаков', 'desc': 'Статья представляет SEAL - универсальный метод водяных знаков для весов LoRA. SEAL встраивает секретную матрицу между обучаемыми весами LoRA, которая служит паспортом для подтверждения авторства. Метод не ухудшает производительность модели на различных задачах обработки естественного языка и компьютерного зрения. 
SEAL демонстрирует устойчивость к известным атакам на водяные знаки, таким как удаление, обфускация и атаки неоднозначности.'}, 'en': {'title': 'SEAL: Safeguarding LoRA Weights with Robust Watermarking', 'desc': "This paper introduces SEAL, a watermarking technique designed to protect LoRA weights used in machine learning. SEAL embeds a secret matrix within the trainable weights, allowing owners to claim their models without affecting performance. The method ensures that the watermark is integrated during training, maintaining the model's effectiveness across various tasks. Additionally, SEAL demonstrates resilience against common attacks aimed at removing or obscuring the watermark."}, 'zh': {'title': '保护LoRA权重的水印技术', 'desc': '最近,LoRA及其变体成为训练和共享特定任务的大型预训练模型的主要策略,因其高效和简单。然而,LoRA权重的版权保护问题,特别是基于水印的技术,仍然未得到充分研究。为了解决这个问题,我们提出了SEAL(LoRA权重的安全水印),这是一种通用的白盒水印技术。SEAL在可训练的LoRA权重之间嵌入一个秘密的、不可训练的矩阵,作为所有权的凭证,并在训练过程中将其与LoRA权重纠缠,确保性能不下降。'}}}, {'id': 'https://huggingface.co/papers/2501.09891', 'title': 'Evolving Deeper LLM Thinking', 'url': 'https://huggingface.co/papers/2501.09891', 'abstract': 'We explore an evolutionary search strategy for scaling inference time compute in Large Language Models. The proposed approach, Mind Evolution, uses a language model to generate, recombine and refine candidate responses. The proposed approach avoids the need to formalize the underlying inference problem whenever a solution evaluator is available. Controlling for inference cost, we find that Mind Evolution significantly outperforms other inference strategies such as Best-of-N and Sequential Revision in natural language planning tasks. In the TravelPlanner and Natural Plan benchmarks, Mind Evolution solves more than 98% of the problem instances using Gemini 1.5 Pro without the use of a formal solver.', 'score': 55, 'issue_id': 1750, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'f2f5bbede5781334', 'authors': ['Kuang-Huei Lee', 'Ian Fischer', 'Yueh-Hua Wu', 'Dave Marwood', 'Shumeet Baluja', 'Dale Schuurmans', 'Xinyun Chen'], 'affiliations': ['Google DeepMind', 'UC San Diego', 'University of Alberta'], 'pdf_title_img': 'assets/pdf/title_img/2501.09891.jpg', 'data': {'categories': ['#benchmark', '#inference', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Эволюция мышления: новый подход к оптимизации вывода в языковых моделях', 'desc': 'Статья представляет эволюционную стратегию поиска для масштабирования вычислений во время вывода в больших языковых моделях. Метод, названный Mind Evolution, использует языковую модель для генерации, рекомбинации и уточнения кандидатов-ответов. Этот подход устраняет необходимость формализации исходной задачи вывода, если доступен оценщик решений. При контроле за стоимостью вычислений, Mind Evolution значительно превосходит другие стратегии вывода в задачах планирования на естественном языке.'}, 'en': {'title': 'Mind Evolution: Revolutionizing Inference in Large Language Models', 'desc': 'This paper presents Mind Evolution, an innovative evolutionary search strategy designed to enhance the inference time of Large Language Models (LLMs). By leveraging a language model, Mind Evolution generates, recombines, and refines potential responses without needing to define the inference problem formally, as long as a solution evaluator is available. The results demonstrate that Mind Evolution significantly outperforms traditional inference methods like Best-of-N and Sequential Revision in natural language planning tasks. 
In benchmarks such as TravelPlanner and Natural Plan, Mind Evolution successfully solves over 98% of instances using Gemini 1.5 Pro, showcasing its effectiveness without relying on a formal solver.'}, 'zh': {'title': 'Mind Evolution:推理效率的新突破', 'desc': '本文探讨了一种用于大语言模型推理时间计算的进化搜索策略,称为Mind Evolution。该方法利用语言模型生成、重组和优化候选响应,避免了在有解决方案评估器的情况下需要形式化推理问题。通过控制推理成本,我们发现Mind Evolution在自然语言规划任务中显著优于其他推理策略,如Best-of-N和Sequential Revision。在TravelPlanner和Natural Plan基准测试中,Mind Evolution在不使用正式求解器的情况下,解决了超过98%的问题实例。'}}}, {'id': 'https://huggingface.co/papers/2501.10120', 'title': 'PaSa: An LLM Agent for Comprehensive Academic Paper Search', 'url': 'https://huggingface.co/papers/2501.10120', 'abstract': 'We introduce PaSa, an advanced Paper Search agent powered by large language models. PaSa can autonomously make a series of decisions, including invoking search tools, reading papers, and selecting relevant references, to ultimately obtain comprehensive and accurate results for complex scholarly queries. We optimize PaSa using reinforcement learning with a synthetic dataset, AutoScholarQuery, which includes 35k fine-grained academic queries and corresponding papers sourced from top-tier AI conference publications. Additionally, we develop RealScholarQuery, a benchmark collecting real-world academic queries to assess PaSa performance in more realistic scenarios. Despite being trained on synthetic data, PaSa significantly outperforms existing baselines on RealScholarQuery, including Google, Google Scholar, Google with GPT-4 for paraphrased queries, chatGPT (search-enabled GPT-4o), GPT-o1, and PaSa-GPT-4o (PaSa implemented by prompting GPT-4o). Notably, PaSa-7B surpasses the best Google-based baseline, Google with GPT-4o, by 37.78% in recall@20 and 39.90% in recall@50. It also exceeds PaSa-GPT-4o by 30.36% in recall and 4.25% in precision. Model, datasets, and code are available at https://github.com/bytedance/pasa.', 'score': 17, 'issue_id': 1750, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'bf3bfc73e6d5b31d', 'authors': ['Yichen He', 'Guanhua Huang', 'Peiyuan Feng', 'Yuan Lin', 'Yuchen Zhang', 'Hang Li', 'Weinan E'], 'affiliations': ['ByteDance Research', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.10120.jpg', 'data': {'categories': ['#agents', '#synthetic', '#benchmark', '#open_source', '#dataset', '#rl', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'PaSa: ИИ-агент для эффективного поиска научных статей', 'desc': 'PaSa - это продвинутый агент для поиска научных статей, основанный на больших языковых моделях. Он способен автономно принимать решения, включая использование поисковых инструментов, чтение статей и выбор релевантных ссылок для получения комплексных и точных результатов по сложным научным запросам. PaSa оптимизирован с помощью обучения с подкреплением на синтетическом наборе данных AutoScholarQuery, содержащем 35 тысяч детализированных академических запросов и соответствующих статей из ведущих конференций по ИИ. Несмотря на обучение на синтетических данных, PaSa значительно превосходит существующие базовые модели на реальном тестовом наборе RealScholarQuery, включая Google и ChatGPT.'}, 'en': {'title': 'Revolutionizing Academic Search with PaSa!', 'desc': 'The paper presents PaSa, a sophisticated Paper Search agent that utilizes large language models to enhance academic research. 
PaSa autonomously navigates the search process by making decisions such as invoking search tools, analyzing papers, and selecting pertinent references to deliver thorough and precise results for complex queries. It is optimized through reinforcement learning using a synthetic dataset called AutoScholarQuery, which contains 35,000 detailed academic queries and related papers from leading AI conferences. The performance of PaSa is evaluated against real-world queries using the RealScholarQuery benchmark, demonstrating significant improvements over existing search tools, including Google and various GPT models.'}, 'zh': {'title': 'PaSa:智能论文搜索的新纪元', 'desc': '本文介绍了一种名为PaSa的先进论文搜索代理,利用大型语言模型进行自主决策。PaSa能够调用搜索工具、阅读论文并选择相关参考文献,以获取复杂学术查询的全面和准确结果。我们通过强化学习优化PaSa,使用了一个包含35,000个细粒度学术查询的合成数据集AutoScholarQuery。尽管在合成数据上训练,PaSa在真实学术查询基准RealScholarQuery上的表现显著优于现有的基线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.09775', 'title': 'Multiple Choice Questions: Reasoning Makes Large Language Models (LLMs) More Self-Confident Even When They Are Wrong', 'url': 'https://huggingface.co/papers/2501.09775', 'abstract': 'One of the most widely used methods to evaluate LLMs are Multiple Choice Question (MCQ) tests. MCQ benchmarks enable the testing of LLM knowledge on almost any topic at scale as the results can be processed automatically. To help the LLM answer, a few examples called few shots can be included in the prompt. Moreover, the LLM can be asked to answer the question directly with the selected option or to first provide the reasoning and then the selected answer, which is known as chain of thought. In addition to checking whether the selected answer is correct, the evaluation can look at the LLM-estimated probability of its response as an indication of the confidence of the LLM in the response. In this paper, we study how the LLM confidence in its answer depends on whether the model has been asked to answer directly or to provide the reasoning before answering. The results of the evaluation of questions on a wide range of topics in seven different models show that LLMs are more confident in their answers when they provide reasoning before the answer. This occurs regardless of whether the selected answer is correct. Our hypothesis is that this behavior is due to the reasoning that modifies the probability of the selected answer, as the LLM predicts the answer based on the input question and the reasoning that supports the selection made. Therefore, LLM estimated probabilities seem to have intrinsic limitations that should be understood in order to use them in evaluation procedures. 
Interestingly, the same behavior has been observed in humans, for whom explaining an answer increases confidence in its correctness.', 'score': 12, 'issue_id': 1756, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'eb8938131508de10', 'authors': ['Tairan Fu', 'Javier Conde', 'Gonzalo Martínez', 'María Grandury', 'Pedro Reviriego'], 'affiliations': ['College of Mechanical and Electrical Engineering Nanjing University of Aeronautics and Astronautics Nanjing, China', 'ETSI de Telecomunicación Universidad Politécnica de Madrid Madrid, Spain', 'SomosNLP/Universidad Politécnica de Madrid Madrid, Spain', 'Universidad Carlos III de Madrid Madrid, Spain'], 'pdf_title_img': 'assets/pdf/title_img/2501.09775.jpg', 'data': {'categories': ['#benchmark', '#hallucinations', '#training', '#reasoning'], 'emoji': '🤔', 'ru': {'title': 'Рассуждения повышают уверенность ИИ, но не точность', 'desc': 'Статья исследует влияние цепочки рассуждений на уверенность языковых моделей в ответах на вопросы с множественным выбором. Авторы обнаружили, что модели более уверены в своих ответах, когда они предоставляют рассуждения перед ответом, независимо от правильности ответа. Это поведение наблюдалось у семи различных моделей на широком спектре тем. Исследователи предполагают, что это связано с тем, как рассуждения модифицируют вероятность выбранного ответа в процессе генерации.'}, 'en': {'title': 'Boosting LLM Confidence Through Reasoning!', 'desc': "This paper investigates how the confidence of large language models (LLMs) in their answers is influenced by the method of response. Specifically, it compares direct answers to those that include reasoning, known as the chain of thought approach. The study finds that LLMs exhibit higher confidence in their answers when they provide reasoning first, regardless of the correctness of the answer. This suggests that the reasoning process alters the model's probability estimates, highlighting potential limitations in using these probabilities for evaluation purposes."}, 'zh': {'title': '推理提升LLM回答信心的秘密', 'desc': '本文研究了大型语言模型(LLM)在回答多项选择题时的信心如何受到回答方式的影响。通过提供推理过程,LLM在选择答案时表现出更高的信心,无论所选答案是否正确。研究表明,推理过程会改变LLM对所选答案的概率估计,这可能是LLM信心的内在限制。类似的现象也在人的回答中观察到,解释答案会提高对其正确性的信心。'}}}, {'id': 'https://huggingface.co/papers/2501.10020', 'title': 'Textoon: Generating Vivid 2D Cartoon Characters from Text Descriptions', 'url': 'https://huggingface.co/papers/2501.10020', 'abstract': 'The 2D cartoon style is a prominent art form in digital character creation, particularly popular among younger audiences. While advancements in digital human technology have spurred extensive research into photorealistic digital humans and 3D characters, interactive 2D cartoon characters have received comparatively less attention. Unlike 3D counterparts, which require sophisticated construction and resource-intensive rendering, Live2D, a widely-used format for 2D cartoon characters, offers a more efficient alternative, which allows to animate 2D characters in a manner that simulates 3D movement without the necessity of building a complete 3D model. Furthermore, Live2D employs lightweight HTML5 (H5) rendering, improving both accessibility and efficiency. In this technical report, we introduce Textoon, an innovative method for generating diverse 2D cartoon characters in the Live2D format based on text descriptions. 
The Textoon leverages cutting-edge language and vision models to comprehend textual intentions and generate 2D appearance, capable of creating a wide variety of stunning and interactive 2D characters within one minute. The project homepage is https://human3daigc.github.io/Textoon_webpage/.', 'score': 12, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '828788f94bccbdc9', 'authors': ['Chao He', 'Jianqiang Ren', 'Liefeng Bo'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10020.jpg', 'data': {'categories': ['#3d', '#multimodal'], 'emoji': '🎨', 'ru': {'title': 'Textoon: ИИ создает 2D мультперсонажей по текстовому описанию', 'desc': 'В статье представлен метод Textoon для создания 2D мультипликационных персонажей в формате Live2D на основе текстовых описаний. Textoon использует современные языковые и визуальные модели для понимания текстовых намерений и генерации 2D внешнего вида персонажей. Метод способен создавать разнообразных интерактивных 2D персонажей менее чем за минуту. Live2D предлагает эффективную альтернативу 3D моделям, позволяя анимировать 2D персонажей, имитируя 3D движение, без необходимости создания полной 3D модели.'}, 'en': {'title': 'Transforming Text into 2D Cartoon Characters with Textoon!', 'desc': 'This paper presents Textoon, a novel approach for generating diverse 2D cartoon characters using the Live2D format. By utilizing advanced language and vision models, Textoon interprets text descriptions to create visually appealing and interactive characters efficiently. Unlike traditional 3D character models, Textoon allows for quick generation of 2D characters that simulate 3D movement without extensive resources. The method enhances accessibility and efficiency in digital character creation, catering especially to younger audiences.'}, 'zh': {'title': 'Textoon:快速生成多样化2D卡通角色的创新方法', 'desc': '这篇论文介绍了一种名为Textoon的方法,用于根据文本描述生成多样化的2D卡通角色。与3D角色相比,2D卡通角色的动画制作更为高效,Textoon利用先进的语言和视觉模型来理解文本意图,并生成2D外观。该方法使用Live2D格式,使得角色动画能够模拟3D运动,而无需构建完整的3D模型。Textoon能够在一分钟内创建出多种令人惊叹和互动的2D角色,提升了数字角色创作的效率和可访问性。'}}}, {'id': 'https://huggingface.co/papers/2501.09825', 'title': 'Bridging Language Barriers in Healthcare: A Study on Arabic LLMs', 'url': 'https://huggingface.co/papers/2501.09825', 'abstract': 'This paper investigates the challenges of developing large language models (LLMs) proficient in both multilingual understanding and medical knowledge. We demonstrate that simply translating medical data does not guarantee strong performance on clinical tasks in the target language. Our experiments reveal that the optimal language mix in training data varies significantly across different medical tasks. We find that larger models with carefully calibrated language ratios achieve superior performance on native-language clinical tasks. Furthermore, our results suggest that relying solely on fine-tuning may not be the most effective approach for incorporating new language knowledge into LLMs. Instead, data and computationally intensive pretraining methods may still be necessary to achieve optimal performance in multilingual medical settings. 
These findings provide valuable guidance for building effective and inclusive medical AI systems for diverse linguistic communities.', 'score': 8, 'issue_id': 1758, 'pub_date': '2025-01-16', 'pub_date_card': {'ru': '16 января', 'en': 'January 16', 'zh': '1月16日'}, 'hash': 'a2bf2d3dc7e978d7', 'authors': ['Nada Saadi', 'Tathagata Raha', 'Clément Christophe', 'Marco AF Pimentel', 'Ronnie Rajan', 'Praveen K Kanithi'], 'affiliations': ['M42 Health, Abu Dhabi, UAE'], 'pdf_title_img': 'assets/pdf/title_img/2501.09825.jpg', 'data': {'categories': ['#healthcare', '#training', '#science', '#low_resource', '#multilingual'], 'emoji': '🌐', 'ru': {'title': 'Многоязычные медицинские LLM: больше, чем просто перевод', 'desc': 'Статья исследует проблемы разработки больших языковых моделей (LLM), обладающих как многоязычным пониманием, так и медицинскими знаниями. Авторы показывают, что простой перевод медицинских данных не гарантирует высокой производительности на клинических задачах в целевом языке. Эксперименты выявляют, что оптимальное соотношение языков в обучающих данных значительно варьируется для разных медицинских задач. Результаты также указывают на то, что для включения знаний нового языка в LLM может потребоваться ресурсоемкое предобучение, а не только тонкая настройка.'}, 'en': {'title': 'Optimizing Multilingual Medical AI: Beyond Translation and Fine-Tuning', 'desc': 'This paper explores the difficulties in creating large language models (LLMs) that can understand multiple languages and possess medical expertise. It shows that merely translating medical information does not ensure good performance in clinical tasks for different languages. The research indicates that the best combination of languages in training data changes depending on the specific medical task. Additionally, it suggests that larger models with well-balanced language inputs perform better, and that extensive pretraining may be more beneficial than just fine-tuning for integrating new language capabilities.'}, 'zh': {'title': '多语言医学模型的优化之道', 'desc': '本论文探讨了开发能够理解多种语言和医学知识的大型语言模型(LLMs)所面临的挑战。我们证明,仅仅翻译医学数据并不能保证在目标语言的临床任务中表现良好。实验结果显示,不同医学任务对训练数据中的语言组合有显著不同的最佳需求。我们的研究表明,经过精心调整语言比例的大型模型在本土语言的临床任务中表现更佳,而仅依赖微调可能不是将新语言知识有效融入LLMs的最佳方法。'}}}, {'id': 'https://huggingface.co/papers/2501.10021', 'title': 'X-Dyna: Expressive Dynamic Human Image Animation', 'url': 'https://huggingface.co/papers/2501.10021', 'abstract': 'We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. 
Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.', 'score': 5, 'issue_id': 1752, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '4163d7e5ec4b04ce', 'authors': ['Di Chang', 'Hongyi Xu', 'You Xie', 'Yipeng Gao', 'Zhengfei Kuang', 'Shengqu Cai', 'Chenxu Zhang', 'Guoxian Song', 'Chao Wang', 'Yichun Shi', 'Zeyuan Chen', 'Shijie Zhou', 'Linjie Luo', 'Gordon Wetzstein', 'Mohammad Soleymani'], 'affiliations': ['ByteDance', 'Stanford University', 'University of California Los Angeles', 'University of California San Diego', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.10021.jpg', 'data': {'categories': ['#architecture', '#synthetic', '#diffusion', '#cv', '#video', '#multimodal'], 'emoji': '🎭', 'ru': {'title': 'Оживление статичных изображений с помощью ИИ: реалистичная анимация человека и окружения', 'desc': 'X-Dyna - это новый подход к анимации изображений человека с нуля, основанный на диффузионных моделях. Он использует выражения лица и движения тела из видео-драйвера для создания реалистичной динамики как субъекта, так и окружающей среды. В основе X-Dyna лежит модуль Dynamics-Adapter, который интегрирует контекст внешнего вида в пространственное внимание диффузионной модели. Система также включает локальный модуль управления для передачи выражений лица, что повышает реалистичность анимированных сцен.'}, 'en': {'title': 'X-Dyna: Realistic Animation from a Single Image', 'desc': 'X-Dyna is a new method for animating a single human image by using expressions and movements from a video. It improves on previous techniques by maintaining dynamic details, making animations look more realistic. The key part of X-Dyna is the Dynamics-Adapter, which helps blend the appearance of the subject with their movements while keeping the animation smooth. Additionally, it includes a module for accurately transferring facial expressions, resulting in more lifelike and expressive animations.'}, 'zh': {'title': 'X-Dyna:真实感动画的新突破', 'desc': 'X-Dyna是一种新颖的零样本扩散基础管道,能够通过驱动视频中的面部表情和身体动作为单个人物图像生成动画。该方法解决了以往人类姿态控制方法中的动态细节丢失问题,增强了视频动画的真实感。X-Dyna的核心是Dynamics-Adapter模块,它有效地将参考外观上下文整合到扩散模型的空间注意力中,同时保持运动模块合成流畅动态细节的能力。通过连接局部控制模块,X-Dyna能够捕捉与身份无关的面部表情,实现更真实的动画场景中的表情转移。'}}}, {'id': 'https://huggingface.co/papers/2501.10045', 'title': 'HiFi-SR: A Unified Generative Transformer-Convolutional Adversarial Network for High-Fidelity Speech Super-Resolution', 'url': 'https://huggingface.co/papers/2501.10045', 'abstract': 'The application of generative adversarial networks (GANs) has recently advanced speech super-resolution (SR) based on intermediate representations like mel-spectrograms. However, existing SR methods that typically rely on independently trained and concatenated networks may lead to inconsistent representations and poor speech quality, especially in out-of-domain scenarios. In this work, we propose HiFi-SR, a unified network that leverages end-to-end adversarial training to achieve high-fidelity speech super-resolution. Our model features a unified transformer-convolutional generator designed to seamlessly handle both the prediction of latent representations and their conversion into time-domain waveforms. 
The transformer network serves as a powerful encoder, converting low-resolution mel-spectrograms into latent space representations, while the convolutional network upscales these representations into high-resolution waveforms. To enhance high-frequency fidelity, we incorporate a multi-band, multi-scale time-frequency discriminator, along with a multi-scale mel-reconstruction loss in the adversarial training process. HiFi-SR is versatile, capable of upscaling any input speech signal between 4 kHz and 32 kHz to a 48 kHz sampling rate. Experimental results demonstrate that HiFi-SR significantly outperforms existing speech SR methods across both objective metrics and ABX preference tests, for both in-domain and out-of-domain scenarios (https://github.com/modelscope/ClearerVoice-Studio).', 'score': 4, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '8d8cd8e70ad62b51', 'authors': ['Shengkui Zhao', 'Kun Zhou', 'Zexu Pan', 'Yukun Ma', 'Chong Zhang', 'Bin Ma'], 'affiliations': ['Tongyi Lab, Alibaba Group, Singapore'], 'pdf_title_img': 'assets/pdf/title_img/2501.10045.jpg', 'data': {'categories': ['#audio', '#optimization'], 'emoji': '🎙️', 'ru': {'title': 'HiFi-SR: Единая сеть для сверхчеткой речи', 'desc': 'Статья представляет HiFi-SR - унифицированную нейронную сеть для высококачественного повышения разрешения речи. Модель использует единую архитектуру трансформер-сверточной сети для обработки мел-спектрограмм и генерации высококачественных аудиосигналов. Для улучшения качества высоких частот применяется многополосный дискриминатор и многомасштабная функция потерь реконструкции мел-спектрограмм. Экспериментальные результаты показывают превосходство HiFi-SR над существующими методами как по объективным метрикам, так и по субъективным тестам.'}, 'en': {'title': 'HiFi-SR: Elevating Speech Quality with Unified GANs', 'desc': 'This paper introduces HiFi-SR, a novel approach to speech super-resolution using generative adversarial networks (GANs). Unlike traditional methods that use separate networks, HiFi-SR employs a unified transformer-convolutional architecture for end-to-end training, improving the consistency and quality of generated speech. The transformer encodes low-resolution mel-spectrograms into latent representations, while the convolutional network converts these into high-resolution audio waveforms. The model also integrates a multi-band discriminator and a mel-reconstruction loss to enhance high-frequency details, achieving superior performance in various scenarios.'}, 'zh': {'title': 'HiFi-SR:高保真语音超分辨率的新方法', 'desc': '本研究提出了一种名为HiFi-SR的统一网络,用于语音超分辨率(SR),通过端到端的对抗训练实现高保真语音重建。该模型结合了变换器和卷积网络,能够有效地将低分辨率的mel谱图转换为高分辨率的时域波形。为了提高高频细节的保真度,我们在对抗训练中引入了多带宽、多尺度的时频判别器和多尺度mel重建损失。实验结果表明,HiFi-SR在目标指标和ABX偏好测试中显著优于现有的语音超分辨率方法,适用于不同的输入语音信号。'}}}, {'id': 'https://huggingface.co/papers/2501.10132', 'title': 'ComplexFuncBench: Exploring Multi-Step and Constrained Function Calling under Long-Context Scenario', 'url': 'https://huggingface.co/papers/2501.10132', 'abstract': 'Enhancing large language models (LLMs) with real-time APIs can help generate more accurate and up-to-date responses. However, evaluating the function calling abilities of LLMs in real-world scenarios remains under-explored due to the complexity of data collection and evaluation. In this work, we introduce ComplexFuncBench, a benchmark for complex function calling across five real-world scenarios. 
Compared to existing benchmarks, ComplexFuncBench encompasses multi-step and constrained function calling, which requires long-parameter filing, parameter value reasoning, and 128k long context. Additionally, we propose an automatic framework, ComplexEval, for quantitatively evaluating complex function calling tasks. Through comprehensive experiments, we demonstrate the deficiencies of state-of-the-art LLMs in function calling and suggest future directions for optimizing these capabilities. The data and code are available at https://github.com/THUDM/ComplexFuncBench.', 'score': 4, 'issue_id': 1749, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'de405dcc4bfc8efc', 'authors': ['Lucen Zhong', 'Zhengxiao Du', 'Xiaohan Zhang', 'Haiyi Hu', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.10132.jpg', 'data': {'categories': ['#long_context', '#optimization', '#data', '#benchmark'], 'emoji': '🧪', 'ru': {'title': 'Новый бенчмарк для оценки сложных вызовов функций в больших языковых моделях', 'desc': 'Данная статья представляет новый бенчмарк ComplexFuncBench для оценки способностей больших языковых моделей (LLM) вызывать сложные функции в реальных сценариях. Бенчмарк включает в себя многошаговые и ограниченные вызовы функций, требующие заполнения длинных параметров и рассуждений о значениях параметров. Авторы также предлагают автоматическую систему ComplexEval для количественной оценки задач сложного вызова функций. Эксперименты показывают недостатки современных LLM в вызове функций и предлагают направления для оптимизации этих возможностей.'}, 'en': {'title': 'Benchmarking Complex Function Calling in LLMs', 'desc': 'This paper presents ComplexFuncBench, a new benchmark designed to evaluate the function calling abilities of large language models (LLMs) in real-world scenarios. It focuses on complex tasks that involve multi-step and constrained function calling, which require advanced reasoning and handling of long contexts. The authors also introduce ComplexEval, an automatic framework for quantitatively assessing these complex function calling tasks. Through their experiments, they highlight the limitations of current state-of-the-art LLMs and propose directions for improving their performance in this area.'}, 'zh': {'title': '提升LLMs函数调用能力的基准与评估', 'desc': '本论文提出了ComplexFuncBench,这是一个用于评估大型语言模型(LLMs)在复杂函数调用方面的基准测试。该基准涵盖了五种真实场景,涉及多步骤和受限的函数调用,要求模型进行长参数填写和参数值推理。我们还提出了ComplexEval,一个自动化框架,用于定量评估复杂函数调用任务的能力。通过实验,我们展示了当前最先进的LLMs在函数调用方面的不足,并提出了未来优化的方向。'}}}, {'id': 'https://huggingface.co/papers/2501.09978', 'title': 'GaussianAvatar-Editor: Photorealistic Animatable Gaussian Head Avatar Editor', 'url': 'https://huggingface.co/papers/2501.09978', 'abstract': 'We introduce GaussianAvatar-Editor, an innovative framework for text-driven editing of animatable Gaussian head avatars that can be fully controlled in expression, pose, and viewpoint. Unlike static 3D Gaussian editing, editing animatable 4D Gaussian avatars presents challenges related to motion occlusion and spatial-temporal inconsistency. To address these issues, we propose the Weighted Alpha Blending Equation (WABE). This function enhances the blending weight of visible Gaussians while suppressing the influence on non-visible Gaussians, effectively handling motion occlusion during editing. Furthermore, to improve editing quality and ensure 4D consistency, we incorporate conditional adversarial learning into the editing process. 
This strategy helps to refine the edited results and maintain consistency throughout the animation. By integrating these methods, our GaussianAvatar-Editor achieves photorealistic and consistent results in animatable 4D Gaussian editing. We conduct comprehensive experiments across various subjects to validate the effectiveness of our proposed techniques, which demonstrates the superiority of our approach over existing methods. More results and code are available at: [Project Link](https://xiangyueliu.github.io/GaussianAvatar-Editor/).', 'score': 2, 'issue_id': 1751, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': 'e5b8603f26a902f9', 'authors': ['Xiangyue Liu', 'Kunming Luo', 'Heng Li', 'Qi Zhang', 'Yuan Liu', 'Li Yi', 'Ping Tan'], 'affiliations': ['Hong Kong University of Science and Technology', 'Tencent AI Lab', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09978.jpg', 'data': {'categories': ['#3d'], 'emoji': '🤖', 'ru': {'title': 'Революция в редактировании анимируемых 3D-аватаров с помощью гауссовых моделей', 'desc': 'Статья представляет GaussianAvatar-Editor - инновационную систему для редактирования анимируемых гауссовых аватаров головы на основе текстовых инструкций. Авторы предлагают функцию Weighted Alpha Blending Equation (WABE) для решения проблем, связанных с окклюзией при движении и пространственно-временной несогласованностью. Система использует условное состязательное обучение для улучшения качества редактирования и обеспечения согласованности в 4D. Эксперименты показывают превосходство предложенного подхода над существующими методами в создании фотореалистичных и согласованных результатов редактирования анимируемых 4D гауссовых аватаров.'}, 'en': {'title': 'Revolutionizing 4D Avatar Editing with GaussianAvatar-Editor', 'desc': 'GaussianAvatar-Editor is a new framework designed for editing animated Gaussian head avatars using text inputs. It tackles challenges like motion occlusion and maintaining spatial-temporal consistency, which are common in 4D animations. The framework introduces the Weighted Alpha Blending Equation (WABE) to improve the blending of visible elements while minimizing the impact of non-visible ones. Additionally, it employs conditional adversarial learning to enhance the quality of edits and ensure consistency throughout the animation process, resulting in photorealistic outputs.'}, 'zh': {'title': '高斯头像编辑的创新之路', 'desc': '我们介绍了GaussianAvatar-Editor,这是一个创新的框架,用于基于文本驱动的可动画高斯头像编辑。与静态3D高斯编辑不同,编辑可动画的4D高斯头像面临运动遮挡和时空不一致等挑战。为了解决这些问题,我们提出了加权阿尔法混合方程(WABE),该函数增强了可见高斯的混合权重,同时抑制了对不可见高斯的影响。通过结合条件对抗学习,我们提高了编辑质量并确保了4D一致性,从而实现了逼真且一致的可动画4D高斯编辑结果。'}}}, {'id': 'https://huggingface.co/papers/2501.17161', 'title': 'SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training', 'url': 'https://huggingface.co/papers/2501.17161', 'abstract': "Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used post-training techniques for foundation models. However, their roles in enhancing model generalization capabilities remain unclear. This paper studies the difference between SFT and RL on generalization and memorization, focusing on text-based rule variants and visual variants. We introduce GeneralPoints, an arithmetic reasoning card game, and adopt V-IRL, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants in both textual and visual domains. 
We show that RL, especially when trained with an outcome-based reward, generalizes across both rule-based textual and visual variants. SFT, in contrast, tends to memorize training data and struggles to generalize out-of-distribution scenarios. Further analysis reveals that RL improves the model's underlying visual recognition capabilities, contributing to its enhanced generalization in the visual domain. Despite RL's superior generalization, we show that SFT remains essential for effective RL training; SFT stabilizes the model's output format, enabling subsequent RL to achieve its performance gains. These findings demonstrates the capability of RL for acquiring generalizable knowledge in complex, multi-modal tasks.", 'score': 28, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'ce9300709a3cdc7a', 'authors': ['Tianzhe Chu', 'Yuexiang Zhai', 'Jihan Yang', 'Shengbang Tong', 'Saining Xie', 'Dale Schuurmans', 'Quoc V. Le', 'Sergey Levine', 'Yi Ma'], 'affiliations': ['Google DeepMind', 'HKU', 'NYU', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.17161.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#rl', '#multimodal', '#games'], 'emoji': '🧠', 'ru': {'title': 'RL превосходит SFT в обобщении для мультимодальных задач', 'desc': 'Это исследование сравнивает методы дообучения языковых моделей: обучение с учителем (SFT) и обучение с подкреплением (RL). Авторы анализируют способность моделей к обобщению на новые текстовые и визуальные варианты задач. Результаты показывают, что RL лучше обобщается на новые ситуации, особенно при использовании награды, основанной на результате. SFT, напротив, склонно к запоминанию обучающих данных и хуже справляется с обобщением.'}, 'en': {'title': 'Unlocking Generalization: RL Outshines SFT in Multi-Modal Tasks', 'desc': 'This paper investigates how supervised fine-tuning (SFT) and reinforcement learning (RL) affect the generalization abilities of foundation models. It highlights that while SFT often leads to memorization of training data, RL, particularly with outcome-based rewards, enhances generalization across unseen textual and visual variants. The study introduces GeneralPoints, a reasoning game, and V-IRL, a navigation environment, to evaluate model performance. The results indicate that RL not only improves generalization but also strengthens visual recognition, although SFT is still crucial for stabilizing the model before RL training.'}, 'zh': {'title': '强化学习提升模型泛化能力的研究', 'desc': '这篇论文研究了监督微调(SFT)和强化学习(RL)在基础模型中的作用,特别是在提高模型的泛化能力方面。研究表明,RL在处理文本和视觉变体时,能够更好地泛化,而SFT则倾向于记忆训练数据,难以应对未见过的情况。通过引入算术推理卡牌游戏GeneralPoints和真实世界导航环境V-IRL,作者评估了这两种方法的效果。尽管RL在泛化能力上表现优越,但SFT仍然对有效的RL训练至关重要,因为它稳定了模型的输出格式。'}}}, {'id': 'https://huggingface.co/papers/2501.17116', 'title': 'Optimizing Large Language Model Training Using FP4 Quantization', 'url': 'https://huggingface.co/papers/2501.17116', 'abstract': 'The growing computational demands of training large language models (LLMs) necessitate more efficient methods. Quantized training presents a promising solution by enabling low-bit arithmetic operations to reduce these costs. While FP8 precision has demonstrated feasibility, leveraging FP4 remains a challenge due to significant quantization errors and limited representational capacity. 
This work introduces the first FP4 training framework for LLMs, addressing these challenges with two key innovations: a differentiable quantization estimator for precise weight updates and an outlier clamping and compensation strategy to prevent activation collapse. To ensure stability, the framework integrates a mixed-precision training scheme and vector-wise quantization. Experimental results demonstrate that our FP4 framework achieves accuracy comparable to BF16 and FP8, with minimal degradation, scaling effectively to 13B-parameter LLMs trained on up to 100B tokens. With the emergence of next-generation hardware supporting FP4, our framework sets a foundation for efficient ultra-low precision training.', 'score': 13, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '9ce85dc91aee17fc', 'authors': ['Ruizhe Wang', 'Yeyun Gong', 'Xiao Liu', 'Guoshuai Zhao', 'Ziyue Yang', 'Baining Guo', 'Zhengjun Zha', 'Peng Cheng'], 'affiliations': ['Microsoft Research Asia', 'Microsoft SIGMA Team', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.17116.jpg', 'data': {'categories': ['#optimization', '#training', '#inference'], 'emoji': '🔢', 'ru': {'title': 'FP4: Революция в эффективности обучения языковых моделей', 'desc': 'Статья представляет первую систему обучения больших языковых моделей (LLM) с использованием 4-битной точности с плавающей запятой (FP4). Авторы разработали дифференцируемый оценщик квантования для точного обновления весов и стратегию ограничения и компенсации выбросов для предотвращения коллапса активаций. Система включает схему обучения со смешанной точностью и векторное квантование для обеспечения стабильности. Экспериментальные результаты показывают, что FP4-обучение достигает точности, сравнимой с BF16 и FP8, эффективно масштабируясь до LLM с 13 млрд параметров.'}, 'en': {'title': 'Efficient Training of Large Language Models with FP4 Precision', 'desc': 'This paper addresses the high computational costs associated with training large language models (LLMs) by introducing a novel FP4 training framework. The framework utilizes quantized training techniques, specifically focusing on low-bit arithmetic to enhance efficiency while maintaining model accuracy. Key innovations include a differentiable quantization estimator for better weight updates and a strategy to manage outliers, which helps prevent activation collapse. Experimental results show that this FP4 approach achieves performance similar to higher precision formats like BF16 and FP8, making it suitable for large-scale LLMs.'}, 'zh': {'title': 'FP4训练框架:高效的超低精度训练新方案', 'desc': '随着大型语言模型(LLMs)训练对计算资源的需求不断增加,寻找更高效的方法变得尤为重要。量化训练通过允许低位数算术运算来降低这些成本,展现出良好的前景。尽管FP8精度已被证明可行,但FP4的应用仍面临显著的量化误差和有限的表示能力。本文提出了首个FP4训练框架,通过可微分量化估计器和异常值钳制与补偿策略,解决了这些挑战,并在稳定性方面结合了混合精度训练方案和向量级量化。'}}}, {'id': 'https://huggingface.co/papers/2501.16975', 'title': 'Over-Tokenized Transformer: Vocabulary is Generally Worth Scaling', 'url': 'https://huggingface.co/papers/2501.16975', 'abstract': 'Tokenization is a fundamental component of large language models (LLMs), yet its influence on model scaling and performance is not fully explored. In this paper, we introduce Over-Tokenized Transformers, a novel framework that decouples input and output vocabularies to improve language modeling performance. Specifically, our approach scales up input vocabularies to leverage multi-gram tokens. 
Through extensive experiments, we uncover a log-linear relationship between input vocabulary size and training loss, demonstrating that larger input vocabularies consistently enhance model performance, regardless of model size. Using a large input vocabulary, we achieve performance comparable to double-sized baselines with no additional cost. Our findings highlight the importance of tokenization in scaling laws and provide practical insight for tokenizer design, paving the way for more efficient and powerful LLMs.', 'score': 10, 'issue_id': 1920, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '27930c2f5d17471e', 'authors': ['Hongzhi Huang', 'Defa Zhu', 'Banggu Wu', 'Yutao Zeng', 'Ya Wang', 'Qiyang Min', 'Xun Zhou'], 'affiliations': ['Seed-Foundation-Model Team, Bytedance'], 'pdf_title_img': 'assets/pdf/title_img/2501.16975.jpg', 'data': {'categories': ['#optimization', '#training', '#architecture'], 'emoji': '🔤', 'ru': {'title': 'Больше токенов - выше эффективность: новый взгляд на масштабирование языковых моделей', 'desc': 'Статья представляет новый подход к токенизации в больших языковых моделях, называемый Over-Tokenized Transformers. Авторы предлагают разделить входной и выходной словари, увеличивая размер входного словаря для использования мультиграммных токенов. Исследование выявило логарифмически-линейную зависимость между размером входного словаря и потерями при обучении. Результаты показывают, что увеличение входного словаря стабильно улучшает производительность модели независимо от её размера.'}, 'en': {'title': 'Unlocking Performance: The Power of Over-Tokenization in Language Models', 'desc': "This paper presents a new approach called Over-Tokenized Transformers, which focuses on improving the tokenization process in large language models (LLMs). By separating the input and output vocabularies, the authors demonstrate that increasing the input vocabulary size can significantly reduce training loss and enhance model performance. Their experiments reveal a consistent log-linear relationship between the size of the input vocabulary and the model's effectiveness, showing that larger vocabularies lead to better results without increasing computational costs. This research emphasizes the critical role of tokenization in the scaling of LLMs and offers valuable insights for designing more efficient tokenizers."}, 'zh': {'title': '分词技术提升大语言模型性能的关键', 'desc': '本文探讨了大语言模型中的分词技术对模型性能的影响。我们提出了一种新的框架——过度分词变换器,旨在通过解耦输入和输出词汇表来提升语言建模性能。研究表明,增大输入词汇表可以有效降低训练损失,从而提高模型性能。我们的实验结果显示,使用更大的输入词汇表可以在不增加成本的情况下,达到与双倍基线相当的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.16764', 'title': 'DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation', 'url': 'https://huggingface.co/papers/2501.16764', 'abstract': 'Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. 
In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptions of numerous techniques for image generation to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.', 'score': 8, 'issue_id': 1921, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': '00ee1a0338716711', 'authors': ['Chenguo Lin', 'Panwang Pan', 'Bangbang Yang', 'Zeming Li', 'Yadong Mu'], 'affiliations': ['ByteDance', 'Peking University'], 'pdf_title_img': 'assets/pdf/title_img/2501.16764.jpg', 'data': {'categories': ['#diffusion', '#optimization', '#training', '#dataset', '#3d'], 'emoji': '🎨', 'ru': {'title': 'DiffSplat: Генерация 3D контента на новом уровне', 'desc': 'DiffSplat - это новая система генерации 3D контента, использующая диффузионные модели для создания трехмерных гауссовых сплатов. Она решает проблемы ограниченных 3D датасетов и несогласованности при мультиракурсной 2D генерации. DiffSplat объединяет масштабные 2D-приоры с 3D-согласованностью, используя легковесную модель реконструкции и специальную функцию потерь. Эксперименты показывают превосходство DiffSplat в задачах генерации по тексту и изображениям.'}, 'en': {'title': 'Revolutionizing 3D Generation with DiffSplat', 'desc': 'DiffSplat is a new framework for generating 3D content from text or images, addressing challenges like the lack of high-quality 3D datasets. It uses advanced text-to-image diffusion models to create 3D Gaussian splats while ensuring consistency across different views. The framework includes a lightweight reconstruction model that helps quickly generate multi-view datasets for training. Through extensive testing, DiffSplat shows improved performance in generating 3D content and offers insights into its effective design choices.'}, 'zh': {'title': 'DiffSplat:3D生成的新突破', 'desc': '最近,3D内容生成从文本或单张图像中取得了进展,但高质量3D数据集有限,且2D多视图生成存在不一致性。我们提出了DiffSplat,这是一种新颖的3D生成框架,能够通过控制大规模文本到图像的扩散模型,原生生成3D高斯点云。与以往的3D生成模型不同,DiffSplat有效利用了网络规模的2D先验,同时在统一模型中保持3D一致性。通过引入轻量级重建模型和3D渲染损失,DiffSplat在文本和图像条件生成任务中表现出色,且在下游应用中也显示出其优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.16496', 'title': 'Open Problems in Mechanistic Interpretability', 'url': 'https://huggingface.co/papers/2501.16496', 'abstract': "Mechanistic interpretability aims to understand the computational mechanisms underlying neural networks' capabilities in order to accomplish concrete scientific and engineering goals. Progress in this field thus promises to provide greater assurance over AI system behavior and shed light on exciting scientific questions about the nature of intelligence. Despite recent progress toward these goals, there are many open problems in the field that require solutions before many scientific and practical benefits can be realized: Our methods require both conceptual and practical improvements to reveal deeper insights; we must figure out how best to apply our methods in pursuit of specific goals; and the field must grapple with socio-technical challenges that influence and are influenced by our work. 
This forward-facing review discusses the current frontier of mechanistic interpretability and the open problems that the field may benefit from prioritizing.", 'score': 7, 'issue_id': 1920, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '5a7a914accebfa33', 'authors': ['Lee Sharkey', 'Bilal Chughtai', 'Joshua Batson', 'Jack Lindsey', 'Jeff Wu', 'Lucius Bushnaq', 'Nicholas Goldowsky-Dill', 'Stefan Heimersheim', 'Alejandro Ortega', 'Joseph Bloom', 'Stella Biderman', 'Adria Garriga-Alonso', 'Arthur Conmy', 'Neel Nanda', 'Jessica Rumbelow', 'Martin Wattenberg', 'Nandi Schoots', 'Joseph Miller', 'Eric J. Michaud', 'Stephen Casper', 'Max Tegmark', 'William Saunders', 'David Bau', 'Eric Todd', 'Atticus Geiger', 'Mor Geva', 'Jesse Hoogland', 'Daniel Murfet', 'Tom McGrath'], 'affiliations': ['Anthropic', 'Apollo Research', 'Google DeepMind', 'Harvard University', 'Imperial College London', 'Kings College London', 'Leap Laboratories', 'MIT', 'Northeastern University', 'Tel Aviv University', 'University of Melbourne'], 'pdf_title_img': 'assets/pdf/title_img/2501.16496.jpg', 'data': {'categories': ['#interpretability', '#survey'], 'emoji': '🧠', 'ru': {'title': 'Раскрывая тайны нейронных сетей: путь к пониманию искусственного интеллекта', 'desc': 'Статья посвящена механистической интерпретируемости нейронных сетей, цель которой - понять вычислительные механизмы, лежащие в основе их возможностей. Прогресс в этой области обещает обеспечить большую уверенность в поведении систем искусственного интеллекта и пролить свет на природу интеллекта. Авторы обсуждают открытые проблемы в области, требующие решения для реализации научных и практических преимуществ. Статья рассматривает текущие границы механистической интерпретируемости и приоритетные задачи для дальнейшего развития области.'}, 'en': {'title': 'Unlocking the Secrets of Neural Networks for Reliable AI', 'desc': 'Mechanistic interpretability focuses on understanding how neural networks work to achieve specific tasks, which can enhance the reliability of AI systems. This area of research aims to uncover the underlying processes that contribute to the intelligence exhibited by these models. Despite advancements, there are still significant challenges that need to be addressed, including improving methods for deeper insights and applying these methods effectively. Additionally, the field must consider socio-technical issues that affect and are affected by mechanistic interpretability efforts.'}, 'zh': {'title': '揭示神经网络的计算机制', 'desc': '机械解释性旨在理解神经网络能力背后的计算机制,以实现具体的科学和工程目标。该领域的进展有望提高对人工智能系统行为的信心,并揭示关于智能本质的有趣科学问题。尽管最近在这些目标上取得了一些进展,但仍有许多未解决的问题需要解决,以便实现更多的科学和实际利益。本文回顾了机械解释性的当前前沿及该领域应优先解决的开放问题。'}}}, {'id': 'https://huggingface.co/papers/2501.16372', 'title': 'Low-Rank Adapters Meet Neural Architecture Search for LLM Compression', 'url': 'https://huggingface.co/papers/2501.16372', 'abstract': 'The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. 
Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.', 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'f1d43a985dbea0af', 'authors': ['J. Pablo Muñoz', 'Jinjie Yuan', 'Nilesh Jain'], 'affiliations': ['Intel Corporation', 'Intel Labs'], 'pdf_title_img': 'assets/pdf/title_img/2501.16372.jpg', 'data': {'categories': ['#inference', '#optimization', '#open_source', '#training', '#low_resource', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективная настройка крупных языковых моделей для ограниченных ресурсов', 'desc': 'Эта статья рассматривает проблему больших вычислительных ресурсов, необходимых для настройки и развертывания крупных языковых моделей (LLM). Авторы предлагают комбинировать низкоранговые адаптеры и методы поиска нейронных архитектур (NAS) для эффективной настройки параметров. Такой подход позволяет сжимать и дообучать большие предобученные модели, делая их более доступными в условиях ограниченных ресурсов. В результате получаются модели с меньшим потреблением памяти и более быстрым выводом, что открывает путь к более практичному применению LLM.'}, 'en': {'title': 'Democratizing Large Language Models with Efficient Fine-Tuning Techniques', 'desc': 'This paper addresses the challenges of using Large Language Models (LLMs) due to their high computational demands. It explores the use of low-rank adapters for parameter-efficient fine-tuning (PEFT), which helps reduce the resources needed. The authors combine low-rank representations with Neural Architecture Search (NAS) techniques, particularly through weight-sharing super-networks, to create efficient solutions for model compression and fine-tuning. The findings suggest that these strategies can make LLMs more accessible and practical for deployment in environments with limited resources, resulting in models that are faster and require less memory.'}, 'zh': {'title': '低秩适配器助力大型语言模型的高效微调', 'desc': '大型语言模型(LLMs)的快速发展带来了在微调和部署时对计算资源的巨大挑战。最近,低秩适配器在参数高效微调(PEFT)方面显示出了良好的效果。本文回顾了将低秩表示与神经架构搜索(NAS)技术相结合的创新方法,特别是权重共享超网络。通过整合这些方法,开发了压缩和微调大型预训练模型的稳健解决方案,使得LLMs在资源受限的环境中更易于部署。'}}}, {'id': 'https://huggingface.co/papers/2501.15747', 'title': 'IndicMMLU-Pro: Benchmarking Indic Large Language Models on Multi-Task Language Understanding', 'url': 'https://huggingface.co/papers/2501.15747', 'abstract': "Known by more than 1.5 billion people in the Indian subcontinent, Indic languages present unique challenges and opportunities for natural language processing (NLP) research due to their rich cultural heritage, linguistic diversity, and complex structures. IndicMMLU-Pro is a comprehensive benchmark designed to evaluate Large Language Models (LLMs) across Indic languages, building upon the MMLU Pro (Massive Multitask Language Understanding) framework. 
Covering major languages such as Hindi, Bengali, Gujarati, Marathi, Kannada, Punjabi, Tamil, Telugu, and Urdu, our benchmark addresses the unique challenges and opportunities presented by the linguistic diversity of the Indian subcontinent. This benchmark encompasses a wide range of tasks in language comprehension, reasoning, and generation, meticulously crafted to capture the intricacies of Indian languages. IndicMMLU-Pro provides a standardized evaluation framework to push the research boundaries in Indic language AI, facilitating the development of more accurate, efficient, and culturally sensitive models. This paper outlines the benchmarks' design principles, task taxonomy, and data collection methodology, and presents baseline results from state-of-the-art multilingual models.", 'score': 4, 'issue_id': 1918, 'pub_date': '2025-01-27', 'pub_date_card': {'ru': '27 января', 'en': 'January 27', 'zh': '1月27日'}, 'hash': '4b666d035c5e5c4c', 'authors': ['Sankalp KJ', 'Ashutosh Kumar', 'Laxmaan Balaji', 'Nikunj Kotecha', 'Vinija Jain', 'Aman Chadha', 'Sreyoshi Bhaduri'], 'affiliations': ['Amazon Gen AI', 'Artificial Intelligence Institute, University of South Carolina', 'Independent Researcher', 'Meta AI', 'Rochester Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.15747.jpg', 'data': {'categories': ['#reasoning', '#low_resource', '#multilingual', '#benchmark'], 'emoji': '🇮🇳', 'ru': {'title': 'Новый рубеж в NLP: комплексная оценка языковых моделей для индийских языков', 'desc': 'IndicMMLU-Pro - это комплексный бенчмарк для оценки языковых моделей в индийских языках. Он охватывает 9 основных языков Индийского субконтинента и включает широкий спектр задач по пониманию языка, рассуждению и генерации текста. Бенчмарк разработан с учетом уникальных особенностей и сложностей индийских языков. IndicMMLU-Pro предоставляет стандартизированную систему оценки для продвижения исследований в области ИИ для индийских языков.'}, 'en': {'title': 'Empowering Indic Languages with Advanced NLP Benchmarks', 'desc': 'The paper introduces IndicMMLU-Pro, a benchmark specifically designed to assess Large Language Models (LLMs) in the context of Indic languages. It builds on the existing MMLU Pro framework and includes major languages like Hindi, Bengali, and Tamil, addressing the unique linguistic challenges of the Indian subcontinent. The benchmark features a variety of tasks that test language comprehension, reasoning, and generation, ensuring a comprehensive evaluation of models. By providing a standardized framework, IndicMMLU-Pro aims to enhance the development of more accurate and culturally aware AI models for Indic languages.'}, 'zh': {'title': '推动印度语言AI研究的基准', 'desc': 'IndicMMLU-Pro是一个专门为印度语言设计的基准,旨在评估大型语言模型(LLMs)的表现。该基准基于MMLU Pro框架,涵盖了印地语、孟加拉语、古吉拉特语等主要语言,解决了印度次大陆语言的多样性带来的挑战。它包括语言理解、推理和生成等多种任务,旨在捕捉印度语言的复杂性。通过提供标准化的评估框架,IndicMMLU-Pro推动了印度语言人工智能的研究,促进了更准确、高效和文化敏感的模型的发展。'}}}, {'id': 'https://huggingface.co/papers/2501.17117', 'title': 'Histoires Morales: A French Dataset for Assessing Moral Alignment', 'url': 'https://huggingface.co/papers/2501.17117', 'abstract': 'Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. 
Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce Histoires Morales, a French dataset derived from Moral Stories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. Histoires Morales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data.', 'score': 2, 'issue_id': 1924, 'pub_date': '2025-01-28', 'pub_date_card': {'ru': '28 января', 'en': 'January 28', 'zh': '1月28日'}, 'hash': 'd2d1461e245219e8', 'authors': ['Thibaud Leteno', 'Irina Proskurina', 'Antoine Gourru', 'Julien Velcin', 'Charlotte Laclau', 'Guillaume Metzler', 'Christophe Gravier'], 'affiliations': ['Laboratoire Hubert Curien, UMR CNRS 5516, Saint-Etienne, France', 'Télécom Paris, Institut Polytechnique de Paris, Paris, France', 'Université Lumière Lyon 2, Université Claude Bernard Lyon 1, ERIC, 69007, Lyon, France'], 'pdf_title_img': 'assets/pdf/title_img/2501.17117.jpg', 'data': {'categories': ['#dataset', '#multilingual', '#alignment', '#ethics'], 'emoji': '🇫🇷', 'ru': {'title': 'Французский датасет для морального выравнивания языковых моделей', 'desc': "Статья представляет набор данных 'Histoires Morales' на французском языке для выравнивания языковых моделей с человеческими ценностями. Этот датасет создан на основе 'Moral Stories' путем перевода и адаптации к французскому культурному контексту. Исследование включает эксперименты по выравниванию мультиязычных моделей на французских и английских данных. Результаты показывают, что языковые модели в целом соответствуют человеческим моральным нормам, но могут быть легко подвержены влиянию при оптимизации под предпочтения пользователей."}, 'en': {'title': 'Bridging Language Models and French Moral Values', 'desc': 'This paper emphasizes the importance of aligning language models with human values, particularly in the context of the French language. It introduces Histoires Morales, a dataset created from Moral Stories, which has been translated and refined to reflect French cultural norms and moral reasoning. The dataset includes various social situations to better understand how language models handle moral values in French. 
Preliminary experiments show that while language models generally align with human morals, they can be swayed by user preferences, highlighting the need for careful optimization.'}, 'zh': {'title': '让语言模型与人类价值观对齐', 'desc': '本论文强调了将语言模型与人类价值观对齐的重要性,尤其是在日常生活中。我们介绍了一个名为Histoires Morales的法语数据集,旨在填补法语在道德推理方面的研究空白。该数据集通过翻译和母语者的帮助进行精细化,确保其语法准确并适应法国文化背景。我们的初步实验表明,尽管大型语言模型通常与人类道德规范一致,但它们可以通过用户偏好优化轻易受到影响。'}}}, {'id': 'https://huggingface.co/papers/2501.01895', 'title': 'EnerVerse: Envisioning Embodied Future Space for Robotics Manipulation', 'url': 'https://huggingface.co/papers/2501.01895', 'abstract': "We introduce EnerVerse, a comprehensive framework for embodied future space generation specifically designed for robotic manipulation tasks. EnerVerse seamlessly integrates convolutional and bidirectional attention mechanisms for inner-chunk space modeling, ensuring low-level consistency and continuity. Recognizing the inherent redundancy in video data, we propose a sparse memory context combined with a chunkwise unidirectional generative paradigm to enable the generation of infinitely long sequences. To further augment robotic capabilities, we introduce the Free Anchor View (FAV) space, which provides flexible perspectives to enhance observation and analysis. The FAV space mitigates motion modeling ambiguity, removes physical constraints in confined environments, and significantly improves the robot's generalization and adaptability across various tasks and settings. To address the prohibitive costs and labor intensity of acquiring multi-camera observations, we present a data engine pipeline that integrates a generative model with 4D Gaussian Splatting (4DGS). This pipeline leverages the generative model's robust generalization capabilities and the spatial constraints provided by 4DGS, enabling an iterative enhancement of data quality and diversity, thus creating a data flywheel effect that effectively narrows the sim-to-real gap. Finally, our experiments demonstrate that the embodied future space generation prior substantially enhances policy predictive capabilities, resulting in improved overall performance, particularly in long-range robotic manipulation tasks.", 'score': 41, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'bae2a6e63f87958d', 'authors': ['Siyuan Huang', 'Liliang Chen', 'Pengfei Zhou', 'Shengcong Chen', 'Zhengkai Jiang', 'Yue Hu', 'Peng Gao', 'Hongsheng Li', 'Maoqing Yao', 'Guanghui Ren'], 'affiliations': ['AgiBot', 'CUHK', 'FDU', 'HIT', 'HKUST', 'SJTU', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.01895.jpg', 'data': {'categories': ['#3d', '#data', '#robotics'], 'emoji': '🤖', 'ru': {'title': 'EnerVerse: Революция в пространственном моделировании для роботов-манипуляторов', 'desc': 'EnerVerse - это комплексная система для генерации пространства будущего в задачах роботизированной манипуляции. Она использует сверточные механизмы и двунаправленное внимание для моделирования внутренних фрагментов пространства, обеспечивая согласованность на низком уровне. Система вводит пространство Free Anchor View для гибких перспектив наблюдения и анализа, улучшая обобщение и адаптивность робота. 
EnerVerse также включает конвейер данных, интегрирующий генеративную модель с 4D Gaussian Splatting для сужения разрыва между симуляцией и реальностью.'}, 'en': {'title': 'Empowering Robots with EnerVerse: A New Era in Space Generation and Manipulation', 'desc': 'EnerVerse is a new framework designed to help robots better understand and manipulate their environments. It uses advanced techniques like convolutional and bidirectional attention mechanisms to create a consistent model of space. By recognizing that video data often has unnecessary information, EnerVerse employs a sparse memory context to generate long sequences efficiently. Additionally, the Free Anchor View (FAV) space allows robots to observe from different angles, improving their ability to adapt and perform tasks in various settings.'}, 'zh': {'title': 'EnerVerse:提升机器人操作的未来空间生成框架', 'desc': '本文介绍了EnerVerse,这是一个专为机器人操作任务设计的未来空间生成框架。EnerVerse结合了卷积和双向注意机制,以确保内部空间建模的一致性和连续性。我们提出了一种稀疏记忆上下文和单向生成范式的结合,能够生成无限长的序列,从而提高机器人的能力。通过引入自由锚视图空间(FAV),我们增强了观察和分析的灵活性,显著改善了机器人在各种任务和环境中的泛化能力和适应性。'}}}, {'id': 'https://huggingface.co/papers/2501.01957', 'title': 'VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction', 'url': 'https://huggingface.co/papers/2501.01957', 'abstract': 'Recent Multimodal Large Language Models (MLLMs) have typically focused on integrating visual and textual modalities, with less emphasis placed on the role of speech in enhancing interaction. However, speech plays a crucial role in multimodal dialogue systems, and implementing high-performance in both vision and speech tasks remains a significant challenge due to the fundamental modality differences. In this paper, we propose a carefully designed multi-stage training methodology that progressively trains LLM to understand both visual and speech information, ultimately enabling fluent vision and speech interaction. Our approach not only preserves strong vision-language capacity, but also enables efficient speech-to-speech dialogue capabilities without separate ASR and TTS modules, significantly accelerating multimodal end-to-end response speed. By comparing our method against state-of-the-art counterparts across benchmarks for image, video, and speech tasks, we demonstrate that our model is equipped with both strong visual and speech capabilities, making near real-time vision and speech interaction.', 'score': 19, 'issue_id': 1506, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'b6690c7efedf5a39', 'authors': ['Chaoyou Fu', 'Haojia Lin', 'Xiong Wang', 'Yi-Fan Zhang', 'Yunhang Shen', 'Xiaoyu Liu', 'Yangze Li', 'Zuwei Long', 'Heting Gao', 'Ke Li', 'Xiawu Zheng', 'Rongrong Ji', 'Xing Sun', 'Caifeng Shan', 'Ran He'], 'affiliations': ['CASIA', 'NJU', 'Tencent Youtu Lab', 'XMU'], 'pdf_title_img': 'assets/pdf/title_img/2501.01957.jpg', 'data': {'categories': ['#training', '#cv', '#multimodal', '#benchmark', '#audio'], 'emoji': '🗣️', 'ru': {'title': 'Революция в мультимодальном взаимодействии: речь и зрение в одной модели', 'desc': 'В статье представлена новая методология обучения мультимодальных языковых моделей, объединяющая визуальную и речевую модальности. Авторы предлагают поэтапный подход к обучению, который позволяет модели эффективно понимать как визуальную, так и речевую информацию. Модель демонстрирует высокую производительность в задачах обработки изображений, видео и речи, превосходя современные аналоги. 
Этот подход обеспечивает возможность ведения диалога с использованием речи и изображений в режиме, близком к реальному времени.'}, 'en': {'title': 'Enhancing Multimodal Interaction with Speech and Vision Integration', 'desc': 'This paper introduces a novel training methodology for Multimodal Large Language Models (MLLMs) that enhances their ability to process both visual and speech data. The proposed multi-stage training approach allows the model to progressively learn and integrate information from images, videos, and spoken language, facilitating seamless interaction. By eliminating the need for separate Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) modules, the model achieves faster response times in multimodal dialogues. Experimental results show that this method not only maintains strong vision-language performance but also excels in speech tasks, enabling near real-time interactions.'}, 'zh': {'title': '实现流畅的视觉与语音交互', 'desc': '最近的多模态大型语言模型(MLLMs)主要集中在视觉和文本的整合上,而对语音在增强交互中的作用关注较少。然而,语音在多模态对话系统中起着至关重要的作用,如何在视觉和语音任务中实现高性能仍然是一个重大挑战。本文提出了一种精心设计的多阶段训练方法,逐步训练大型语言模型理解视觉和语音信息,从而实现流畅的视觉和语音交互。我们的方法不仅保持了强大的视觉-语言能力,还实现了高效的语音对话能力,显著加快了多模态端到端的响应速度。'}}}, {'id': 'https://huggingface.co/papers/2501.01904', 'title': 'Virgo: A Preliminary Exploration on Reproducing o1-like MLLM', 'url': 'https://huggingface.co/papers/2501.01904', 'abstract': 'Recently, slow-thinking reasoning systems, built upon large language models (LLMs), have garnered widespread attention by scaling the thinking time during inference. There is also growing interest in adapting this capability to multimodal large language models (MLLMs). Given that MLLMs handle more complex data semantics across different modalities, it is intuitively more challenging to implement multimodal slow-thinking systems. To address this issue, in this paper, we explore a straightforward approach by fine-tuning a capable MLLM with a small amount of textual long-form thought data, resulting in a multimodal slow-thinking system, Virgo (Visual reasoning with long thought). We find that these long-form reasoning processes, expressed in natural language, can be effectively transferred to MLLMs. Moreover, it seems that such textual reasoning data can be even more effective than visual reasoning data in eliciting the slow-thinking capacities of MLLMs. While this work is preliminary, it demonstrates that slow-thinking capacities are fundamentally associated with the language model component, which can be transferred across modalities or domains. This finding can be leveraged to guide the development of more powerful slow-thinking reasoning systems. We release our resources at https://github.com/RUCAIBox/Virgo.', 'score': 12, 'issue_id': 1505, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '576423a20b419d0f', 'authors': ['Yifan Du', 'Zikang Liu', 'Yifan Li', 'Wayne Xin Zhao', 'Yuqi Huo', 'Bingning Wang', 'Weipeng Chen', 'Zheng Liu', 'Zhongyuan Wang', 'Ji-Rong Wen'], 'affiliations': ['BAAI', 'Baichuan AI', 'Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01904.jpg', 'data': {'categories': ['#reasoning', '#multimodal', '#transfer_learning', '#training'], 'emoji': '🧠', 'ru': {'title': 'Обучение мультимодальных ИИ длительным рассуждениям через текст', 'desc': 'Статья описывает исследование в области мультимодальных больших языковых моделей (MLLM) и их способности к медленному мышлению. 
Авторы предлагают метод Virgo, который позволяет обучить MLLM длительным рассуждениям с помощью небольшого количества текстовых данных. Результаты показывают, что текстовые данные для обучения рассуждениям могут быть даже эффективнее визуальных. Это исследование демонстрирует, что способности к медленному мышлению в основном связаны с языковым компонентом модели и могут переноситься между модальностями.'}, 'en': {'title': 'Unlocking Slow-Thinking in Multimodal Models with Textual Reasoning', 'desc': 'This paper discusses the development of a multimodal slow-thinking reasoning system called Virgo, which is based on fine-tuning a multimodal large language model (MLLM) using long-form textual reasoning data. The authors found that incorporating long-form reasoning in natural language significantly enhances the slow-thinking capabilities of MLLMs, even more so than using visual reasoning data. This suggests that the slow-thinking abilities are closely linked to the language model aspect, allowing for effective transfer across different data modalities. The research indicates a promising direction for creating advanced reasoning systems that can handle complex data semantics.'}, 'zh': {'title': '多模态慢思维推理的探索', 'desc': '最近,基于大型语言模型(LLMs)的慢思维推理系统引起了广泛关注,尤其是在推理过程中延长思考时间的能力。本文探讨了如何将这种能力应用于多模态大型语言模型(MLLMs),尽管处理不同模态的复杂数据语义更具挑战性。我们通过微调一个强大的MLLM,使用少量的长文本思维数据,成功构建了一个多模态慢思维系统,命名为Virgo(视觉推理与长思维)。研究表明,长文本推理过程可以有效转移到MLLMs,并且这种文本推理数据在激发MLLMs的慢思维能力方面,似乎比视觉推理数据更有效。'}}}, {'id': 'https://huggingface.co/papers/2412.21059', 'title': 'VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation', 'url': 'https://huggingface.co/papers/2412.21059', 'abstract': 'We present a general strategy to aligning visual generation models -- both image and video generation -- with human preference. To start with, we build VisionReward -- a fine-grained and multi-dimensional reward model. We decompose human preferences in images and videos into multiple dimensions, each represented by a series of judgment questions, linearly weighted and summed to an interpretable and accurate score. To address the challenges of video quality assessment, we systematically analyze various dynamic features of videos, which helps VisionReward surpass VideoScore by 17.2% and achieve top performance for video preference prediction. Based on VisionReward, we develop a multi-objective preference learning algorithm that effectively addresses the issue of confounding factors within preference data. Our approach significantly outperforms existing image and video scoring methods on both machine metrics and human evaluation. 
All code and datasets are provided at https://github.com/THUDM/VisionReward.', 'score': 11, 'issue_id': 1510, 'pub_date': '2024-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '1f3bb267ffa751d9', 'authors': ['Jiazheng Xu', 'Yu Huang', 'Jiale Cheng', 'Yuanming Yang', 'Jiajun Xu', 'Yuan Wang', 'Wenbo Duan', 'Shen Yang', 'Qunlin Jin', 'Shurun Li', 'Jiayan Teng', 'Zhuoyi Yang', 'Wendi Zheng', 'Xiao Liu', 'Ming Ding', 'Xiaohan Zhang', 'Xiaotao Gu', 'Shiyu Huang', 'Minlie Huang', 'Jie Tang', 'Yuxiao Dong'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2412.21059.jpg', 'data': {'categories': ['#rag', '#training', '#open_source', '#cv', '#video', '#optimization', '#alignment'], 'emoji': '🎥', 'ru': {'title': 'VisionReward: многомерная оценка визуального контента с учетом человеческих предпочтений', 'desc': 'Исследователи представили стратегию для согласования моделей генерации визуального контента с человеческими предпочтениями. Они разработали VisionReward - многомерную модель вознаграждения, которая декомпозирует предпочтения в изображениях и видео на несколько измерений. Для оценки качества видео были проанализированы различные динамические характеристики, что позволило VisionReward превзойти существующие методы на 17.2%. На основе VisionReward был разработан алгоритм многоцелевого обучения предпочтениям, эффективно решающий проблему конфаундинг-факторов в данных о предпочтениях.'}, 'en': {'title': 'Aligning Visual Generation with Human Preferences', 'desc': 'This paper introduces a method for aligning visual generation models, such as those for images and videos, with human preferences. The authors create a reward model called VisionReward, which breaks down human preferences into multiple dimensions assessed through specific judgment questions. They enhance video quality assessment by analyzing dynamic features, leading to a 17.2% improvement over previous methods. Additionally, a multi-objective preference learning algorithm is developed to manage confounding factors in preference data, resulting in superior performance compared to existing scoring methods.'}, 'zh': {'title': '视觉生成模型与人类偏好的完美对齐', 'desc': '本文提出了一种通用策略,用于将视觉生成模型(包括图像和视频生成)与人类偏好对齐。我们构建了VisionReward,这是一个细粒度和多维度的奖励模型,能够将人类对图像和视频的偏好分解为多个维度。通过分析视频的动态特征,VisionReward在视频偏好预测中超越了现有方法,提升了17.2%的性能。基于VisionReward,我们开发了一种多目标偏好学习算法,有效解决了偏好数据中的混淆因素问题。'}}}, {'id': 'https://huggingface.co/papers/2501.01821', 'title': 'SDPO: Segment-Level Direct Preference Optimization for Social Agents', 'url': 'https://huggingface.co/papers/2501.01821', 'abstract': "Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level methods. The turn-level method is overly fine-grained, focusing exclusively on individual turns, while session-level methods are too coarse-grained, often introducing training noise. To address these limitations, we propose Segment-Level Direct Preference Optimization (SDPO), which focuses on specific key segments within interactions to optimize multi-turn agent behavior while minimizing training noise. 
Evaluations on the SOTOPIA benchmark demonstrate that SDPO-tuned agents consistently outperform both existing DPO-based methods and proprietary LLMs like GPT-4o, underscoring SDPO's potential to advance the social intelligence of LLM-based agents. We release our code and data at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/SDPO.", 'score': 10, 'issue_id': 1514, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '499b008b0bce4f74', 'authors': ['Aobo Kong', 'Wentao Ma', 'Shiwan Zhao', 'Yongbin Li', 'Yuchuan Wu', 'Ke Wang', 'Xiaoqian Liu', 'Qicheng Li', 'Yong Qin', 'Fei Huang'], 'affiliations': ['TMCC, CS, Nankai University', 'Tongyi Lab', 'alibaba-inc.com'], 'pdf_title_img': 'assets/pdf/title_img/2501.01821.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#rlhf', '#agents', '#alignment', '#training'], 'emoji': '🤖', 'ru': {'title': 'SDPO: Новый шаг к созданию социально интеллектуальных ИИ-агентов', 'desc': 'В статье представлен новый метод оптимизации поведения языковых моделей (LLM) в сложных многоходовых социальных диалогах - Segment-Level Direct Preference Optimization (SDPO). SDPO фокусируется на ключевых сегментах взаимодействия, что позволяет эффективнее оптимизировать поведение агентов по сравнению с существующими методами. Эксперименты на бенчмарке SOTOPIA показали, что агенты, настроенные с помощью SDPO, превосходят как другие методы на основе DPO, так и проприетарные модели вроде GPT-4. Это демонстрирует потенциал SDPO для повышения социального интеллекта агентов на основе LLM.'}, 'en': {'title': 'Enhancing Social Intelligence in LLMs with SDPO', 'desc': "This paper introduces Segment-Level Direct Preference Optimization (SDPO), a new method for improving the performance of social agents powered by large language models (LLMs) in complex dialogues. Unlike existing methods that either focus too narrowly on individual turns or too broadly on entire sessions, SDPO targets specific key segments of conversations to better align agent behavior with human preferences. The approach reduces training noise and enhances the agent's ability to engage in multi-turn interactions effectively. Evaluations show that agents trained with SDPO outperform both traditional DPO methods and advanced LLMs like GPT-4o, highlighting its effectiveness in enhancing social intelligence."}, 'zh': {'title': '提升社交智能的新方法:分段级直接偏好优化', 'desc': '本论文提出了一种新的方法,称为分段级直接偏好优化(SDPO),旨在提高大型语言模型(LLM)在多轮社交对话中的表现。现有的直接偏好优化(DPO)方法在处理多轮交互时存在细粒度和粗粒度的局限性,导致训练噪声。SDPO通过关注交互中的关键段落,优化代理的多轮行为,从而减少训练噪声。实验结果表明,SDPO调优的代理在SOTOPIA基准测试中表现优于现有的DPO方法和其他大型语言模型,显示出其在提升社交智能方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01073', 'title': 'Graph Generative Pre-trained Transformer', 'url': 'https://huggingface.co/papers/2501.01073', 'abstract': "Graph generation is a critical task in numerous domains, including molecular design and social network analysis, due to its ability to model complex relationships and structured data. While most modern graph generative models utilize adjacency matrix representations, this work revisits an alternative approach that represents graphs as sequences of node set and edge set. We advocate for this approach due to its efficient encoding of graphs and propose a novel representation. Based on this representation, we introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns graph structures via next-token prediction. 
To further exploit G2PT's capabilities as a general-purpose foundation model, we explore fine-tuning strategies for two downstream applications: goal-oriented generation and graph property prediction. We conduct extensive experiments across multiple datasets. Results indicate that G2PT achieves superior generative performance on both generic graph and molecule datasets. Furthermore, G2PT exhibits strong adaptability and versatility in downstream tasks from molecular design to property prediction.", 'score': 9, 'issue_id': 1508, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '596abc88d57e0650', 'authors': ['Xiaohui Chen', 'Yinkai Wang', 'Jiaxing He', 'Yuanqi Du', 'Soha Hassoun', 'Xiaolin Xu', 'Li-Ping Liu'], 'affiliations': ['Cornell University', 'Northeastern University', 'Tufts University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01073.jpg', 'data': {'categories': ['#dataset', '#optimization', '#training', '#architecture', '#data', '#graphs'], 'emoji': '🕸️', 'ru': {'title': 'G2PT: Универсальный трансформер для эффективной генерации графов', 'desc': 'В статье представлена новая модель генерации графов - Graph Generative Pre-trained Transformer (G2PT). G2PT использует альтернативный подход к представлению графов в виде последовательностей множеств узлов и рёбер вместо матриц смежности. Модель обучается предсказывать следующий токен авторегрессивным способом. G2PT показывает превосходные результаты в генерации как общих графов, так и молекул, а также демонстрирует хорошую адаптивность к различным задачам.'}, 'en': {'title': 'Revolutionizing Graph Generation with G2PT', 'desc': 'This paper focuses on improving graph generation, which is important for tasks like designing molecules and analyzing social networks. Instead of using the common adjacency matrix, it proposes a new way to represent graphs as sequences of node and edge sets, making the encoding more efficient. The authors introduce the Graph Generative Pre-trained Transformer (G2PT), an auto-regressive model that learns to generate graph structures by predicting the next token in a sequence. Through various experiments, they demonstrate that G2PT outperforms existing models in generating graphs and is effective in applications like molecular design and predicting graph properties.'}, 'zh': {'title': '图生成的创新:G2PT模型', 'desc': '图生成在许多领域中非常重要,比如分子设计和社交网络分析,因为它能够建模复杂的关系和结构化数据。本文提出了一种新的图表示方法,将图表示为节点集和边集的序列,而不是传统的邻接矩阵。基于这种表示,我们引入了图生成预训练变换器(G2PT),这是一种通过下一个标记预测学习图结构的自回归模型。实验结果表明,G2PT在通用图和分子数据集上表现出色,并且在分子设计和属性预测等下游任务中具有很强的适应性和多功能性。'}}}, {'id': 'https://huggingface.co/papers/2501.00874', 'title': 'LUSIFER: Language Universal Space Integration for Enhanced Multilingual Embeddings with Large Language Models', 'url': 'https://huggingface.co/papers/2501.00874', 'abstract': "Recent advancements in large language models (LLMs) based embedding models have established new state-of-the-art benchmarks for text embedding tasks, particularly in dense vector-based retrieval. However, these models predominantly focus on English, leaving multilingual embedding capabilities largely unexplored. To address this limitation, we present LUSIFER, a novel zero-shot approach that adapts LLM-based embedding models for multilingual tasks without requiring multilingual supervision. LUSIFER's architecture combines a multilingual encoder, serving as a language-universal learner, with an LLM-based embedding model optimized for embedding-specific tasks. 
These components are seamlessly integrated through a minimal set of trainable parameters that act as a connector, effectively transferring the multilingual encoder's language understanding capabilities to the specialized embedding model. Additionally, to comprehensively evaluate multilingual embedding performance, we introduce a new benchmark encompassing 5 primary embedding tasks, 123 diverse datasets, and coverage across 14 languages. Extensive experimental results demonstrate that LUSIFER significantly enhances the multilingual performance across various embedding tasks, particularly for medium and low-resource languages, without requiring explicit multilingual training data.", 'score': 7, 'issue_id': 1507, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': '5bdfec436923a2a6', 'authors': ['Hieu Man', 'Nghia Trung Ngo', 'Viet Dac Lai', 'Ryan A. Rossi', 'Franck Dernoncourt', 'Thien Huu Nguyen'], 'affiliations': ['Adobe Research, USA', 'Dept. of Computer Science, University of Oregon, OR, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.00874.jpg', 'data': {'categories': ['#transfer_learning', '#architecture', '#benchmark', '#multilingual', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Универсальные многоязычные эмбеддинги без многоязычного обучения', 'desc': 'LUSIFER - это новый подход к созданию многоязычных эмбеддингов без использования многоязычных обучающих данных. Он объединяет многоязычный энкодер и LLM-модель для эмбеддингов через набор обучаемых параметров. Авторы также представили новый бенчмарк для оценки качества многоязычных эмбеддингов, охватывающий 5 основных задач, 123 датасета и 14 языков. Эксперименты показали, что LUSIFER значительно улучшает многоязычную производительность, особенно для языков с ограниченными ресурсами.'}, 'en': {'title': 'LUSIFER: Bridging Multilingual Gaps in Text Embedding', 'desc': "This paper introduces LUSIFER, a new method that enhances large language models (LLMs) for multilingual text embedding tasks. Unlike existing models that mainly focus on English, LUSIFER uses a zero-shot approach to adapt LLMs for multiple languages without needing multilingual training data. It combines a multilingual encoder with an LLM-based embedding model, allowing for effective language understanding and embedding performance. The authors also present a comprehensive benchmark to evaluate LUSIFER's performance across various languages and tasks, showing significant improvements, especially for less-resourced languages."}, 'zh': {'title': 'LUSIFER:无监督多语言嵌入的新突破', 'desc': '最近,大型语言模型(LLMs)在文本嵌入任务中取得了新的突破,尤其是在基于密集向量的检索方面。然而,这些模型主要集中在英语上,导致多语言嵌入能力尚未得到充分探索。为了解决这个问题,我们提出了LUSIFER,这是一种新颖的零样本方法,可以在不需要多语言监督的情况下,将LLM嵌入模型适应于多语言任务。LUSIFER的架构结合了一个多语言编码器和一个针对嵌入特定任务优化的LLM嵌入模型,通过一组最小的可训练参数实现无缝连接,有效地将多语言编码器的语言理解能力转移到专门的嵌入模型上。'}}}, {'id': 'https://huggingface.co/papers/2501.01540', 'title': 'BoxingGym: Benchmarking Progress in Automated Experimental Design and Model Discovery', 'url': 'https://huggingface.co/papers/2501.01540', 'abstract': "Understanding the world and explaining it with scientific theories is a central aspiration of artificial intelligence research. Proposing theories, designing experiments to test them, and then revising them based on data are fundamental to scientific discovery. Despite the significant promise of LLM-based scientific agents, no benchmarks systematically test LLM's ability to propose scientific models, collect experimental data, and revise them in light of new data. 
We introduce BoxingGym, a benchmark with 10 environments for systematically evaluating both experimental design (e.g. collecting data to test a scientific theory) and model discovery (e.g. proposing and revising scientific theories). To enable tractable and quantitative evaluation, we implement each environment as a generative probabilistic model with which a scientific agent can run interactive experiments. These probabilistic models are drawn from various real-world scientific domains ranging from psychology to ecology. To quantitatively evaluate a scientific agent's ability to collect informative experimental data, we compute the expected information gain (EIG), an information-theoretic quantity which measures how much an experiment reduces uncertainty about the parameters of a generative model. A good scientific theory is a concise and predictive explanation. Therefore, to quantitatively evaluate model discovery, we ask a scientific agent to explain their model and then assess whether this explanation enables another scientific agent to make reliable predictions about this environment. In addition to this explanation-based evaluation, we compute standard model evaluation metrics such as prediction errors. We find that current LLMs, such as GPT-4o, struggle with both experimental design and model discovery. We find that augmenting the LLM-based agent with an explicit statistical model does not reliably improve these results.", 'score': 4, 'issue_id': 1510, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '0f853b1681ef29b5', 'authors': ['Kanishk Gandhi', 'Michael Y. Li', 'Lyle Goodyear', 'Louise Li', 'Aditi Bhaskar', 'Mohammed Zaman', 'Noah D. Goodman'], 'affiliations': ['Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01540.jpg', 'data': {'categories': ['#benchmark', '#data', '#science', '#agents'], 'emoji': '🧪', 'ru': {'title': 'BoxingGym: новый вызов для ИИ в научном моделировании', 'desc': 'Статья представляет новый бенчмарк BoxingGym для оценки способности языковых моделей (LLM) к научному открытию. Бенчмарк включает 10 сред, моделирующих различные научные области, и позволяет тестировать планирование экспериментов и построение теорий. Для оценки качества экспериментов используется ожидаемый прирост информации (EIG), а для оценки теорий - их способность объяснять и предсказывать. Результаты показывают, что современные LLM, включая GPT-4, пока слабо справляются с этими задачами.'}, 'en': {'title': 'BoxingGym: Evaluating LLMs in Scientific Discovery', 'desc': 'This paper introduces BoxingGym, a benchmark designed to evaluate the capabilities of large language models (LLMs) in scientific discovery tasks. It focuses on two main aspects: experimental design, which involves collecting data to test scientific theories, and model discovery, which includes proposing and revising these theories. The benchmark consists of 10 environments modeled as generative probabilistic models from various scientific fields, allowing for interactive experimentation. 
The study finds that current LLMs, like GPT-4o, face challenges in both areas, and adding a statistical model does not consistently enhance their performance.'}, 'zh': {'title': '评估人工智能在科学研究中的能力', 'desc': '这篇论文探讨了人工智能在科学研究中的应用,特别是大型语言模型(LLM)在提出科学理论和设计实验方面的能力。作者提出了一个名为BoxingGym的基准测试,包含10个环境,用于系统评估实验设计和模型发现的能力。通过计算期望信息增益(EIG),论文量化了科学代理收集实验数据的有效性,并评估其提出的模型是否能进行可靠预测。研究发现,当前的LLM在实验设计和模型发现方面表现不佳,且简单地增加统计模型并未显著改善结果。'}}}, {'id': 'https://huggingface.co/papers/2501.04519', 'title': 'rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking', 'url': 'https://huggingface.co/papers/2501.04519', 'abstract': 'We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids na\"ive step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs\' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. Code and data will be available at https://github.com/microsoft/rStar.', 'score': 100, 'issue_id': 1572, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'b065003de5fa3bde', 'authors': ['Xinyu Guan', 'Li Lyna Zhang', 'Yifei Liu', 'Ning Shang', 'Youran Sun', 'Yi Zhu', 'Fan Yang', 'Mao Yang'], 'affiliations': ['Microsoft', 'Peking University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04519.jpg', 'data': {'categories': ['#training', '#reasoning', '#optimization', '#benchmark', '#small_models', '#dataset'], 'emoji': '🧮', 'ru': {'title': 'Малые модели решают большие задачи: rStar-Math превосходит гигантов в математике', 'desc': 'Статья представляет rStar-Math - подход, позволяющий малым языковым моделям (SLM) достичь или превзойти способности крупных моделей в математических рассуждениях. Метод использует поиск по методу Монте-Карло (MCTS) с двумя специально обученными SLM: политикой и моделью вознаграждения. Авторы вводят новые методы синтеза обучающих данных, обучения модели вознаграждения и итеративного улучшения моделей. В результате rStar-Math значительно повышает эффективность SLM на математических тестах, превосходя более крупные модели.'}, 'en': {'title': 'Empowering Small Models to Excel in Math Reasoning', 'desc': 'The paper introduces rStar-Math, a framework that enhances the math reasoning abilities of small language models (SLMs) without relying on larger models. 
It employs Monte Carlo Tree Search (MCTS) to enable deep thinking, allowing the SLM to perform guided search during problem-solving. Key innovations include a code-augmented Chain of Thought (CoT) data synthesis method for generating verified reasoning paths, a refined process preference model (PPM) for better reward training, and a self-evolution strategy for iterative improvement. As a result, rStar-Math significantly boosts the performance of SLMs on math benchmarks, achieving state-of-the-art results in various assessments.'}, 'zh': {'title': '小型语言模型的数学推理新突破', 'desc': 'rStar-Math展示了小型语言模型(SLMs)在数学推理能力上可以与OpenAI的o1相媲美,甚至超越它,而无需从更强大的模型中蒸馏。该方法通过蒙特卡洛树搜索(MCTS)实现“深度思考”,在测试时由SLM驱动的过程奖励模型指导数学策略SLM进行搜索。rStar-Math引入了三项创新来解决训练两个SLM的挑战,包括新颖的代码增强的链式推理数据合成方法和更有效的过程偏好模型(PPM)训练方法。经过四轮自我进化,rStar-Math在747,000个数学问题上生成了数百万个合成解,使SLMs的数学推理能力达到了最先进的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.04682', 'title': 'Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Thought', 'url': 'https://huggingface.co/papers/2501.04682', 'abstract': 'We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms. We then outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training. Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms. This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence.', 'score': 42, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '3479f7793755e586', 'authors': ['Violet Xiang', 'Charlie Snell', 'Kanishk Gandhi', 'Alon Albalak', 'Anikait Singh', 'Chase Blagden', 'Duy Phung', 'Rafael Rafailov', 'Nathan Lile', 'Dakota Mahan', 'Louis Castricato', 'Jan-Philipp Franken', 'Nick Haber', 'Chelsea Finn'], 'affiliations': ['Stanford University', 'SynthLabs.ai', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.04682.jpg', 'data': {'categories': ['#synthetic', '#training', '#rlhf', '#rl', '#multimodal', '#optimization', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Meta-CoT: новый уровень рассуждений для ИИ', 'desc': 'Исследователи предлагают новую концепцию под названием Meta Chain-of-Thought (Meta-CoT), которая расширяет традиционный подход Chain-of-Thought. Meta-CoT моделирует базовые рассуждения, необходимые для построения цепочки мыслей. Авторы представляют эмпирические доказательства того, что современные языковые модели демонстрируют поведение, согласующееся с контекстным поиском. Они также описывают конкретный процесс обучения модели для генерации Meta-CoT, включающий инструктивную настройку и обучение с подкреплением.'}, 'en': {'title': 'Empowering AI with Enhanced Reasoning through Meta-CoT', 'desc': 'The paper introduces a new framework called Meta Chain-of-Thought (Meta-CoT), which enhances the traditional Chain-of-Thought (CoT) approach by focusing on the reasoning processes behind generating CoTs. 
It provides experimental results from advanced models that show behaviors similar to in-context search, and discusses techniques for creating Meta-CoT through process supervision, synthetic data, and search algorithms. The authors propose a detailed training pipeline that combines instruction tuning with search traces and reinforcement learning to improve the generation of Meta-CoTs. Additionally, the paper raises important questions about scaling, the role of verifiers, and the potential for discovering new reasoning methods, aiming to advance the reasoning capabilities of large language models (LLMs).'}, 'zh': {'title': '推动人工智能推理能力的元思维链', 'desc': '我们提出了一种新颖的框架,称为元思维链(Meta-CoT),它通过明确建模所需的推理过程来扩展传统的思维链(CoT)。我们展示了来自最先进模型的实证证据,这些模型表现出与上下文搜索一致的行为,并探索了通过过程监督、合成数据生成和搜索算法来生成元思维链的方法。最后,我们概述了一个具体的训练流程,结合了指令调优、线性化搜索轨迹和强化学习后训练,以生成元思维链。此项工作为在大型语言模型中实现元思维链提供了理论和实践的路线图,推动了人工智能更强大和更人性化的推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04686', 'title': 'URSA: Understanding and Verifying Chain-of-thought Reasoning in Multimodal Mathematics', 'url': 'https://huggingface.co/papers/2501.04686', 'abstract': 'Chain-of-thought (CoT) reasoning has been widely applied in the mathematical reasoning of Large Language Models (LLMs). Recently, the introduction of derivative process supervision on CoT trajectories has sparked discussions on enhancing scaling capabilities during test time, thereby boosting the potential of these models. However, in multimodal mathematical reasoning, the scarcity of high-quality CoT training data has hindered existing models from achieving high-precision CoT reasoning and has limited the realization of reasoning potential during test time. In this work, we propose a three-module synthesis strategy that integrates CoT distillation, trajectory-format rewriting, and format unification. It results in a high-quality CoT reasoning instruction fine-tuning dataset in multimodal mathematics, MMathCoT-1M. We comprehensively validate the state-of-the-art (SOTA) performance of the trained URSA-7B model on multiple multimodal mathematical benchmarks. For test-time scaling, we introduce a data synthesis strategy that automatically generates process annotation datasets, known as DualMath-1.1M, focusing on both interpretation and logic. By further training URSA-7B on DualMath-1.1M, we transition from CoT reasoning capabilities to robust supervision abilities. The trained URSA-RM-7B acts as a verifier, effectively enhancing the performance of URSA-7B at test time. URSA-RM-7B also demonstrates excellent out-of-distribution (OOD) verifying capabilities, showcasing its generalization. Model weights, training data and code will be open-sourced.', 'score': 35, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '089df0fb9a548ce8', 'authors': ['Ruilin Luo', 'Zhuofan Zheng', 'Yifan Wang', 'Yiyao Yu', 'Xinzhe Ni', 'Zicheng Lin', 'Jin Zeng', 'Yujiu Yang'], 'affiliations': ['ByteDance', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04686.jpg', 'data': {'categories': ['#dataset', '#training', '#multimodal', '#data', '#open_source', '#reasoning', '#math', '#architecture', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Усиление мультимодальных математических рассуждений через синтез данных и верификацию', 'desc': 'Статья представляет новый подход к улучшению математических рассуждений в мультимодальных языковых моделях. 
Авторы предлагают стратегию синтеза высококачественного набора данных MMathCoT-1M для обучения цепочкам рассуждений. Они также вводят метод DualMath-1.1M для генерации аннотаций процесса рассуждений, что позволяет модели URSA-7B перейти от способности рассуждать к возможности проверять рассуждения. Результаты показывают улучшение производительности и обобщающей способности модели.'}, 'en': {'title': 'Enhancing Multimodal Mathematical Reasoning with CoT Synthesis', 'desc': "This paper discusses improving mathematical reasoning in Large Language Models (LLMs) using a method called Chain-of-Thought (CoT) reasoning. The authors introduce a new dataset, MMathCoT-1M, which is created through a three-module synthesis strategy to enhance the quality of CoT training data in multimodal mathematics. They also present a data synthesis strategy, DualMath-1.1M, that generates additional training data to improve the model's reasoning capabilities during testing. The results show that their model, URSA-RM-7B, significantly enhances performance and generalization in multimodal mathematical tasks."}, 'zh': {'title': '提升多模态数学推理的链式推理能力', 'desc': '本文探讨了链式推理(CoT)在大型语言模型(LLMs)中的应用,特别是在多模态数学推理中的挑战。由于高质量的CoT训练数据稀缺,现有模型在测试时的推理能力受到限制。为了解决这个问题,作者提出了一种三模块合成策略,生成了高质量的多模态数学推理指令微调数据集MMathCoT-1M。通过进一步训练URSA-7B模型,结合生成的数据集DualMath-1.1M,显著提升了模型在测试时的推理能力和验证能力。'}}}, {'id': 'https://huggingface.co/papers/2501.04227', 'title': 'Agent Laboratory: Using LLM Agents as Research Assistants', 'url': 'https://huggingface.co/papers/2501.04227', 'abstract': 'Historically, scientific discovery has been a lengthy and costly process, demanding substantial time and resources from initial conception to final results. To accelerate scientific discovery, reduce research costs, and improve research quality, we introduce Agent Laboratory, an autonomous LLM-based framework capable of completing the entire research process. This framework accepts a human-provided research idea and progresses through three stages--literature review, experimentation, and report writing to produce comprehensive research outputs, including a code repository and a research report, while enabling users to provide feedback and guidance at each stage. We deploy Agent Laboratory with various state-of-the-art LLMs and invite multiple researchers to assess its quality by participating in a survey, providing human feedback to guide the research process, and then evaluate the final paper. We found that: (1) Agent Laboratory driven by o1-preview generates the best research outcomes; (2) The generated machine learning code is able to achieve state-of-the-art performance compared to existing methods; (3) Human involvement, providing feedback at each stage, significantly improves the overall quality of research; (4) Agent Laboratory significantly reduces research expenses, achieving an 84% decrease compared to previous autonomous research methods. 
We hope Agent Laboratory enables researchers to allocate more effort toward creative ideation rather than low-level coding and writing, ultimately accelerating scientific discovery.', 'score': 34, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'ff592ae1a5a88909', 'authors': ['Samuel Schmidgall', 'Yusheng Su', 'Ze Wang', 'Ximeng Sun', 'Jialian Wu', 'Xiaodong Yu', 'Jiang Liu', 'Zicheng Liu', 'Emad Barsoum'], 'affiliations': ['AMD', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04227.jpg', 'data': {'categories': ['#science', '#training', '#agents', '#rlhf', '#survey'], 'emoji': '🧪', 'ru': {'title': 'Автономная лаборатория ИИ: революция в научных исследованиях', 'desc': 'Статья представляет Agent Laboratory - автономную систему на основе моделей LLM, способную выполнять полный цикл научного исследования. Система проходит через этапы обзора литературы, экспериментов и написания отчета, позволяя пользователям давать обратную связь на каждом этапе. Эксперименты показали, что Agent Laboratory, работающая на модели o1-preview, генерирует лучшие результаты исследований и значительно снижает затраты на исследования. Авторы надеются, что эта система позволит исследователям сосредоточиться на творческом процессе, ускоряя научные открытия.'}, 'en': {'title': 'Accelerating Science with Autonomous Research Frameworks', 'desc': 'The paper presents Agent Laboratory, an autonomous framework that utilizes large language models (LLMs) to streamline the scientific research process. It operates in three stages: conducting a literature review, performing experiments, and writing reports, all while allowing human researchers to provide feedback. The study shows that Agent Laboratory can produce high-quality research outputs, including code that outperforms existing methods, and significantly reduces research costs by 84%. By automating routine tasks, the framework aims to free researchers to focus more on innovative ideas and less on tedious coding and documentation.'}, 'zh': {'title': 'Agent Laboratory:加速科学发现的智能助手', 'desc': '本文介绍了一种名为Agent Laboratory的自主框架,旨在加速科学发现并降低研究成本。该框架基于大型语言模型(LLM),能够完成文献综述、实验和报告撰写等整个研究过程。研究表明,Agent Laboratory在生成研究成果方面表现优异,尤其是在机器学习代码的性能上,达到了最先进的水平。通过人类反馈的参与,研究质量显著提高,同时研究费用减少了84%。'}}}, {'id': 'https://huggingface.co/papers/2501.04306', 'title': 'LLM4SR: A Survey on Large Language Models for Scientific Research', 'url': 'https://huggingface.co/papers/2501.04306', 'abstract': 'In recent years, the rapid advancement of Large Language Models (LLMs) has transformed the landscape of scientific research, offering unprecedented support across various stages of the research cycle. This paper presents the first systematic survey dedicated to exploring how LLMs are revolutionizing the scientific research process. We analyze the unique roles LLMs play across four critical stages of research: hypothesis discovery, experiment planning and implementation, scientific writing, and peer reviewing. Our review comprehensively showcases the task-specific methodologies and evaluation benchmarks. By identifying current challenges and proposing future research directions, this survey not only highlights the transformative potential of LLMs, but also aims to inspire and guide researchers and practitioners in leveraging LLMs to advance scientific inquiry. 
Resources are available at the following repository: https://github.com/du-nlp-lab/LLM4SR', 'score': 17, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'bfb9039780003b6d', 'authors': ['Ziming Luo', 'Zonglin Yang', 'Zexin Xu', 'Wei Yang', 'Xinya Du'], 'affiliations': ['Nanyang Technological University, Singapore', 'University of Texas at Dallas, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04306.jpg', 'data': {'categories': ['#science', '#survey', '#multimodal', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'LLM как революционный инструмент в научных исследованиях', 'desc': 'Эта статья представляет собой первый систематический обзор роли больших языковых моделей (LLM) в научных исследованиях. Авторы анализируют, как LLM используются на четырех ключевых этапах исследовательского процесса: формирование гипотез, планирование и проведение экспериментов, научное письмо и рецензирование. В работе рассматриваются специфические методологии и критерии оценки для каждой задачи. Статья также обсуждает текущие проблемы и предлагает направления для будущих исследований в этой области.'}, 'en': {'title': 'Revolutionizing Research: The Power of Large Language Models', 'desc': 'This paper systematically surveys the impact of Large Language Models (LLMs) on the scientific research process. It identifies how LLMs assist in four key stages: generating hypotheses, planning and conducting experiments, writing scientific papers, and facilitating peer reviews. The authors discuss specific methodologies and evaluation benchmarks for each task, highlighting the transformative potential of LLMs in enhancing research efficiency. Additionally, the paper addresses current challenges and suggests future research directions to further integrate LLMs into scientific inquiry.'}, 'zh': {'title': '大型语言模型:科学研究的变革者', 'desc': '近年来,大型语言模型(LLMs)的快速发展改变了科学研究的格局,为研究周期的各个阶段提供了前所未有的支持。本文首次系统性地调查了LLMs如何革新科学研究过程,分析了它们在假设发现、实验规划与实施、科学写作和同行评审等四个关键阶段的独特作用。我们的综述全面展示了任务特定的方法论和评估基准,并识别了当前面临的挑战,提出了未来的研究方向。通过强调LLMs的变革潜力,本文旨在激励和指导研究人员和从业者利用LLMs推动科学探索。'}}}, {'id': 'https://huggingface.co/papers/2501.04575', 'title': 'InfiGUIAgent: A Multimodal Generalist GUI Agent with Native Reasoning and Reflection', 'url': 'https://huggingface.co/papers/2501.04575', 'abstract': 'Graphical User Interface (GUI) Agents, powered by multimodal large language models (MLLMs), have shown great potential for task automation on computing devices such as computers and mobile phones. However, existing agents face challenges in multi-step reasoning and reliance on textual annotations, limiting their effectiveness. We introduce InfiGUIAgent, an MLLM-based GUI Agent trained with a two-stage supervised fine-tuning pipeline. Stage 1 enhances fundamental skills such as GUI understanding and grounding, while Stage 2 integrates hierarchical reasoning and expectation-reflection reasoning skills using synthesized data to enable native reasoning abilities of the agents. InfiGUIAgent achieves competitive performance on several GUI benchmarks, highlighting the impact of native reasoning skills in enhancing GUI interaction for automation tasks. 
Resources are available at https://github.com/Reallm-Labs/InfiGUIAgent.', 'score': 14, 'issue_id': 1574, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '501c7ba58ede235b', 'authors': ['Yuhang Liu', 'Pengxiang Li', 'Zishu Wei', 'Congkai Xie', 'Xueyu Hu', 'Xinchen Xu', 'Shengyu Zhang', 'Xiaotian Han', 'Hongxia Yang', 'Fei Wu'], 'affiliations': ['ByteDance Inc', 'Dalian University of Technology', 'Reallm Labs', 'The Hong Kong Polytechnic University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04575.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#training', '#agents', '#multimodal', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Умный агент GUI: новый уровень автоматизации интерфейсов', 'desc': 'InfiGUIAgent - это агент графического пользовательского интерфейса, основанный на мультимодальных больших языковых моделях (MLLM). Он обучается с помощью двухэтапного процесса точной настройки, который улучшает базовые навыки понимания GUI и развивает способности к иерархическому рассуждению. InfiGUIAgent демонстрирует высокую эффективность в автоматизации задач взаимодействия с GUI, превосходя существующие подходы. Разработка направлена на преодоление ограничений, связанных с многошаговыми рассуждениями и зависимостью от текстовых аннотаций.'}, 'en': {'title': 'Empowering GUI Agents with Native Reasoning Skills', 'desc': "InfiGUIAgent is a new type of Graphical User Interface (GUI) agent that uses multimodal large language models (MLLMs) to improve task automation on devices like computers and smartphones. This agent addresses the limitations of existing systems by employing a two-stage supervised fine-tuning process. The first stage focuses on developing basic skills such as understanding and interacting with GUIs, while the second stage enhances the agent's ability to perform complex reasoning tasks. As a result, InfiGUIAgent demonstrates strong performance on various GUI benchmarks, showcasing the importance of advanced reasoning capabilities in automating GUI interactions."}, 'zh': {'title': '提升GUI交互的原生推理能力', 'desc': '本文介绍了一种名为InfiGUIAgent的图形用户界面(GUI)代理,它基于多模态大型语言模型(MLLM)进行任务自动化。InfiGUIAgent通过两阶段的监督微调流程进行训练,第一阶段提升了GUI理解和基础技能,第二阶段则通过合成数据整合了层次推理和期望反思推理能力。该代理在多个GUI基准测试中表现出色,显示了原生推理能力在增强GUI交互中的重要性。此研究为提高计算设备上的自动化任务提供了新的思路和方法。'}}}, {'id': 'https://huggingface.co/papers/2501.02772', 'title': 'GeAR: Generation Augmented Retrieval', 'url': 'https://huggingface.co/papers/2501.02772', 'abstract': 'Document retrieval techniques form the foundation for the development of large-scale information systems. The prevailing methodology is to construct a bi-encoder and compute the semantic similarity. However, such scalar similarity is difficult to reflect enough information and impedes our comprehension of the retrieval results. In addition, this computational process mainly emphasizes the global semantics and ignores the fine-grained semantic relationship between the query and the complex text in the document. In this paper, we propose a new method called Generation Augmented Retrieval (GeAR) that incorporates well-designed fusion and decoding modules. This enables GeAR to generate the relevant text from documents based on the fused representation of the query and the document, thus learning to "focus on" the fine-grained information. Also when used as a retriever, GeAR does not add any computational burden over bi-encoders. 
To support the training of the new framework, we have introduced a pipeline to efficiently synthesize high-quality data by utilizing large language models. GeAR exhibits competitive retrieval and localization performance across diverse scenarios and datasets. Moreover, the qualitative analysis and the results generated by GeAR provide novel insights into the interpretation of retrieval results. The code, data, and models will be released after completing technical review to facilitate future research.', 'score': 11, 'issue_id': 1572, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'dafa87428ce906b5', 'authors': ['Haoyu Liu', 'Shaohan Huang', 'Jianfeng Liu', 'Yuefeng Zhan', 'Hao Sun', 'Weiwei Deng', 'Feng Sun', 'Furu Wei', 'Qi Zhang'], 'affiliations': ['Microsoft Corporation'], 'pdf_title_img': 'assets/pdf/title_img/2501.02772.jpg', 'data': {'categories': ['#interpretability', '#data', '#rag', '#synthetic', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'GeAR: Новый взгляд на извлечение документов через генерацию', 'desc': 'Статья предлагает новый метод извлечения документов под названием Generation Augmented Retrieval (GeAR). В отличие от традиционных би-энкодеров, GeAR использует модули слияния и декодирования для генерации релевантного текста на основе запроса и документа. Это позволяет модели фокусироваться на детальной информации, не увеличивая вычислительную нагрузку. Авторы также разработали конвейер для синтеза качественных данных с помощью больших языковых моделей для обучения GeAR.'}, 'en': {'title': 'GeAR: Enhancing Document Retrieval with Fine-Grained Semantic Focus', 'desc': 'This paper introduces a new method called Generation Augmented Retrieval (GeAR) that enhances document retrieval techniques by focusing on fine-grained semantic relationships. Unlike traditional bi-encoders that primarily assess global semantics, GeAR generates relevant text from documents by fusing the query and document representations. This approach allows for a deeper understanding of retrieval results without increasing computational costs. Additionally, the authors provide a pipeline for synthesizing high-quality training data using large language models, leading to improved performance across various datasets.'}, 'zh': {'title': '生成增强检索:关注细粒度信息的创新方法', 'desc': '本文提出了一种新的文档检索方法,称为生成增强检索(GeAR)。GeAR通过融合查询和文档的表示,生成相关文本,从而关注细粒度信息。与传统的双编码器方法相比,GeAR在检索时不会增加计算负担,同时在多种场景和数据集上表现出竞争力的检索和定位性能。该方法还通过利用大型语言模型合成高质量数据,支持新框架的训练。'}}}, {'id': 'https://huggingface.co/papers/2501.04144', 'title': 'Chirpy3D: Continuous Part Latents for Creative 3D Bird Generation', 'url': 'https://huggingface.co/papers/2501.04144', 'abstract': 'In this paper, we push the boundaries of fine-grained 3D generation into truly creative territory. Current methods either lack intricate details or simply mimic existing objects -- we enable both. By lifting 2D fine-grained understanding into 3D through multi-view diffusion and modeling part latents as continuous distributions, we unlock the ability to generate entirely new, yet plausible parts through interpolation and sampling. A self-supervised feature consistency loss further ensures stable generation of these unseen parts. The result is the first system capable of creating novel 3D objects with species-specific details that transcend existing examples. While we demonstrate our approach on birds, the underlying framework extends beyond things that can chirp! 
Code will be released at https://github.com/kamwoh/chirpy3d.', 'score': 9, 'issue_id': 1578, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '89e2fad397bf0684', 'authors': ['Kam Woh Ng', 'Jing Yang', 'Jia Wei Sii', 'Jiankang Deng', 'Chee Seng Chan', 'Yi-Zhe Song', 'Tao Xiang', 'Xiatian Zhu'], 'affiliations': ['Imperial College London', 'Universiti Malaya', 'University of Cambridge', 'University of Surrey'], 'pdf_title_img': 'assets/pdf/title_img/2501.04144.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🐦', 'ru': {'title': 'Генерация креативных 3D-моделей с беспрецедентной детализацией', 'desc': 'Эта статья представляет новый метод генерации детализированных 3D-объектов, выходящий за рамки простого копирования существующих примеров. Авторы используют мультиракурсную диффузию и моделирование латентных представлений частей объекта как непрерывных распределений. Это позволяет создавать совершенно новые, но правдоподобные части объектов путем интерполяции и сэмплирования. Самоконтролируемая функция потерь обеспечивает стабильную генерацию этих невиданных ранее частей.'}, 'en': {'title': 'Unlocking Creative 3D Generation with Fine-Grained Detail', 'desc': 'This paper introduces a novel approach to generating detailed 3D objects that are not just replicas of existing items. By utilizing multi-view diffusion and treating part latents as continuous distributions, the authors enable the creation of new and realistic 3D parts through interpolation and sampling techniques. A self-supervised feature consistency loss is implemented to maintain stability in generating these novel parts. The system is demonstrated on birds, showcasing its ability to produce unique species-specific details, while the framework is applicable to a broader range of objects.'}, 'zh': {'title': '突破性细粒度3D生成,创造全新物体!', 'desc': '本文提出了一种创新的细粒度3D生成方法,能够创造出全新的3D物体,而不仅仅是模仿现有物体。我们通过多视角扩散将2D细粒度理解提升到3D,并将部分潜变量建模为连续分布,从而实现了新部件的插值和采样生成。自监督特征一致性损失确保了这些未见部件的稳定生成。我们的系统能够生成具有特定物种细节的全新3D对象,超越了现有的示例。'}}}, {'id': 'https://huggingface.co/papers/2501.04689', 'title': 'SPAR3D: Stable Point-Aware Reconstruction of 3D Objects from Single Images', 'url': 'https://huggingface.co/papers/2501.04689', 'abstract': 'We study the problem of single-image 3D object reconstruction. Recent works have diverged into two directions: regression-based modeling and generative modeling. Regression methods efficiently infer visible surfaces, but struggle with occluded regions. Generative methods handle uncertain regions better by modeling distributions, but are computationally expensive and the generation is often misaligned with visible surfaces. In this paper, we present SPAR3D, a novel two-stage approach aiming to take the best of both directions. The first stage of SPAR3D generates sparse 3D point clouds using a lightweight point diffusion model, which has a fast sampling speed. The second stage uses both the sampled point cloud and the input image to create highly detailed meshes. Our two-stage design enables probabilistic modeling of the ill-posed single-image 3D task while maintaining high computational efficiency and great output fidelity. Using point clouds as an intermediate representation further allows for interactive user edits. Evaluated on diverse datasets, SPAR3D demonstrates superior performance over previous state-of-the-art methods, at an inference speed of 0.7 seconds. 
Project page with code and model: https://spar3d.github.io', 'score': 9, 'issue_id': 1576, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '00474027a65aa27c', 'authors': ['Zixuan Huang', 'Mark Boss', 'Aaryaman Vasishta', 'James M. Rehg', 'Varun Jampani'], 'affiliations': ['Stability AI', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.04689.jpg', 'data': {'categories': ['#3d'], 'emoji': '🧊', 'ru': {'title': 'SPAR3D: Эффективная реконструкция 3D-объектов с использованием облаков точек', 'desc': 'В статье представлен новый двухэтапный подход SPAR3D для реконструкции 3D-объектов по одному изображению. На первом этапе генерируется разреженное облако точек с помощью легковесной модели диффузии точек. На втором этапе используются сгенерированное облако точек и исходное изображение для создания детализированных 3D-моделей. Этот метод сочетает преимущества регрессионного и генеративного моделирования, обеспечивая высокую вычислительную эффективность и качество результатов.'}, 'en': {'title': 'SPAR3D: Efficient and Detailed 3D Reconstruction from a Single Image', 'desc': 'This paper introduces SPAR3D, a new method for reconstructing 3D objects from a single image. It combines regression and generative modeling to efficiently create 3D point clouds and detailed meshes. The first stage generates sparse point clouds quickly, while the second stage refines these into high-quality meshes using the input image. SPAR3D achieves high fidelity and speed, outperforming existing methods and allowing for user interaction with the 3D output.'}, 'zh': {'title': 'SPAR3D:高效的单图像三维重建新方法', 'desc': '我们研究了单幅图像的三维物体重建问题。最近的研究分为两种方向:基于回归的建模和生成建模。回归方法能够有效推断可见表面,但在处理遮挡区域时表现不佳;而生成方法通过建模分布更好地处理不确定区域,但计算开销大且生成结果常常与可见表面不对齐。本文提出了SPAR3D,这是一种新颖的两阶段方法,旨在结合两种方法的优点,快速生成稀疏的三维点云,并利用输入图像创建高细节的网格。'}}}, {'id': 'https://huggingface.co/papers/2501.03271', 'title': 'DPO Kernels: A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich Paradigm for Direct Preference Optimization', 'url': 'https://huggingface.co/papers/2501.03271', 'abstract': 'The rapid rise of large language models (LLMs) has unlocked many applications but also underscores the challenge of aligning them with diverse values and preferences. Direct Preference Optimization (DPO) is central to alignment but constrained by fixed divergences and limited feature transformations. We propose DPO-Kernels, which integrates kernel methods to address these issues through four key contributions: (i) Kernelized Representations with polynomial, RBF, Mahalanobis, and spectral kernels for richer transformations, plus a hybrid loss combining embedding-based and probability-based objectives; (ii) Divergence Alternatives (Jensen-Shannon, Hellinger, Renyi, Bhattacharyya, Wasserstein, and f-divergences) for greater stability; (iii) Data-Driven Selection metrics that automatically choose the best kernel-divergence pair; and (iv) a Hierarchical Mixture of Kernels for both local precision and global modeling. Evaluations on 12 datasets demonstrate state-of-the-art performance in factuality, safety, reasoning, and instruction following. 
Grounded in Heavy-Tailed Self-Regularization, DPO-Kernels maintains robust generalization for LLMs, offering a comprehensive resource for further alignment research.', 'score': 5, 'issue_id': 1576, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '33d1640aee045ed5', 'authors': ['Amitava Das', 'Suranjana Trivedy', 'Danush Khanna', 'Rajarshi Roy', 'Gurpreet Singh', 'Basab Ghosh', 'Yaswanth Narsupalli', 'Vinija Jain', 'Vasu Sharma', 'Aishwarya Naresh Reganti', 'Aman Chadha'], 'affiliations': ['Amazon AI, USA', 'Artificial Intelligence Institute, University of South Carolina, USA', 'Meta AI, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03271.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#reasoning', '#dataset', '#training'], 'emoji': '🧠', 'ru': {'title': 'DPO-Kernels: Новый подход к выравниванию языковых моделей', 'desc': 'Статья представляет новый метод под названием DPO-Kernels для улучшения выравнивания больших языковых моделей (LLM) с различными ценностями и предпочтениями. Авторы предлагают использовать методы ядер для расширения возможностей прямой оптимизации предпочтений (DPO), включая кернелизованные представления, альтернативные дивергенции и data-driven выбор наилучшей комбинации ядра и дивергенции. DPO-Kernels демонстрирует улучшенные результаты в задачах фактологичности, безопасности, рассуждений и следования инструкциям на 12 наборах данных. Метод основан на саморегуляризации с тяжелыми хвостами и обеспечивает надежную генерализацию для LLM.'}, 'en': {'title': 'Enhancing LLM Alignment with DPO-Kernels', 'desc': 'This paper introduces DPO-Kernels, a method designed to improve the alignment of large language models (LLMs) with diverse user values. It enhances Direct Preference Optimization (DPO) by incorporating kernel methods, allowing for more flexible feature transformations and better divergence measures. The approach includes a hybrid loss function, various divergence alternatives, and data-driven selection metrics to optimize performance. Evaluations show that DPO-Kernels achieves state-of-the-art results in key areas such as factuality and safety across multiple datasets.'}, 'zh': {'title': 'DPO-Kernels:提升大型语言模型对齐的创新方法', 'desc': '大型语言模型(LLMs)的快速发展带来了许多应用,但也突显了与多样化价值观和偏好对齐的挑战。直接偏好优化(DPO)是对齐的核心,但受到固定散度和有限特征变换的限制。我们提出了DPO-Kernels,通过四个关键贡献来解决这些问题,包括使用多项式、RBF、Mahalanobis和谱核的核化表示,以及结合嵌入基础和基于概率的目标的混合损失。我们的评估在12个数据集上展示了在事实性、安全性、推理和指令遵循方面的最先进性能,DPO-Kernels为进一步的对齐研究提供了全面的资源。'}}}, {'id': 'https://huggingface.co/papers/2501.04694', 'title': 'EpiCoder: Encompassing Diversity and Complexity in Code Generation', 'url': 'https://huggingface.co/papers/2501.04694', 'abstract': 'Effective instruction tuning is indispensable for optimizing code LLMs, aligning model behavior with user expectations and enhancing model performance in real-world applications. However, most existing methods focus on code snippets, which are limited to specific functionalities and rigid structures, restricting the complexity and diversity of the synthesized data. To address these limitations, we introduce a novel feature tree-based synthesis framework inspired by Abstract Syntax Trees (AST). Unlike AST, which captures syntactic structure of code, our framework models semantic relationships between code elements, enabling the generation of more nuanced and diverse data. The feature tree is constructed from raw data and refined iteratively to increase the quantity and diversity of the extracted features. 
This process enables the identification of more complex patterns and relationships within the code. By sampling subtrees with controlled depth and breadth, our framework allows precise adjustments to the complexity of the generated code, supporting a wide range of tasks from simple function-level operations to intricate multi-file scenarios. We fine-tuned widely-used base models to create the EpiCoder series, achieving state-of-the-art performance at both the function and file levels across multiple benchmarks. Notably, empirical evidence indicates that our approach shows significant potential in synthesizing highly complex repository-level code data. Further analysis elucidates the merits of this approach by rigorously assessing data complexity and diversity through software engineering principles and LLM-as-a-judge method.', 'score': 4, 'issue_id': 1581, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c1ef93cdfc23c2f', 'authors': ['Yaoxiang Wang', 'Haoling Li', 'Xin Zhang', 'Jie Wu', 'Xiao Liu', 'Wenxiang Hu', 'Zhongxin Guo', 'Yangyu Huang', 'Ying Xin', 'Yujiu Yang', 'Jinsong Su', 'Qi Chen', 'Scarlett Li'], 'affiliations': ['Microsoft', 'Tsinghua University', 'Xiamen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04694.jpg', 'data': {'categories': ['#dataset', '#data', '#synthetic', '#training', '#optimization', '#alignment', '#architecture'], 'emoji': '🌳', 'ru': {'title': 'Дерево признаков: новый путь к улучшению языковых моделей для кода', 'desc': 'Статья представляет новый подход к улучшению языковых моделей для программирования с использованием дерева признаков, вдохновленного абстрактными синтаксическими деревьями. Этот метод позволяет генерировать более сложные и разнообразные обучающие данные, моделируя семантические связи между элементами кода. Авторы создали серию моделей EpiCoder, достигших высоких результатов в нескольких бенчмарках. Эмпирические данные показывают потенциал метода для синтеза сложных репозиториев кода.'}, 'en': {'title': 'Unlocking Code Complexity with Feature Trees', 'desc': 'This paper presents a new framework for instruction tuning in code language models (LLMs) that enhances their performance by generating more complex and diverse code data. The proposed feature tree-based synthesis framework goes beyond traditional code snippet methods by modeling semantic relationships between code elements, inspired by Abstract Syntax Trees (AST). By iteratively refining the feature tree, the framework captures intricate patterns and relationships, allowing for the generation of code that ranges from simple functions to complex multi-file scenarios. The authors demonstrate that their fine-tuned EpiCoder models achieve state-of-the-art results across various benchmarks, highlighting the effectiveness of their approach in synthesizing complex repository-level code data.'}, 'zh': {'title': '特征树框架:提升代码生成的复杂性与多样性', 'desc': '本论文提出了一种新的特征树合成框架,用于优化代码大语言模型(LLMs)的指令调优。该框架通过建模代码元素之间的语义关系,克服了现有方法在功能和结构上的局限性,从而生成更复杂和多样化的数据。特征树从原始数据构建,并通过迭代精炼,增加提取特征的数量和多样性。最终,我们通过微调广泛使用的基础模型,创建了EpiCoder系列,在多个基准测试中实现了函数和文件级别的最先进性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04652', 'title': 'Multi-task retriever fine-tuning for domain-specific and efficient RAG', 'url': 'https://huggingface.co/papers/2501.04652', 'abstract': 'Retrieval-Augmented Generation (RAG) has become ubiquitous when deploying Large Language Models (LLMs), as it can address typical limitations such as generating hallucinated or outdated information. 
However, when building real-world RAG applications, practical issues arise. First, the retrieved information is generally domain-specific. Since it is computationally expensive to fine-tune LLMs, it is more feasible to fine-tune the retriever to improve the quality of the data included in the LLM input. Second, as more applications are deployed in the same real-world system, one cannot afford to deploy separate retrievers. Moreover, these RAG applications normally retrieve different kinds of data. Our solution is to instruction fine-tune a small retriever encoder on a variety of domain-specific tasks to allow us to deploy one encoder that can serve many use cases, thereby achieving low-cost, scalability, and speed. We show how this encoder generalizes to out-of-domain settings as well as to an unseen retrieval task on real-world enterprise use cases.', 'score': 1, 'issue_id': 1584, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1c906eb3ec9e3da5', 'authors': ['Patrice Béchard', 'Orlando Marquez Ayala'], 'affiliations': ['ServiceNow'], 'pdf_title_img': 'assets/pdf/title_img/2501.04652.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#hallucinations', '#rag', '#optimization'], 'emoji': '🔍', 'ru': {'title': 'Универсальный извлекатель информации для эффективного RAG', 'desc': 'Данная статья представляет новый подход к улучшению систем извлечения информации для крупных языковых моделей. Авторы предлагают дообучать небольшой энкодер для извлечения информации на различных доменно-специфичных задачах. Это позволяет использовать один энкодер для множества приложений, обеспечивая масштабируемость и эффективность. Исследование показывает, что такой подход хорошо обобщается на новые домены и задачи извлечения информации в реальных корпоративных сценариях.'}, 'en': {'title': 'One Retriever to Rule Them All: Scalable RAG Solutions', 'desc': 'This paper discusses the challenges of using Retrieval-Augmented Generation (RAG) with Large Language Models (LLMs), particularly the issues of domain-specific information retrieval and the high cost of fine-tuning LLMs. The authors propose a solution that involves instruction fine-tuning a small retriever encoder on multiple domain-specific tasks, allowing it to serve various applications without needing separate retrievers. This approach enhances the quality of data fed into the LLM while maintaining low costs and scalability. The results demonstrate that the fine-tuned encoder can effectively generalize to new, unseen tasks in real-world scenarios.'}, 'zh': {'title': '一个编码器,多种应用,低成本高效能', 'desc': '检索增强生成(RAG)在部署大型语言模型(LLM)时变得非常普遍,因为它可以解决生成虚假或过时信息的典型问题。本文提出了一种解决方案,通过对小型检索器编码器进行指令微调,使其能够在多种特定领域任务上工作,从而实现一个编码器服务多个用例。这样可以降低成本,提高可扩展性和速度,同时避免为每个应用程序部署单独的检索器。我们的实验表明,该编码器在不同领域设置和未见过的检索任务中也能很好地泛化。'}}}, {'id': 'https://huggingface.co/papers/2501.00958', 'title': '2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining', 'url': 'https://huggingface.co/papers/2501.00958', 'abstract': 'Compared to image-text pair data, interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. However, such existing datasets are crawled from webpage, facing challenges like low knowledge density, loose image-text relations, and poor logical coherence between images. 
On the other hand, the internet hosts vast instructional videos (e.g., online geometry courses) that are widely used by humans to learn foundational subjects, yet these valuable resources remain underexplored in VLM training. In this paper, we introduce a high-quality multimodal textbook corpus with richer foundational knowledge for VLM pretraining. It collects over 2.5 years of instructional videos, totaling 22,000 class hours. We first use an LLM-proposed taxonomy to systematically gather instructional videos. Then we progressively extract and refine visual (keyframes), audio (ASR), and textual knowledge (OCR) from the videos, and organize them as an image-text interleaved corpus based on temporal order. Compared to its counterparts, our video-centric textbook offers more coherent context, richer knowledge, and better image-text alignment. Experiments demonstrate its superb pretraining performance, particularly in knowledge- and reasoning-intensive tasks like ScienceQA and MathVista. Moreover, VLMs pre-trained on our textbook exhibit outstanding interleaved context awareness, leveraging visual and textual cues in their few-shot context for task solving. Our code is available at \\url{https://github.com/DAMO-NLP-SG/multimodal_textbook}.', 'score': 68, 'issue_id': 1475, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'b10f0cd62f6334fc', 'authors': ['Wenqi Zhang', 'Hang Zhang', 'Xin Li', 'Jiashuo Sun', 'Yongliang Shen', 'Weiming Lu', 'Deli Zhao', 'Yueting Zhuang', 'Lidong Bing'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University', 'DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.00958.jpg', 'data': {'categories': ['#science', '#dataset', '#reasoning', '#multimodal', '#cv', '#video'], 'emoji': '📚', 'ru': {'title': 'Мультимодальный учебник: новый стандарт для обучения VLM', 'desc': 'Эта статья представляет новый подход к обучению моделей компьютерного зрения и обработки естественного языка (VLM) с использованием мультимодального учебного корпуса. Авторы создали базу данных из 22 000 часов обучающих видео, систематически собранных с помощью таксономии, предложенной языковой моделью (LLM). Этот корпус отличается более высокой плотностью знаний, лучшей связью между изображениями и текстом, а также логической согласованностью по сравнению с существующими наборами данных. Эксперименты показывают превосходную производительность предобучения на этом корпусе, особенно в задачах, требующих глубоких знаний и рассуждений.'}, 'en': {'title': 'Harnessing Instructional Videos for Superior Vision-Language Model Training', 'desc': 'This paper presents a new approach to training Vision-Language Models (VLMs) using a multimodal textbook corpus derived from instructional videos. Unlike traditional datasets that often suffer from low knowledge density and weak image-text relationships, this corpus offers a richer and more coherent context for VLM pretraining. The authors systematically extract visual, audio, and textual information from over 22,000 hours of instructional content, enhancing the alignment between images and text. 
Experiments show that VLMs trained on this video-centric dataset perform significantly better on knowledge-intensive tasks, demonstrating improved reasoning and context awareness.'}, 'zh': {'title': '视频教材:提升视觉语言模型的知识与推理能力', 'desc': '本文提出了一种高质量的多模态教材语料库,旨在为视觉语言模型(VLM)提供更丰富的基础知识。该语料库收集了超过2.5年的教学视频,总计22,000小时,系统性地提取了视频中的视觉、音频和文本知识。与现有的数据集相比,这种视频中心的教材提供了更连贯的上下文、更丰富的知识和更好的图像-文本对齐。实验结果表明,基于该教材预训练的VLM在知识和推理密集型任务中表现优异,尤其在ScienceQA和MathVista等任务中。'}}}, {'id': 'https://huggingface.co/papers/2501.01427', 'title': 'VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control', 'url': 'https://huggingface.co/papers/2501.01427', 'abstract': 'Despite significant advancements in video generation, inserting a given object into videos remains a challenging task. The difficulty lies in preserving the appearance details of the reference object and accurately modeling coherent motions at the same time. In this paper, we propose VideoAnydoor, a zero-shot video object insertion framework with high-fidelity detail preservation and precise motion control. Starting from a text-to-video model, we utilize an ID extractor to inject the global identity and leverage a box sequence to control the overall motion. To preserve the detailed appearance and meanwhile support fine-grained motion control, we design a pixel warper. It takes the reference image with arbitrary key-points and the corresponding key-point trajectories as inputs. It warps the pixel details according to the trajectories and fuses the warped features with the diffusion U-Net, thus improving detail preservation and supporting users in manipulating the motion trajectories. In addition, we propose a training strategy involving both videos and static images with a reweight reconstruction loss to enhance insertion quality. VideoAnydoor demonstrates significant superiority over existing methods and naturally supports various downstream applications (e.g., talking head generation, video virtual try-on, multi-region editing) without task-specific fine-tuning.', 'score': 39, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '4c67f688775a3eca', 'authors': ['Yuanpeng Tu', 'Hao Luo', 'Xi Chen', 'Sihui Ji', 'Xiang Bai', 'Hengshuang Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group', 'HUST', 'Hupan Lab', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.01427.jpg', 'data': {'categories': ['#diffusion', '#games', '#video'], 'emoji': '🎬', 'ru': {'title': 'Точная вставка объектов в видео с сохранением деталей', 'desc': 'В этой статье представлен VideoAnydoor - фреймворк для вставки объектов в видео без предварительного обучения. Он использует экстрактор идентификаторов и последовательность ограничивающих рамок для контроля движения объекта. Ключевым компонентом является пиксельный варпер, который сохраняет детали внешнего вида и позволяет точно управлять движением. Предложенная стратегия обучения с использованием видео и статических изображений улучшает качество вставки объектов.'}, 'en': {'title': 'Seamless Object Insertion in Videos with VideoAnydoor', 'desc': 'This paper introduces VideoAnydoor, a novel framework for zero-shot video object insertion that excels in maintaining high-fidelity details and precise motion control. The approach begins with a text-to-video model and incorporates an ID extractor to ensure consistent object identity while using a box sequence for motion management. 
A key innovation is the pixel warper, which adjusts pixel details based on key-point trajectories, enhancing both detail preservation and user control over motion. The proposed training strategy, which combines videos and static images with a reweighted reconstruction loss, significantly improves the quality of object insertion, making VideoAnydoor versatile for various applications without needing specific fine-tuning.'}, 'zh': {'title': '高保真视频对象插入的新突破', 'desc': '尽管视频生成技术取得了显著进展,但将特定对象插入视频仍然是一项具有挑战性的任务。本文提出了VideoAnydoor,这是一个零-shot视频对象插入框架,能够高保真地保留细节并精确控制运动。我们设计了一种像素变形器,能够根据关键点轨迹扭曲像素细节,并与扩散U-Net融合,从而提高细节保留能力。VideoAnydoor在现有方法中表现出显著优势,并支持多种下游应用,无需特定任务的微调。'}}}, {'id': 'https://huggingface.co/papers/2501.01257', 'title': 'CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings', 'url': 'https://huggingface.co/papers/2501.01257', 'abstract': 'With the increasing code reasoning capabilities of existing large language models (LLMs) and breakthroughs in reasoning models like OpenAI o1 and o3, there is a growing need to develop more challenging and comprehensive benchmarks that effectively test their sophisticated competition-level coding abilities. Existing benchmarks, like LiveCodeBench and USACO, fall short due to the unavailability of private test cases, lack of support for special judges, and misaligned execution environments. To bridge this gap, we introduce CodeElo, a standardized competition-level code generation benchmark that effectively addresses all these challenges for the first time. CodeElo benchmark is mainly based on the official CodeForces platform and tries to align with the platform as much as possible. We compile the recent six months of contest problems on CodeForces with detailed information such as contest divisions, problem difficulty ratings, and problem algorithm tags. We introduce a unique judging method in which problems are submitted directly to the platform and develop a reliable Elo rating calculation system that aligns with the platform and is comparable with human participants but has lower variance. By testing on our CodeElo, we provide the Elo ratings of 30 existing popular open-source and 3 proprietary LLMs for the first time. The results show that o1-mini and QwQ-32B-Preview stand out significantly, achieving Elo ratings of 1578 and 1261, respectively, while other models struggle even with the easiest problems, placing in the lowest 20 percent among all human participants. 
Detailed analysis experiments are also conducted to provide insights into performance across algorithms and comparisons between using C++ and Python, which can suggest directions for future studies.', 'score': 36, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'e31430bb6ba5dfc8', 'authors': ['Shanghaoran Quan', 'Jiaxi Yang', 'Bowen Yu', 'Bo Zheng', 'Dayiheng Liu', 'An Yang', 'Xuancheng Ren', 'Bofei Gao', 'Yibo Miao', 'Yunlong Feng', 'Zekun Wang', 'Jian Yang', 'Zeyu Cui', 'Yang Fan', 'Yichang Zhang', 'Binyuan Hui', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.01257.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#reasoning', '#optimization', '#open_source'], 'emoji': '🏆', 'ru': {'title': 'CodeElo: новый стандарт оценки LLM в соревновательном программировании', 'desc': 'Статья представляет новый бенчмарк CodeElo для оценки способностей больших языковых моделей (LLM) в решении задач по программированию соревновательного уровня. CodeElo основан на платформе CodeForces и включает проблемы с детальной информацией о сложности и алгоритмических тегах. Авторы разработали систему расчета рейтинга Эло, сопоставимую с рейтингами человеческих участников. Результаты тестирования 33 LLM показали, что модели o1-mini и QwQ-32B-Preview значительно превосходят остальные, достигая рейтингов 1578 и 1261 соответственно.'}, 'en': {'title': 'CodeElo: Elevating Code Generation Benchmarks for LLMs', 'desc': 'This paper presents CodeElo, a new benchmark designed to evaluate the coding abilities of large language models (LLMs) in a competitive setting. Unlike existing benchmarks, CodeElo addresses limitations such as the lack of private test cases and misaligned execution environments by utilizing the CodeForces platform. The benchmark includes a unique judging method and an Elo rating system that allows for fair comparisons between LLMs and human participants. Results indicate that certain models, like o1-mini, perform significantly better than others, highlighting the varying capabilities of LLMs in code generation tasks.'}, 'zh': {'title': 'CodeElo:提升代码生成能力的标准化基准测试', 'desc': '随着大型语言模型(LLMs)在代码推理能力上的提升,开发更具挑战性和全面性的基准测试变得愈发重要。现有的基准测试如LiveCodeBench和USACO存在一些不足,例如缺乏私有测试用例和特殊评判支持。为了解决这些问题,我们提出了CodeElo,这是一个标准化的竞赛级代码生成基准,首次有效应对这些挑战。通过在CodeForces平台上编译最近六个月的竞赛问题,我们为30个流行的开源和3个专有LLMs提供了Elo评分,结果显示o1-mini和QwQ-32B-Preview表现突出。'}}}, {'id': 'https://huggingface.co/papers/2501.00599', 'title': 'VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM', 'url': 'https://huggingface.co/papers/2501.00599', 'abstract': 'Video Large Language Models (Video LLMs) have recently exhibited remarkable capabilities in general video understanding. However, they mainly focus on holistic comprehension and struggle with capturing fine-grained spatial and temporal details. Besides, the lack of high-quality object-level video instruction data and a comprehensive benchmark further hinders their advancements. To tackle these challenges, we introduce the VideoRefer Suite to empower Video LLM for finer-level spatial-temporal video understanding, i.e., enabling perception and reasoning on any objects throughout the video. Specially, we thoroughly develop VideoRefer Suite across three essential aspects: dataset, model, and benchmark. 
Firstly, we introduce a multi-agent data engine to meticulously curate a large-scale, high-quality object-level video instruction dataset, termed VideoRefer-700K. Next, we present the VideoRefer model, which equips a versatile spatial-temporal object encoder to capture precise regional and sequential representations. Finally, we meticulously create a VideoRefer-Bench to comprehensively assess the spatial-temporal understanding capability of a Video LLM, evaluating it across various aspects. Extensive experiments and analyses demonstrate that our VideoRefer model not only achieves promising performance on video referring benchmarks but also facilitates general video understanding capabilities.', 'score': 31, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'daee687ce36ef3db', 'authors': ['Yuqian Yuan', 'Hang Zhang', 'Wentong Li', 'Zesen Cheng', 'Boqiang Zhang', 'Long Li', 'Xin Li', 'Deli Zhao', 'Wenqiao Zhang', 'Yueting Zhuang', 'Jianke Zhu', 'Lidong Bing'], 'affiliations': ['DAMO Academy, Alibaba Group', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00599.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#dataset', '#optimization', '#video'], 'emoji': '🎥', 'ru': {'title': 'Точное пространственно-временное понимание видео с помощью VideoRefer Suite', 'desc': 'Статья представляет VideoRefer Suite - комплексный подход к улучшению пространственно-временного понимания видео большими языковыми моделями. Авторы разработали масштабный набор данных VideoRefer-700K с инструкциями на уровне объектов, созданный с помощью мультиагентного движка. Они также представили модель VideoRefer с универсальным пространственно-временным кодировщиком объектов. Для оценки возможностей видео-LLM был создан бенчмарк VideoRefer-Bench, охватывающий различные аспекты понимания видео.'}, 'en': {'title': 'Empowering Video LLMs for Fine-Grained Understanding', 'desc': 'This paper introduces the VideoRefer Suite, which enhances Video Large Language Models (Video LLMs) for better understanding of videos by focusing on fine-grained spatial and temporal details. It addresses the limitations of existing models that primarily focus on overall comprehension and lack high-quality object-level instruction data. The suite includes a new dataset called VideoRefer-700K, a specialized VideoRefer model with a spatial-temporal object encoder, and a benchmark for evaluating video understanding capabilities. Experimental results show that the VideoRefer model significantly improves performance on video referring tasks while also enhancing general video comprehension.'}, 'zh': {'title': '提升视频理解,细致捕捉空间与时间', 'desc': '视频大型语言模型(Video LLMs)在视频理解方面展现了出色的能力,但在捕捉细粒度的空间和时间细节上存在困难。为了应对这些挑战,我们提出了VideoRefer Suite,以增强视频LLM在空间-时间视频理解方面的能力。我们开发了一个多代理数据引擎,创建了一个高质量的对象级视频指令数据集VideoRefer-700K,并提出了VideoRefer模型,配备了多功能的空间-时间对象编码器。最后,我们创建了VideoRefer-Bench,以全面评估视频LLM的空间-时间理解能力,实验结果表明我们的模型在视频引用基准上表现优异。'}}}, {'id': 'https://huggingface.co/papers/2501.01423', 'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models', 'url': 'https://huggingface.co/papers/2501.01423', 'abstract': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. 
However, recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality, it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently, existing systems often settle for sub-optimal solutions, either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this, we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models, enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE, we build an enhanced DiT baseline with improved training strategies and architecture designs, termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. Models and codes are available at: https://github.com/hustvl/LightningDiT.', 'score': 30, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '173fa21b6e47d04c', 'authors': ['Jingfeng Yao', 'Xinggang Wang'], 'affiliations': ['Huazhong University of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.01423.jpg', 'data': {'categories': ['#training', '#optimization', '#cv', '#architecture', '#diffusion'], 'emoji': '⚡', 'ru': {'title': 'Революция в латентных диффузионных моделях: быстрее, лучше, эффективнее', 'desc': 'Статья представляет новый подход к улучшению латентных диффузионных моделей с архитектурой Трансформер для генерации изображений высокого качества. Авторы предлагают метод VA-VAE, который выравнивает латентное пространство с предобученными моделями компьютерного зрения. Это позволяет значительно расширить границы реконструкции-генерации и ускорить сходимость Диффузионных Трансформеров в высокоразмерных латентных пространствах. На основе VA-VAE авторы создали улучшенную модель LightningDiT, достигающую современного уровня производительности на задаче генерации изображений ImageNet 256x256.'}, 'en': {'title': 'Accelerating Image Generation with Aligned Latent Spaces', 'desc': 'This paper discusses the challenges faced by latent diffusion models, particularly when using Transformer architectures for image generation. It highlights an optimization issue where increasing the feature dimensions in visual tokenizers can lead to larger models and longer training times, often resulting in sub-optimal image quality. The authors propose a solution by aligning the latent space with pre-trained vision models, introducing a new framework called VA-VAE to enhance the training process. 
Their improved model, LightningDiT, achieves state-of-the-art performance in image generation while significantly speeding up the training process.'}, 'zh': {'title': '优化潜在扩散模型,提升图像生成效率', 'desc': '本论文探讨了潜在扩散模型与变换器架构在生成高质量图像时的优化困境。研究表明,虽然增加视觉标记器中的每个标记特征维度可以提高重建质量,但这也导致需要更大的扩散模型和更多的训练迭代。为了解决这一问题,作者提出将潜在空间与预训练的视觉基础模型对齐,从而提高训练效率。最终,提出的VA-VAE模型显著提升了潜在扩散模型的重建生成能力,并在ImageNet数据集上实现了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00103', 'title': 'LTX-Video: Realtime Video Latent Diffusion', 'url': 'https://huggingface.co/papers/2501.00103', 'abstract': "We introduce LTX-Video, a transformer-based latent diffusion model that adopts a holistic approach to video generation by seamlessly integrating the responsibilities of the Video-VAE and the denoising transformer. Unlike existing methods, which treat these components as independent, LTX-Video aims to optimize their interaction for improved efficiency and quality. At its core is a carefully designed Video-VAE that achieves a high compression ratio of 1:192, with spatiotemporal downscaling of 32 x 32 x 8 pixels per token, enabled by relocating the patchifying operation from the transformer's input to the VAE's input. Operating in this highly compressed latent space enables the transformer to efficiently perform full spatiotemporal self-attention, which is essential for generating high-resolution videos with temporal consistency. However, the high compression inherently limits the representation of fine details. To address this, our VAE decoder is tasked with both latent-to-pixel conversion and the final denoising step, producing the clean result directly in pixel space. This approach preserves the ability to generate fine details without incurring the runtime cost of a separate upsampling module. Our model supports diverse use cases, including text-to-video and image-to-video generation, with both capabilities trained simultaneously. It achieves faster-than-real-time generation, producing 5 seconds of 24 fps video at 768x512 resolution in just 2 seconds on an Nvidia H100 GPU, outperforming all existing models of similar scale. The source code and pre-trained models are publicly available, setting a new benchmark for accessible and scalable video generation.", 'score': 29, 'issue_id': 1484, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': 'a2358f7cf156ff08', 'authors': ['Yoav HaCohen', 'Nisan Chiprut', 'Benny Brazowski', 'Daniel Shalem', 'Dudu Moshe', 'Eitan Richardson', 'Eran Levin', 'Guy Shiran', 'Nir Zabari', 'Ori Gordon', 'Poriya Panet', 'Sapir Weissbuch', 'Victor Kulikov', 'Yaki Bitterman', 'Zeev Melumian', 'Ofir Bibi'], 'affiliations': ['Lightricks'], 'pdf_title_img': 'assets/pdf/title_img/2501.00103.jpg', 'data': {'categories': ['#open_source', '#benchmark', '#video', '#diffusion'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: быстрее реального времени', 'desc': 'LTX-Video - это трансформерная модель латентной диффузии для генерации видео. Она объединяет функции Video-VAE и шумоподавляющего трансформера, оптимизируя их взаимодействие. Модель использует сильно сжатое латентное пространство, позволяя трансформеру эффективно выполнять полное пространственно-временное самовнимание. 
LTX-Video поддерживает генерацию видео из текста и изображений, превосходя существующие модели по скорости и качеству.'}, 'en': {'title': 'Revolutionizing Video Generation with LTX-Video', 'desc': "LTX-Video is a novel transformer-based latent diffusion model designed for efficient video generation by integrating the roles of Video-VAE and denoising transformers. It achieves a high compression ratio of 1:192, allowing the model to operate in a compressed latent space while maintaining spatiotemporal self-attention for generating high-resolution videos. The model's VAE decoder performs both latent-to-pixel conversion and denoising, enabling the generation of fine details without the need for a separate upsampling module. With capabilities for text-to-video and image-to-video generation, LTX-Video produces videos faster than real-time, setting a new standard in the field."}, 'zh': {'title': 'LTX-Video:高效视频生成的新标准', 'desc': 'LTX-Video是一种基于变换器的潜在扩散模型,旨在通过整合视频生成中的Video-VAE和去噪变换器的功能来提高效率和质量。该模型的核心是一个高压缩比的Video-VAE,能够在压缩的潜在空间中高效执行时空自注意力,从而生成高分辨率且具有时间一致性的视频。为了克服高压缩带来的细节损失,VAE解码器同时负责潜在到像素的转换和最终的去噪步骤,直接在像素空间中生成清晰的结果。LTX-Video支持多种应用场景,包括文本到视频和图像到视频的生成,并且在Nvidia H100 GPU上以超实时速度生成视频,设立了视频生成的新基准。'}}}, {'id': 'https://huggingface.co/papers/2501.01264', 'title': 'ProgCo: Program Helps Self-Correction of Large Language Models', 'url': 'https://huggingface.co/papers/2501.01264', 'abstract': 'Self-Correction aims to enable large language models (LLMs) to self-verify and self-refine their initial responses without external feedback. However, LLMs often fail to effectively self-verify and generate correct feedback, further misleading refinement and leading to the failure of self-correction, especially in complex reasoning tasks. In this paper, we propose Program-driven Self-Correction (ProgCo). First, program-driven verification (ProgVe) achieves complex verification logic and extensive validation through self-generated, self-executing verification pseudo-programs. Then, program-driven refinement (ProgRe) receives feedback from ProgVe, conducts dual reflection and refinement on both responses and verification programs to mitigate misleading of incorrect feedback in complex reasoning tasks. Experiments on three instruction-following and mathematical benchmarks indicate that ProgCo achieves effective self-correction, and can further enhance performance when combined with real program tools.', 'score': 22, 'issue_id': 1473, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'bda3f96e83319526', 'authors': ['Xiaoshuai Song', 'Yanan Wu', 'Weixun Wang', 'Jiaheng Liu', 'Wenbo Su', 'Bo Zheng'], 'affiliations': ['Taobao & Tmall Group of Alibaba'], 'pdf_title_img': 'assets/pdf/title_img/2501.01264.jpg', 'data': {'categories': ['#training', '#math', '#reasoning', '#interpretability', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'ProgCo: Самокоррекция языковых моделей через программно-управляемую верификацию и уточнение', 'desc': 'Эта статья представляет новый подход к самокоррекции больших языковых моделей (LLM) под названием Program-driven Self-Correction (ProgCo). Метод включает в себя программно-управляемую верификацию (ProgVe), которая использует самогенерируемые и самовыполняющиеся псевдопрограммы для сложной логики проверки. Затем программно-управляемое уточнение (ProgRe) проводит двойную рефлексию и улучшение как ответов, так и программ верификации. 
Эксперименты показали, что ProgCo эффективен в самокоррекции и может дополнительно улучшить производительность при комбинировании с реальными программными инструментами.'}, 'en': {'title': 'Empowering LLMs with Program-Driven Self-Correction', 'desc': 'This paper introduces Program-driven Self-Correction (ProgCo) to improve the self-verification and self-refinement capabilities of large language models (LLMs). It addresses the common issue where LLMs struggle to provide accurate feedback, which can lead to incorrect refinements, particularly in complex reasoning tasks. ProgCo utilizes program-driven verification (ProgVe) to create self-executing verification pseudo-programs that enhance the verification process. Additionally, program-driven refinement (ProgRe) allows the model to reflect on and refine both its responses and the verification programs, leading to more reliable self-correction outcomes.'}, 'zh': {'title': '基于程序的自我纠正:提升语言模型的自我验证能力', 'desc': '自我纠正旨在使大型语言模型(LLMs)能够在没有外部反馈的情况下自我验证和自我完善其初始响应。然而,LLMs往往无法有效自我验证并生成正确的反馈,这会进一步误导其完善过程,尤其是在复杂推理任务中。本文提出了基于程序的自我纠正(ProgCo),通过自生成、自执行的验证伪程序实现复杂的验证逻辑和广泛的验证。实验结果表明,ProgCo在三个指令遵循和数学基准测试中实现了有效的自我纠正,并且与真实程序工具结合时可以进一步提升性能。'}}}, {'id': 'https://huggingface.co/papers/2501.00316', 'title': 'MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models', 'url': 'https://huggingface.co/papers/2501.00316', 'abstract': "Recent advancements in foundation models have enhanced AI systems' capabilities in autonomous tool usage and reasoning. However, their ability in location or map-based reasoning - which improves daily life by optimizing navigation, facilitating resource discovery, and streamlining logistics - has not been systematically studied. To bridge this gap, we introduce MapEval, a benchmark designed to assess diverse and complex map-based user queries with geo-spatial reasoning. MapEval features three task types (textual, API-based, and visual) that require collecting world information via map tools, processing heterogeneous geo-spatial contexts (e.g., named entities, travel distances, user reviews or ratings, images), and compositional reasoning, which all state-of-the-art foundation models find challenging. Comprising 700 unique multiple-choice questions about locations across 180 cities and 54 countries, MapEval evaluates foundation models' ability to handle spatial relationships, map infographics, travel planning, and navigation challenges. Using MapEval, we conducted a comprehensive evaluation of 28 prominent foundation models. While no single model excelled across all tasks, Claude-3.5-Sonnet, GPT-4o, and Gemini-1.5-Pro achieved competitive performance overall. However, substantial performance gaps emerged, particularly in MapEval, where agents with Claude-3.5-Sonnet outperformed GPT-4o and Gemini-1.5-Pro by 16% and 21%, respectively, and the gaps became even more amplified when compared to open-source LLMs. Our detailed analyses provide insights into the strengths and weaknesses of current models, though all models still fall short of human performance by more than 20% on average, struggling with complex map images and rigorous geo-spatial reasoning. 
This gap highlights MapEval's critical role in advancing general-purpose foundation models with stronger geo-spatial understanding.", 'score': 20, 'issue_id': 1477, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': 'a4e45c6bd9d30ff4', 'authors': ['Mahir Labib Dihan', 'Md Tanvir Hassan', 'Md Tanvir Parvez', 'Md Hasebul Hasan', 'Md Almash Alam', 'Muhammad Aamir Cheema', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Bangladesh Computer Council (BCC)', 'Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Monash University', 'Qatar Computing Research Institute (QCRI)', 'Statistics, Islamic University Bangladesh'], 'pdf_title_img': 'assets/pdf/title_img/2501.00316.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#multimodal', '#survey'], 'emoji': '🗺️', 'ru': {'title': 'MapEval: Новый рубеж в геопространственном ИИ', 'desc': 'Статья представляет MapEval - новый бенчмарк для оценки способностей моделей искусственного интеллекта в области пространственных рассуждений и работы с картами. MapEval включает 700 вопросов с множественным выбором, охватывающих 180 городов и 54 страны, и оценивает навыки моделей в понимании пространственных отношений, инфографики карт, планирования путешествий и навигации. Авторы провели оценку 28 ведущих фундаментальных моделей, выявив значительные различия в производительности, при этом все модели все еще отстают от человеческого уровня более чем на 20%. Результаты исследования подчеркивают важность MapEval для развития моделей с более сильным геопространственным пониманием.'}, 'en': {'title': "Enhancing AI's Geo-Spatial Reasoning with MapEval", 'desc': 'This paper introduces MapEval, a benchmark designed to evaluate the performance of foundation models in map-based reasoning tasks. It focuses on assessing how well these models can handle complex geo-spatial queries, which are essential for navigation and resource discovery. The benchmark includes various task types that require models to process diverse information, such as travel distances and user reviews, and perform compositional reasoning. The evaluation reveals that while some models perform competitively, they still lag behind human capabilities, indicating a need for further advancements in geo-spatial understanding within AI systems.'}, 'zh': {'title': '提升地图推理能力的基准评估', 'desc': '最近基础模型的进展提升了人工智能系统在自主工具使用和推理方面的能力。然而,它们在基于位置或地图的推理能力上尚未得到系统研究,这对于优化导航、资源发现和物流管理至关重要。为了解决这个问题,我们引入了MapEval,一个旨在评估复杂地图用户查询的基准,涉及地理空间推理。MapEval包含700个关于180个城市和54个国家的独特多项选择题,评估基础模型在处理空间关系、地图信息、旅行规划和导航挑战方面的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.01149', 'title': 'A3: Android Agent Arena for Mobile GUI Agents', 'url': 'https://huggingface.co/papers/2501.01149', 'abstract': 'AI agents have become increasingly prevalent in recent years, driven by significant advancements in the field of large language models (LLMs). Mobile GUI agents, a subset of AI agents, are designed to autonomously perform tasks on mobile devices. While numerous studies have introduced agents, datasets, and benchmarks to advance mobile GUI agent research, many existing datasets focus on static frame evaluations and fail to provide a comprehensive platform for assessing performance on real-world, in-the-wild tasks. To address this gap, we present Android Agent Arena (A3), a novel evaluation platform. 
Unlike existing in-the-wild systems, A3 offers: (1) meaningful and practical tasks, such as real-time online information retrieval and operational instructions; (2) a larger, more flexible action space, enabling compatibility with agents trained on any dataset; and (3) an automated business-level LLM-based evaluation process. A3 includes 21 widely used general third-party apps and 201 tasks representative of common user scenarios, providing a robust foundation for evaluating mobile GUI agents in real-world situations and a new autonomous evaluation process that requires less human labor and coding expertise. The project is available at https://yuxiangchai.github.io/Android-Agent-Arena/.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '050f155aa526c100', 'authors': ['Yuxiang Chai', 'Hanhao Li', 'Jiayu Zhang', 'Liang Liu', 'Guozhi Wang', 'Shuai Ren', 'Siyuan Huang', 'Hongsheng Li'], 'affiliations': ['EE department @ CUHK', 'MMLab @ CUHK'], 'pdf_title_img': 'assets/pdf/title_img/2501.01149.jpg', 'data': {'categories': ['#benchmark', '#dataset', '#agents'], 'emoji': '🤖', 'ru': {'title': 'A3: Арена для тестирования мобильных AI-агентов в реальном мире', 'desc': 'Статья представляет новую платформу для оценки мобильных GUI-агентов под названием Android Agent Arena (A3). A3 предлагает реалистичные задачи, широкое пространство действий и автоматизированную оценку на основе больших языковых моделей. Платформа включает 21 популярное стороннее приложение и 201 задачу, отражающую типичные пользовательские сценарии. A3 позволяет оценивать производительность агентов в реальных условиях, что отличает её от существующих статических наборов данных.'}, 'en': {'title': 'Revolutionizing Mobile GUI Agent Evaluation with A3', 'desc': 'This paper introduces the Android Agent Arena (A3), a new evaluation platform for mobile GUI agents that addresses limitations in existing datasets. A3 focuses on real-world tasks, providing a larger action space that accommodates agents trained on various datasets. It features 21 popular third-party apps and 201 tasks that reflect common user scenarios, enhancing the assessment of agent performance. Additionally, A3 incorporates an automated evaluation process using large language models, reducing the need for extensive human involvement and coding skills.'}, 'zh': {'title': 'Android Agent Arena:移动GUI代理的新评估平台', 'desc': '近年来,人工智能代理的应用越来越广泛,尤其是在大型语言模型(LLMs)领域的进步推动下。移动图形用户界面(GUI)代理是人工智能代理的一种,旨在自主执行移动设备上的任务。现有的研究虽然提出了许多代理、数据集和基准,但大多数数据集仅关注静态框架评估,无法全面评估真实世界中的任务表现。为了解决这一问题,我们提出了Android Agent Arena(A3),这是一个新颖的评估平台,提供了实际的任务和更灵活的操作空间,支持基于LLM的自动化评估过程。'}}}, {'id': 'https://huggingface.co/papers/2501.00192', 'title': 'MLLM-as-a-Judge for Image Safety without Human Labeling', 'url': 'https://huggingface.co/papers/2501.00192', 'abstract': 'Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which, however, brings a series of drawbacks. 
First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose an MLLM-based method that includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experiment results demonstrate that our method is highly effective for zero-shot image safety judgment tasks.', 'score': 20, 'issue_id': 1474, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '2a62bcbb87c1b7a5', 'authors': ['Zhenting Wang', 'Shuming Hu', 'Shiyu Zhao', 'Xiaowen Lin', 'Felix Juefei-Xu', 'Zhuowei Li', 'Ligong Han', 'Harihar Subramanyam', 'Li Chen', 'Jianfa Chen', 'Nan Jiang', 'Lingjuan Lyu', 'Shiqing Ma', 'Dimitris N. Metaxas', 'Ankit Jain'], 'affiliations': ['GenAI @ Meta', 'Rutgers University', 'UMass Amherst', 'Westlake University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00192.jpg', 'data': {'categories': ['#reasoning', '#training', '#ethics', '#cv', '#multimodal'], 'emoji': '🛡️', 'ru': {'title': 'Интеллектуальная защита: Zero-shot оценка безопасности изображений с помощью MLLM', 'desc': 'Статья представляет метод определения безопасности изображений с использованием мультимодальных больших языковых моделей (MLLM) в режиме zero-shot. Авторы предлагают подход, включающий объективизацию правил безопасности, оценку релевантности между правилами и изображениями, и быстрое принятие решений на основе дебиасированных вероятностей токенов. Метод также включает каскадные цепочки рассуждений для более глубокого анализа при необходимости. Эксперименты показывают высокую эффективность предложенного метода для задач оценки безопасности изображений без предварительного обучения.'}, 'en': {'title': 'Zero-Shot Image Safety Detection with MLLMs', 'desc': 'This paper addresses the challenge of identifying unsafe images in the context of AI-generated content using Multimodal Large Language Models (MLLMs). The authors propose a novel approach that allows for zero-shot detection of harmful images by utilizing predefined safety rules without the need for extensive human labeling. They highlight the limitations of traditional methods, such as the subjectivity of safety rules and the biases present in models. 
The proposed method enhances safety judgment by objectifying rules, assessing their relevance to images, and employing a reasoning process that simplifies complex safety guidelines.'}, 'zh': {'title': '利用MLLMs实现零样本图像安全判断', 'desc': '随着在线平台视觉媒体的兴起,图像内容安全成为一个重要挑战。许多图像生成模型能够产生有害内容,因此识别不安全图像变得至关重要。我们提出了一种基于预训练多模态大语言模型(MLLMs)的方法,通过查询这些模型来检测不安全图像,而无需依赖人工标注。实验结果表明,我们的方法在零样本图像安全判断任务中非常有效。'}}}, {'id': 'https://huggingface.co/papers/2501.01426', 'title': 'Unifying Specialized Visual Encoders for Video Language Models', 'url': 'https://huggingface.co/papers/2501.01426', 'abstract': 'The recent advent of Large Language Models (LLMs) has ushered sophisticated reasoning capabilities into the realm of video through Video Large Language Models (VideoLLMs). However, VideoLLMs currently rely on a single vision encoder for all of their visual processing, which limits the amount and type of visual information that can be conveyed to the LLM. Our method, MERV, Multi-Encoder Representation of Videos, instead leverages multiple frozen visual encoders to create a unified representation of a video, providing the VideoLLM with a comprehensive set of specialized visual knowledge. Spatio-temporally aligning the features from each encoder allows us to tackle a wider range of open-ended and multiple-choice video understanding questions and outperform prior state-of-the-art works. MERV is up to 3.7% better in accuracy than Video-LLaVA across the standard suite video understanding benchmarks, while also having a better Video-ChatGPT score. We also improve upon SeViLA, the previous best on zero-shot Perception Test accuracy, by 2.2%. MERV introduces minimal extra parameters and trains faster than equivalent single-encoder methods while parallelizing the visual processing. Finally, we provide qualitative evidence that MERV successfully captures domain knowledge from each of its encoders. Our results offer promising directions in utilizing multiple vision encoders for comprehensive video understanding.', 'score': 19, 'issue_id': 1488, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'c868a7ebcbafa704', 'authors': ['Jihoon Chung', 'Tyler Zhu', 'Max Gonzalez Saez-Diez', 'Juan Carlos Niebles', 'Honglu Zhou', 'Olga Russakovsky'], 'affiliations': ['Princeton University', 'Salesforce Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.01426.jpg', 'data': {'categories': ['#architecture', '#reasoning', '#video', '#benchmark', '#multimodal', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'MERV: Многоэнкодерное представление видео для улучшенного машинного понимания', 'desc': 'Статья представляет MERV - новый метод для улучшения понимания видео с помощью больших языковых моделей. MERV использует несколько замороженных визуальных энкодеров для создания единого представления видео, что позволяет охватить больший объем визуальной информации. Этот подход превосходит предыдущие методы в точности на стандартных тестах понимания видео. MERV вводит минимальное количество дополнительных параметров и обучается быстрее, чем эквивалентные методы с одним энкодером.'}, 'en': {'title': 'Unlocking Video Understanding with Multi-Encoder Magic!', 'desc': 'This paper introduces MERV, a method that enhances Video Large Language Models (VideoLLMs) by using multiple visual encoders instead of just one. By combining the outputs of these encoders, MERV creates a richer representation of videos, which helps the model understand complex video content better. 
The approach allows for improved performance on various video understanding tasks, achieving higher accuracy than previous models. Additionally, MERV is efficient, requiring fewer parameters and training time while effectively leveraging the strengths of each encoder.'}, 'zh': {'title': '多编码器提升视频理解能力', 'desc': '本文介绍了一种名为MERV(多编码器视频表示)的方法,旨在提升视频理解的能力。MERV通过使用多个冻结的视觉编码器,创建视频的统一表示,从而为视频大型语言模型(VideoLLM)提供更全面的视觉知识。通过时空对齐每个编码器的特征,MERV能够更好地处理开放式和多选的视频理解问题,且在准确性上超越了之前的最佳模型。该方法不仅提高了性能,还在参数和训练速度上优于单编码器方法,展示了多视觉编码器在视频理解中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01054', 'title': 'Dynamic Scaling of Unit Tests for Code Reward Modeling', 'url': 'https://huggingface.co/papers/2501.01054', 'abstract': 'Current large language models (LLMs) often struggle to produce accurate responses on the first attempt for complex reasoning tasks like code generation. Prior research tackles this challenge by generating multiple candidate solutions and validating them with LLM-generated unit tests. The execution results of unit tests serve as reward signals to identify correct solutions. As LLMs always confidently make mistakes, these unit tests are not reliable, thereby diminishing the quality of reward signals. Motivated by the observation that scaling the number of solutions improves LLM performance, we explore the impact of scaling unit tests to enhance reward signal quality. Our pioneer experiment reveals a positive correlation between the number of unit tests and reward signal quality, with greater benefits observed in more challenging problems. Based on these insights, we propose CodeRM-8B, a lightweight yet effective unit test generator that enables efficient and high-quality unit test scaling. Additionally, we implement a dynamic scaling mechanism that adapts the number of unit tests based on problem difficulty, further improving efficiency. Experimental results show that our approach significantly improves performance across various models on three benchmarks (e.g., with gains of 18.43% for Llama3-8B and 3.42% for GPT-4o-mini on HumanEval Plus).', 'score': 15, 'issue_id': 1474, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '33b9590f2acb0e48', 'authors': ['Zeyao Ma', 'Xiaokang Zhang', 'Jing Zhang', 'Jifan Yu', 'Sijia Luo', 'Jie Tang'], 'affiliations': ['Key Laboratory of Data Engineering and Knowledge Engineering, Beijing, China', 'School of Information, Renmin University of China', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01054.jpg', 'data': {'categories': ['#reasoning', '#benchmark', '#training', '#small_models', '#rlhf', '#optimization'], 'emoji': '🧪', 'ru': {'title': 'Масштабирование юнит-тестов для повышения точности LLM в сложных задачах', 'desc': 'Эта статья посвящена улучшению точности больших языковых моделей (LLM) в задачах сложного мышления, таких как генерация кода. Авторы предлагают метод масштабирования юнит-тестов для повышения качества сигналов вознаграждения при оценке решений. Они разработали легковесный генератор юнит-тестов CodeRM-8B и механизм динамического масштабирования, адаптирующийся к сложности задачи. Эксперименты показали значительное улучшение производительности различных моделей на нескольких тестовых наборах.'}, 'en': {'title': 'Enhancing LLM Performance through Scaled Unit Testing', 'desc': 'This paper addresses the limitations of large language models (LLMs) in generating accurate responses for complex tasks like code generation. 
It highlights the issue of unreliable reward signals from LLM-generated unit tests, which can lead to incorrect solutions. The authors propose a novel approach, CodeRM-8B, which generates a larger number of unit tests to improve the quality of these reward signals. Their experiments demonstrate that scaling unit tests enhances LLM performance, particularly for more challenging problems, leading to significant improvements across various models.'}, 'zh': {'title': '提升单元测试质量,增强模型性能', 'desc': '当前的大型语言模型(LLMs)在复杂推理任务(如代码生成)中,往往难以在第一次尝试时产生准确的响应。以往的研究通过生成多个候选解决方案并使用LLM生成的单元测试进行验证来应对这一挑战。单元测试的执行结果作为奖励信号,用于识别正确的解决方案。然而,由于LLMs常常自信地犯错,这些单元测试的可靠性不足,从而降低了奖励信号的质量。我们提出了CodeRM-8B,一个轻量级且有效的单元测试生成器,能够高效地扩展单元测试,并根据问题的难度动态调整单元测试的数量,从而进一步提高效率。'}}}, {'id': 'https://huggingface.co/papers/2501.01320', 'title': 'SeedVR: Seeding Infinity in Diffusion Transformer Towards Generic Video Restoration', 'url': 'https://huggingface.co/papers/2501.01320', 'abstract': "Video restoration poses non-trivial challenges in maintaining fidelity while recovering temporally consistent details from unknown degradations in the wild. Despite recent advances in diffusion-based restoration, these methods often face limitations in generation capability and sampling efficiency. In this work, we present SeedVR, a diffusion transformer designed to handle real-world video restoration with arbitrary length and resolution. The core design of SeedVR lies in the shifted window attention that facilitates effective restoration on long video sequences. SeedVR further supports variable-sized windows near the boundary of both spatial and temporal dimensions, overcoming the resolution constraints of traditional window attention. Equipped with contemporary practices, including causal video autoencoder, mixed image and video training, and progressive training, SeedVR achieves highly-competitive performance on both synthetic and real-world benchmarks, as well as AI-generated videos. Extensive experiments demonstrate SeedVR's superiority over existing methods for generic video restoration.", 'score': 8, 'issue_id': 1479, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': 'fa277e5baed864a4', 'authors': ['Jianyi Wang', 'Zhijie Lin', 'Meng Wei', 'Yang Zhao', 'Ceyuan Yang', 'Chen Change Loy', 'Lu Jiang'], 'affiliations': ['ByteDance', 'Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01320.jpg', 'data': {'categories': ['#architecture', '#benchmark', '#long_context', '#video', '#training', '#diffusion', '#synthetic'], 'emoji': '🎥', 'ru': {'title': 'SeedVR: Восстановление видео нового поколения с помощью диффузионных трансформеров', 'desc': 'SeedVR - это диффузионный трансформер для восстановления видео в реальных условиях. Он использует сдвинутое оконное внимание для эффективной обработки длинных видеопоследовательностей. SeedVR поддерживает окна переменного размера на границах пространственных и временных измерений, преодолевая ограничения традиционного оконного внимания. Благодаря современным практикам, таким как каузальный видеоавтоэнкодер и прогрессивное обучение, SeedVR достигает высоких результатов на синтетических и реальных тестовых наборах.'}, 'en': {'title': 'SeedVR: Revolutionizing Video Restoration with Diffusion Transformers', 'desc': 'This paper introduces SeedVR, a novel diffusion transformer aimed at improving video restoration by effectively managing long sequences and varying resolutions. 
It utilizes shifted window attention to enhance the restoration process, allowing for better handling of temporal consistency and fidelity in videos. SeedVR incorporates advanced techniques such as causal video autoencoders and mixed training strategies to boost its performance on both synthetic and real-world datasets. The results show that SeedVR outperforms existing video restoration methods, making it a significant advancement in the field.'}, 'zh': {'title': 'SeedVR:高效的视频修复新方法', 'desc': '视频修复面临着在恢复未知退化的同时保持细节一致性的挑战。尽管基于扩散的修复方法有所进展,但它们在生成能力和采样效率上仍存在局限性。本文提出了SeedVR,这是一种专为处理任意长度和分辨率的真实视频修复而设计的扩散变换器。SeedVR通过移动窗口注意力机制,有效地处理长视频序列,并在空间和时间维度的边界附近支持可变大小的窗口,克服了传统窗口注意力的分辨率限制。'}}}, {'id': 'https://huggingface.co/papers/2412.21015', 'title': 'MapQaTor: A System for Efficient Annotation of Map Query Datasets', 'url': 'https://huggingface.co/papers/2412.21015', 'abstract': 'Mapping and navigation services like Google Maps, Apple Maps, and Openstreet Maps are essential for accessing various location-based data, yet they often struggle to handle natural language geospatial queries. Recent advancements in Large Language Models (LLMs) show promise in question answering (QA), but creating reliable geospatial QA datasets from map services remains challenging. We introduce MapQaTor, a web application that streamlines the creation of reproducible, traceable map-based QA datasets. With its plug-and-play architecture, MapQaTor enables seamless integration with any maps API, allowing users to gather and visualize data from diverse sources with minimal setup. By caching API responses, the platform ensures consistent ground truth, enhancing the reliability of the data even as real-world information evolves. MapQaTor centralizes data retrieval, annotation, and visualization within a single platform, offering a unique opportunity to evaluate the current state of LLM-based geospatial reasoning while advancing their capabilities for improved geospatial understanding. Evaluation metrics show that MapQaTor speeds up the annotation process by at least 30 times compared to manual methods, underscoring its potential for developing geospatial resources, such as complex map reasoning datasets. The website is live at: https://mapqator.github.io/ and a demo video is available at: https://youtu.be/7_aV9Wmhs6Q.', 'score': 8, 'issue_id': 1477, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '0d1081756b5bc4f7', 'authors': ['Mahir Labib Dihan', 'Mohammed Eunus Ali', 'Md Rizwan Parvez'], 'affiliations': ['Department of Computer Science and Engineering Bangladesh University of Engineering and Technology (BUET)', 'Qatar Computing Research Institute (QCRI)'], 'pdf_title_img': 'assets/pdf/title_img/2412.21015.jpg', 'data': {'categories': ['#dataset', '#science', '#reasoning', '#data', '#benchmark'], 'emoji': '🗺️', 'ru': {'title': 'MapQaTor: Революция в создании геопространственных данных для ИИ', 'desc': 'MapQaTor - это веб-приложение, которое упрощает создание воспроизводимых наборов данных для вопросно-ответных систем на основе карт. Оно интегрируется с любым картографическим API и позволяет собирать и визуализировать данные из различных источников. MapQaTor кэширует ответы API, обеспечивая согласованность данных, и централизует процессы сбора, аннотации и визуализации. 
Приложение ускоряет процесс аннотации в 30 раз по сравнению с ручными методами, что делает его полезным инструментом для развития геопространственных ресурсов и оценки возможностей больших языковых моделей в области геопространственных рассуждений.'}, 'en': {'title': 'Streamlining Geospatial QA with MapQaTor', 'desc': 'This paper presents MapQaTor, a web application designed to facilitate the creation of geospatial question answering (QA) datasets using map services. It leverages recent advancements in Large Language Models (LLMs) to improve the handling of natural language queries related to locations. The platform features a plug-and-play architecture that integrates with various maps APIs, allowing users to efficiently gather, annotate, and visualize geospatial data. By caching API responses, MapQaTor ensures consistent and reliable data, significantly speeding up the annotation process and enhancing the evaluation of LLM-based geospatial reasoning capabilities.'}, 'zh': {'title': 'MapQaTor:提升地图问答数据集创建效率的利器', 'desc': '本文介绍了MapQaTor,一个用于创建地图问答数据集的网络应用程序。它利用大型语言模型的优势,简化了从地图服务生成可重复和可追溯的数据集的过程。MapQaTor支持与任何地图API的无缝集成,并通过缓存API响应来确保数据的一致性。该平台显著提高了数据标注的效率,展示了在地理空间推理方面的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.01407', 'title': 'Nested Attention: Semantic-aware Attention Values for Concept Personalization', 'url': 'https://huggingface.co/papers/2501.01407', 'abstract': "Personalizing text-to-image models to generate images of specific subjects across diverse scenes and styles is a rapidly advancing field. Current approaches often face challenges in maintaining a balance between identity preservation and alignment with the input text prompt. Some methods rely on a single textual token to represent a subject, which limits expressiveness, while others employ richer representations but disrupt the model's prior, diminishing prompt alignment. In this work, we introduce Nested Attention, a novel mechanism that injects a rich and expressive image representation into the model's existing cross-attention layers. Our key idea is to generate query-dependent subject values, derived from nested attention layers that learn to select relevant subject features for each region in the generated image. We integrate these nested layers into an encoder-based personalization method, and show that they enable high identity preservation while adhering to input text prompts. Our approach is general and can be trained on various domains. Additionally, its prior preservation allows us to combine multiple personalized subjects from different domains in a single image.", 'score': 7, 'issue_id': 1487, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '537e7bcc16fb17f5', 'authors': ['Or Patashnik', 'Rinon Gal', 'Daniil Ostashev', 'Sergey Tulyakov', 'Kfir Aberman', 'Daniel Cohen-Or'], 'affiliations': ['Snap Research', 'Tel Aviv University'], 'pdf_title_img': 'assets/pdf/title_img/2501.01407.jpg', 'data': {'categories': ['#multimodal', '#architecture', '#cv'], 'emoji': '🎨', 'ru': {'title': 'Nested Attention: новый подход к персонализации генерации изображений', 'desc': "Статья представляет новый метод под названием 'Nested Attention' для персонализации моделей text-to-image. Этот механизм внедряет богатое и выразительное представление изображения в существующие слои кросс-внимания модели. Ключевая идея заключается в генерации зависимых от запроса значений субъекта, полученных из вложенных слоев внимания. 
Метод позволяет достичь высокого сохранения идентичности при соблюдении входных текстовых подсказок."}, 'en': {'title': 'Nested Attention: Balancing Identity and Text Alignment in Image Generation', 'desc': 'This paper presents a new method called Nested Attention for personalizing text-to-image models. The method addresses the challenge of balancing identity preservation of subjects with the alignment to text prompts. By using query-dependent subject values from nested attention layers, the model can effectively select relevant features for each part of the generated image. This approach not only maintains high identity fidelity but also allows for the integration of multiple personalized subjects from different domains into a single image.'}, 'zh': {'title': '嵌套注意力:个性化图像生成的新方法', 'desc': '本文介绍了一种新的机制,称为嵌套注意力,用于个性化文本到图像模型。该方法通过在模型的交叉注意力层中注入丰富的图像表示,解决了身份保留与文本提示对齐之间的平衡问题。嵌套注意力层能够为生成图像的每个区域选择相关的主题特征,从而实现高效的个性化。我们的研究表明,这种方法可以在多个领域进行训练,并允许在单个图像中结合来自不同领域的多个个性化主题。'}}}, {'id': 'https://huggingface.co/papers/2501.00658', 'title': 'Understanding and Mitigating Bottlenecks of State Space Models through the Lens of Recency and Over-smoothing', 'url': 'https://huggingface.co/papers/2501.00658', 'abstract': "Structured State Space Models (SSMs) have emerged as alternatives to transformers. While SSMs are often regarded as effective in capturing long-sequence dependencies, we rigorously demonstrate that they are inherently limited by strong recency bias. Our empirical studies also reveal that this bias impairs the models' ability to recall distant information and introduces robustness issues. Our scaling experiments then discovered that deeper structures in SSMs can facilitate the learning of long contexts. However, subsequent theoretical analysis reveals that as SSMs increase in depth, they exhibit another inevitable tendency toward over-smoothing, e.g., token representations becoming increasingly indistinguishable. This fundamental dilemma between recency and over-smoothing hinders the scalability of existing SSMs. Inspired by our theoretical findings, we propose to polarize two channels of the state transition matrices in SSMs, setting them to zero and one, respectively, simultaneously addressing recency bias and over-smoothing. Experiments demonstrate that our polarization technique consistently enhances the associative recall accuracy of long-range tokens and unlocks SSMs to benefit further from deeper architectures. All source codes are released at https://github.com/VITA-Group/SSM-Bottleneck.", 'score': 6, 'issue_id': 1476, 'pub_date': '2025-12-31', 'pub_date_card': {'ru': '31 декабря', 'en': 'December 31', 'zh': '12月31日'}, 'hash': '253304ea64defbe0', 'authors': ['Peihao Wang', 'Ruisi Cai', 'Yuehao Wang', 'Jiajun Zhu', 'Pragya Srivastava', 'Zhangyang Wang', 'Pan Li'], 'affiliations': ['Georgia Tech', 'Google DeepMind', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00658.jpg', 'data': {'categories': ['#training', '#open_source', '#long_context', '#optimization', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Преодоление ограничений SSM: баланс между недавностью и сглаживанием', 'desc': 'Структурированные модели пространства состояний (SSM) рассматриваются как альтернатива трансформерам в обработке длинных последовательностей. Исследование показало, что SSM имеют существенное ограничение в виде сильного смещения к недавним данным, что затрудняет запоминание отдаленной информации. 
Увеличение глубины SSM улучшает обработку длинных контекстов, но приводит к проблеме чрезмерного сглаживания. Авторы предлагают метод поляризации каналов матриц перехода состояний для решения этих проблем, что улучшает точность ассоциативного извлечения дальних токенов.'}, 'en': {'title': 'Balancing Recency and Over-Smoothing in SSMs', 'desc': "This paper discusses Structured State Space Models (SSMs) as alternatives to transformers, highlighting their limitations due to strong recency bias. This bias affects the models' ability to remember distant information and creates robustness issues. The authors propose a solution by polarizing the state transition matrices, which helps mitigate both recency bias and over-smoothing that occurs with deeper architectures. Their experiments show that this new approach improves the accuracy of recalling long-range tokens, allowing SSMs to effectively utilize deeper structures."}, 'zh': {'title': '解决近期偏见与过平滑的双重挑战', 'desc': '结构状态空间模型(SSMs)作为变换器的替代方案,虽然在捕捉长序列依赖性方面表现出色,但存在强烈的近期偏见限制。我们的实证研究表明,这种偏见影响了模型对远程信息的回忆能力,并引入了鲁棒性问题。通过扩展实验,我们发现SSMs的深层结构可以促进长上下文的学习,但理论分析显示,随着深度增加,模型会出现过平滑的趋势,使得标记表示变得难以区分。我们提出的极化技术通过将状态转移矩阵的两个通道设置为零和一,解决了近期偏见和过平滑的问题,显著提高了长距离标记的关联回忆准确性。'}}}, {'id': 'https://huggingface.co/papers/2501.01245', 'title': 'SeFAR: Semi-supervised Fine-grained Action Recognition with Temporal Perturbation and Learning Stabilization', 'url': 'https://huggingface.co/papers/2501.01245', 'abstract': 'Human action understanding is crucial for the advancement of multimodal systems. While recent developments, driven by powerful large language models (LLMs), aim to be general enough to cover a wide range of categories, they often overlook the need for more specific capabilities. In this work, we address the more challenging task of Fine-grained Action Recognition (FAR), which focuses on detailed semantic labels within shorter temporal duration (e.g., "salto backward tucked with 1 turn"). Given the high costs of annotating fine-grained labels and the substantial data needed for fine-tuning LLMs, we propose to adopt semi-supervised learning (SSL). Our framework, SeFAR, incorporates several innovative designs to tackle these challenges. Specifically, to capture sufficient visual details, we construct Dual-level temporal elements as more effective representations, based on which we design a new strong augmentation strategy for the Teacher-Student learning paradigm through involving moderate temporal perturbation. Furthermore, to handle the high uncertainty within the teacher model\'s predictions for FAR, we propose the Adaptive Regulation to stabilize the learning process. Experiments show that SeFAR achieves state-of-the-art performance on two FAR datasets, FineGym and FineDiving, across various data scopes. It also outperforms other semi-supervised methods on two classical coarse-grained datasets, UCF101 and HMDB51. Further analysis and ablation studies validate the effectiveness of our designs. 
Additionally, we show that the features extracted by our SeFAR could largely promote the ability of multimodal foundation models to understand fine-grained and domain-specific semantics.', 'score': 5, 'issue_id': 1475, 'pub_date': '2025-01-02', 'pub_date_card': {'ru': '2 января', 'en': 'January 2', 'zh': '1月2日'}, 'hash': '30d94590a5c78569', 'authors': ['Yongle Huang', 'Haodong Chen', 'Zhenbang Xu', 'Zihan Jia', 'Haozhou Sun', 'Dian Shao'], 'affiliations': ['School of Automation, Northwestern Polytechnical University, Xian, China', 'School of Computer Science, Northwestern Polytechnical University, Xian, China', 'School of Software, Northwestern Polytechnical University, Xian, China', 'Unmanned System Research Institute, Northwestern Polytechnical University, Xian, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01245.jpg', 'data': {'categories': ['#dataset', '#transfer_learning', '#multimodal', '#optimization', '#training'], 'emoji': '🤸', 'ru': {'title': 'SeFAR: Прорыв в распознавании детализированных действий с помощью полу-контролируемого обучения', 'desc': 'Статья представляет новый подход к задаче распознавания детализированных действий (Fine-grained Action Recognition, FAR) с использованием полу-контролируемого обучения. Авторы предлагают фреймворк SeFAR, который включает в себя двухуровневые временные элементы для более эффективного представления действий и новую стратегию аугментации данных. SeFAR также использует адаптивную регуляцию для стабилизации процесса обучения при работе с неопределенностью в предсказаниях модели-учителя. Эксперименты показывают, что SeFAR достигает лучших результатов на нескольких наборах данных FAR и классических наборах данных для распознавания действий.'}, 'en': {'title': 'SeFAR: Elevating Fine-grained Action Recognition with Semi-supervised Learning', 'desc': "This paper focuses on improving Fine-grained Action Recognition (FAR), which identifies specific actions in short time frames. The authors introduce a semi-supervised learning framework called SeFAR, which uses innovative techniques to enhance the learning process despite the challenges of limited labeled data. They develop Dual-level temporal elements for better visual representation and implement a strong augmentation strategy within a Teacher-Student learning setup. The results demonstrate that SeFAR achieves top performance on FAR datasets and enhances multimodal models' understanding of detailed actions."}, 'zh': {'title': '细粒度动作识别的新突破', 'desc': '人类动作理解对多模态系统的发展至关重要。本文提出了一种新的框架SeFAR,专注于细粒度动作识别(FAR),旨在处理短时间内的详细语义标签。我们采用半监督学习(SSL)来减少对大量标注数据的需求,并通过构建双层时间元素和新的强增强策略来提高模型的表现。实验结果表明,SeFAR在多个数据集上达到了最先进的性能,证明了我们设计的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.00910', 'title': 'Population Aware Diffusion for Time Series Generation', 'url': 'https://huggingface.co/papers/2501.00910', 'abstract': 'Diffusion models have shown promising ability in generating high-quality time series (TS) data. Despite the initial success, existing works mostly focus on the authenticity of data at the individual level, but pay less attention to preserving the population-level properties on the entire dataset. Such population-level properties include value distributions for each dimension and distributions of certain functional dependencies (e.g., cross-correlation, CC) between different dimensions. For instance, when generating house energy consumption TS data, the value distributions of the outside temperature and the kitchen temperature should be preserved, as well as the distribution of CC between them. 
Preserving such TS population-level properties is critical in maintaining the statistical insights of the datasets, mitigating model bias, and augmenting downstream tasks like TS prediction. Yet, it is often overlooked by existing models. Hence, data generated by existing models often bear distribution shifts from the original data. We propose Population-aware Diffusion for Time Series (PaD-TS), a new TS generation model that better preserves the population-level properties. The key novelties of PaD-TS include 1) a new training method explicitly incorporating TS population-level property preservation, and 2) a new dual-channel encoder model architecture that better captures the TS data structure. Empirical results in major benchmark datasets show that PaD-TS can improve the average CC distribution shift score between real and synthetic data by 5.9x while maintaining a performance comparable to state-of-the-art models on individual-level authenticity.', 'score': 4, 'issue_id': 1486, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'cd3f9282d55e15f2', 'authors': ['Yang Li', 'Han Meng', 'Zhenyu Bi', 'Ingolv T. Urnes', 'Haipeng Chen'], 'affiliations': ['Generated Health', 'Virginia Tech', 'William & Mary'], 'pdf_title_img': 'assets/pdf/title_img/2501.00910.jpg', 'data': {'categories': ['#synthetic', '#benchmark', '#dataset', '#data', '#training', '#architecture', '#diffusion'], 'emoji': '📊', 'ru': {'title': 'Генерация временных рядов с сохранением свойств популяции', 'desc': 'Статья представляет новую модель генерации временных рядов под названием PaD-TS (Population-aware Diffusion for Time Series). Модель нацелена на сохранение свойств на уровне популяции, таких как распределения значений и функциональные зависимости между измерениями. PaD-TS использует новый метод обучения, явно включающий сохранение свойств временных рядов на уровне популяции, и новую архитектуру модели с двухканальным энкодером. Эмпирические результаты показывают значительное улучшение в сохранении распределения кросс-корреляций при сравнимой аутентичности на индивидуальном уровне.'}, 'en': {'title': 'Preserving Population Insights in Time Series Generation', 'desc': 'This paper introduces a new model called Population-aware Diffusion for Time Series (PaD-TS) that focuses on generating time series data while preserving important population-level properties. Unlike previous models that mainly ensure individual data authenticity, PaD-TS emphasizes maintaining the overall statistical characteristics of the dataset, such as value distributions and cross-correlations between different dimensions. The model employs a novel training method and a dual-channel encoder architecture to effectively capture the structure of time series data. 
Experimental results demonstrate that PaD-TS significantly reduces distribution shifts in generated data while achieving comparable performance in individual-level authenticity to existing state-of-the-art models.'}, 'zh': {'title': '保留人口级特性,提升时间序列生成质量', 'desc': '扩散模型在生成高质量时间序列数据方面表现出色。然而,现有研究主要关注个体数据的真实性,而忽视了整个数据集的人口级特性。我们提出了一种新的时间序列生成模型PaD-TS,旨在更好地保留这些人口级特性,包括值分布和不同维度之间的交叉相关性。实验结果表明,PaD-TS在保持个体级真实性的同时,显著改善了真实数据与合成数据之间的分布差异。'}}}, {'id': 'https://huggingface.co/papers/2501.00712', 'title': 'Rethinking Addressing in Language Models via Contexualized Equivariant Positional Encoding', 'url': 'https://huggingface.co/papers/2501.00712', 'abstract': 'Transformers rely on both content-based and position-based addressing mechanisms to make predictions, but existing positional encoding techniques often diminish the effectiveness of position-based addressing. Many current methods enforce rigid patterns in attention maps, limiting the ability to model long-range dependencies and adapt to diverse tasks. Additionally, most positional encodings are learned as general biases, lacking the specialization required for different instances within a dataset. To address this, we propose conTextualized equivariAnt Position Embedding (TAPE), a novel framework that enhances positional embeddings by incorporating sequence content across layers. TAPE introduces dynamic, context-aware positional encodings, overcoming the constraints of traditional fixed patterns. By enforcing permutation and orthogonal equivariance, TAPE ensures the stability of positional encodings during updates, improving robustness and adaptability. Our method can be easily integrated into pre-trained transformers, offering parameter-efficient fine-tuning with minimal overhead. Extensive experiments show that TAPE achieves superior performance in language modeling, arithmetic reasoning, and long-context retrieval tasks compared to existing positional embedding techniques.', 'score': 4, 'issue_id': 1485, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'e5119d0e83ce2af2', 'authors': ['Jiajun Zhu', 'Peihao Wang', 'Ruisi Cai', 'Jason D. Lee', 'Pan Li', 'Zhangyang Wang'], 'affiliations': ['Georgia Tech', 'Princeton University', 'University of Texas at Austin', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.00712.jpg', 'data': {'categories': ['#long_context', '#optimization', '#training', '#architecture', '#reasoning'], 'emoji': '🔀', 'ru': {'title': 'Динамические позиционные эмбеддинги для улучшения работы трансформеров', 'desc': 'Авторы предлагают новый метод позиционного кодирования для трансформеров под названием TAPE. Этот подход учитывает контекст последовательности и создает динамические позиционные эмбеддинги, адаптированные к конкретным задачам. TAPE обеспечивает стабильность кодирования благодаря свойствам перестановочной и ортогональной эквивариантности. Метод легко интегрируется в предобученные модели и показывает превосходные результаты в задачах языкового моделирования, арифметических рассуждений и поиска в длинных контекстах.'}, 'en': {'title': 'Enhancing Transformers with Context-Aware Positional Embeddings', 'desc': "This paper introduces a new method called conTextualized equivariAnt Position Embedding (TAPE) to improve how transformers use positional information. Traditional positional encodings often restrict the model's ability to understand long-range relationships in data. 
TAPE enhances these encodings by making them dynamic and context-aware, allowing them to adapt to different sequences and tasks. The method shows better performance in various applications, such as language modeling and reasoning, while being easy to integrate into existing transformer models."}, 'zh': {'title': '提升变换器模型的位置信息处理能力', 'desc': '本文提出了一种新的位置编码方法,称为TAPE(conTextualized equivariAnt Position Embedding),旨在提高变换器模型的预测能力。传统的位置编码方法往往限制了模型对长距离依赖关系的建模能力,而TAPE通过引入动态的、上下文感知的位置编码来克服这一问题。该方法确保了位置编码在更新过程中的稳定性,从而提高了模型的鲁棒性和适应性。实验结果表明,TAPE在语言建模、算术推理和长上下文检索任务中表现优于现有的位置编码技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05441', 'title': 'The GAN is dead; long live the GAN! A Modern GAN Baseline', 'url': 'https://huggingface.co/papers/2501.05441', 'abstract': 'There is a widely-spread claim that GANs are difficult to train, and GAN architectures in the literature are littered with empirical tricks. We provide evidence against this claim and build a modern GAN baseline in a more principled manner. First, we derive a well-behaved regularized relativistic GAN loss that addresses issues of mode dropping and non-convergence that were previously tackled via a bag of ad-hoc tricks. We analyze our loss mathematically and prove that it admits local convergence guarantees, unlike most existing relativistic losses. Second, our new loss allows us to discard all ad-hoc tricks and replace outdated backbones used in common GANs with modern architectures. Using StyleGAN2 as an example, we present a roadmap of simplification and modernization that results in a new minimalist baseline -- R3GAN. Despite being simple, our approach surpasses StyleGAN2 on FFHQ, ImageNet, CIFAR, and Stacked MNIST datasets, and compares favorably against state-of-the-art GANs and diffusion models.', 'score': 51, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'eb1cd90c4d5cb0ef', 'authors': ['Yiwen Huang', 'Aaron Gokaslan', 'Volodymyr Kuleshov', 'James Tompkin'], 'affiliations': ['Brown University', 'Cornell University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05441.jpg', 'data': {'categories': ['#training', '#architecture', '#diffusion', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Упрощение и модернизация GAN: новый взгляд на обучение генеративных моделей', 'desc': 'Исследователи опровергают распространенное мнение о сложности обучения генеративно-состязательных сетей (GAN). Они разработали новый регуляризованный релятивистский GAN-лосс, который решает проблемы потери мод и отсутствия сходимости. Авторы математически доказывают, что их лосс обеспечивает локальную сходимость, в отличие от большинства существующих релятивистских лоссов. На основе этого подхода они создали минималистичную базовую модель R3GAN, которая превосходит StyleGAN2 и другие современные GAN на нескольких наборах данных.'}, 'en': {'title': 'Simplifying GAN Training with R3GAN: A New Era of Efficiency', 'desc': 'This paper challenges the common belief that Generative Adversarial Networks (GANs) are inherently difficult to train. It introduces a new GAN loss function called the regularized relativistic GAN loss, which effectively addresses issues like mode dropping and non-convergence without relying on numerous empirical tricks. The authors provide mathematical analysis showing that their loss function guarantees local convergence, which is a significant improvement over existing methods. 
By applying this new loss to modern architectures like StyleGAN2, they create a simplified and efficient GAN model named R3GAN, which outperforms previous models on several benchmark datasets.'}, 'zh': {'title': '简化GAN训练,超越传统架构', 'desc': '这篇论文探讨了生成对抗网络(GAN)训练的难点,并提出了一种新的方法来简化这一过程。作者提出了一种正则化的相对GAN损失函数,解决了模式丢失和非收敛的问题。通过数学分析,证明了这种损失函数具有局部收敛的保证,优于现有的相对损失函数。最终,作者展示了一个新的简约基线R3GAN,其在多个数据集上的表现超过了StyleGAN2,并与最先进的GAN和扩散模型相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.05032', 'title': 'Enhancing Human-Like Responses in Large Language Models', 'url': 'https://huggingface.co/papers/2501.05032', 'abstract': 'This paper explores the advancements in making large language models (LLMs) more human-like. We focus on techniques that enhance natural language understanding, conversational coherence, and emotional intelligence in AI systems. The study evaluates various approaches, including fine-tuning with diverse datasets, incorporating psychological principles, and designing models that better mimic human reasoning patterns. Our findings demonstrate that these enhancements not only improve user interactions but also open new possibilities for AI applications across different domains. Future work will address the ethical implications and potential biases introduced by these human-like attributes.', 'score': 28, 'issue_id': 1609, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '64e14687fd1e5dab', 'authors': ['Ethem Yağız Çalık', 'Talha Rüzgar Akkuş'], 'affiliations': ['Hugging Face'], 'pdf_title_img': 'assets/pdf/title_img/2501.05032.jpg', 'data': {'categories': ['#training', '#alignment', '#rlhf', '#ethics', '#multimodal'], 'emoji': '🤖', 'ru': {'title': 'Путь к человекоподобному ИИ: улучшение больших языковых моделей', 'desc': 'Статья исследует методы повышения человекоподобности больших языковых моделей (LLM). Авторы рассматривают техники улучшения понимания естественного языка, связности диалогов и эмоционального интеллекта в системах искусственного интеллекта. Исследование оценивает различные подходы, включая дообучение на разнообразных датасетах, внедрение психологических принципов и разработку моделей, лучше имитирующих человеческие паттерны мышления. Результаты показывают, что эти улучшения не только совершенствуют взаимодействие с пользователем, но и открывают новые возможности для применения ИИ в различных областях.'}, 'en': {'title': 'Enhancing AI: Making Language Models More Human-Like', 'desc': 'This paper investigates how to make large language models (LLMs) behave more like humans. It emphasizes improving natural language understanding, making conversations more coherent, and increasing emotional intelligence in AI. The research assesses methods such as fine-tuning models with varied datasets and applying psychological principles to enhance human-like reasoning. The results show that these improvements lead to better user experiences and expand the potential uses of AI, while also highlighting the need to consider ethical issues and biases that may arise.'}, 'zh': {'title': '让人工智能更像人类的未来', 'desc': '本文探讨了使大型语言模型(LLMs)更具人性化的进展。我们重点关注增强自然语言理解、对话连贯性和情感智能的技术。研究评估了多种方法,包括使用多样化数据集进行微调、融入心理学原理,以及设计更好模拟人类推理模式的模型。我们的发现表明,这些增强不仅改善了用户互动,还为不同领域的人工智能应用开辟了新可能。'}}}, {'id': 'https://huggingface.co/papers/2501.05453', 'title': 'An Empirical Study of Autoregressive Pre-training from Videos', 'url': 'https://huggingface.co/papers/2501.05453', 'abstract': 'We empirically study autoregressive pre-training from videos. 
To perform our study, we construct a series of autoregressive video models, called Toto. We treat videos as sequences of visual tokens and train transformer models to autoregressively predict future tokens. Our models are pre-trained on a diverse dataset of videos and images comprising over 1 trillion visual tokens. We explore different architectural, training, and inference design choices. We evaluate the learned visual representations on a range of downstream tasks including image recognition, video classification, object tracking, and robotics. Our results demonstrate that, despite minimal inductive biases, autoregressive pre-training leads to competitive performance across all benchmarks. Finally, we find that scaling our video models results in similar scaling curves to those seen in language models, albeit with a different rate. More details at https://brjathu.github.io/toto/', 'score': 28, 'issue_id': 1596, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '3846ea8507d046be', 'authors': ['Jathushan Rajasegaran', 'Ilija Radosavovic', 'Rahul Ravishankar', 'Yossi Gandelsman', 'Christoph Feichtenhofer', 'Jitendra Malik'], 'affiliations': ['Meta FAIR', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.05453.jpg', 'data': {'categories': ['#training', '#dataset', '#benchmark', '#architecture', '#robotics', '#video', '#cv'], 'emoji': '🎬', 'ru': {'title': 'Авторегрессионное предобучение видео: путь к универсальному компьютерному зрению', 'desc': 'В статье исследуется авторегрессионное предобучение на видеоданных с использованием модели Toto. Авторы рассматривают видео как последовательности визуальных токенов и обучают трансформеры предсказывать будущие токены. Модели предобучаются на разнообразном наборе данных из более чем триллиона визуальных токенов. Результаты показывают, что такой подход дает конкурентоспособную производительность на различных задачах компьютерного зрения.'}, 'en': {'title': 'Unlocking Video Understanding with Autoregressive Models', 'desc': 'This paper investigates the use of autoregressive pre-training for video data through a series of models named Toto. The authors treat videos as sequences of visual tokens and employ transformer architectures to predict future tokens in these sequences. They pre-train their models on a massive dataset containing over 1 trillion visual tokens, exploring various design choices in architecture and training. The results show that these autoregressive models achieve strong performance on tasks like image recognition and video classification, indicating that scaling video models can yield similar benefits as seen in language models.'}, 'zh': {'title': '自回归预训练:视频模型的新突破', 'desc': '本文研究了视频的自回归预训练。我们构建了一系列名为Toto的自回归视频模型,将视频视为视觉标记的序列,并训练变换器模型以自回归方式预测未来的标记。我们的模型在一个包含超过1万亿视觉标记的多样化视频和图像数据集上进行预训练,并在多个下游任务上评估学习到的视觉表示。结果表明,尽管诱导偏差较小,自回归预训练在所有基准测试中表现出竞争力的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04003', 'title': 'Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives', 'url': 'https://huggingface.co/papers/2501.04003', 'abstract': "Recent advancements in Vision-Language Models (VLMs) have sparked interest in their use for autonomous driving, particularly in generating interpretable driving decisions through natural language. However, the assumption that VLMs inherently provide visually grounded, reliable, and interpretable explanations for driving remains largely unexamined. 
To address this gap, we introduce DriveBench, a benchmark dataset designed to evaluate VLM reliability across 17 settings (clean, corrupted, and text-only inputs), encompassing 19,200 frames, 20,498 question-answer pairs, three question types, four mainstream driving tasks, and a total of 12 popular VLMs. Our findings reveal that VLMs often generate plausible responses derived from general knowledge or textual cues rather than true visual grounding, especially under degraded or missing visual inputs. This behavior, concealed by dataset imbalances and insufficient evaluation metrics, poses significant risks in safety-critical scenarios like autonomous driving. We further observe that VLMs struggle with multi-modal reasoning and display heightened sensitivity to input corruptions, leading to inconsistencies in performance. To address these challenges, we propose refined evaluation metrics that prioritize robust visual grounding and multi-modal understanding. Additionally, we highlight the potential of leveraging VLMs' awareness of corruptions to enhance their reliability, offering a roadmap for developing more trustworthy and interpretable decision-making systems in real-world autonomous driving contexts. The benchmark toolkit is publicly accessible.", 'score': 20, 'issue_id': 1599, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '720b493a608f478a', 'authors': ['Shaoyuan Xie', 'Lingdong Kong', 'Yuhao Dong', 'Chonghao Sima', 'Wenwei Zhang', 'Qi Alfred Chen', 'Ziwei Liu', 'Liang Pan'], 'affiliations': ['National University of Singapore', 'S-Lab, Nanyang Technological University', 'Shanghai AI Laboratory', 'The University of Hong Kong', 'University of California, Irvine'], 'pdf_title_img': 'assets/pdf/title_img/2501.04003.jpg', 'data': {'categories': ['#security', '#interpretability', '#dataset', '#multimodal', '#reasoning', '#benchmark', '#cv'], 'emoji': '🚗', 'ru': {'title': 'Проверка надёжности VLM для безопасного автономного вождения', 'desc': 'Статья представляет DriveBench - набор данных для оценки надёжности мультимодальных языковых моделей (VLM) в контексте автономного вождения. Исследование выявило, что VLM часто генерируют правдоподобные ответы на основе общих знаний, а не визуальной информации, что опасно в критически важных сценариях. Авторы предлагают усовершенствованные метрики оценки, ориентированные на надёжную визуальную привязку и мультимодальное понимание. Также отмечается потенциал использования осведомленности VLM о искажениях для повышения их надёжности.'}, 'en': {'title': 'Enhancing Trust in Vision-Language Models for Safer Autonomous Driving', 'desc': 'This paper discusses the limitations of Vision-Language Models (VLMs) in the context of autonomous driving, particularly their ability to provide reliable and interpretable driving decisions. The authors introduce DriveBench, a comprehensive benchmark dataset that tests VLM performance across various conditions, including clean and corrupted inputs. Their research shows that VLMs often rely on general knowledge rather than true visual understanding, especially when visual data is compromised. 
To improve VLM reliability, the paper suggests new evaluation metrics focused on visual grounding and multi-modal reasoning, aiming to enhance the safety of autonomous driving systems.'}, 'zh': {'title': '提升自动驾驶决策的可靠性与可解释性', 'desc': '本文介绍了DriveBench,一个用于评估视觉语言模型(VLMs)在自动驾驶中可靠性的基准数据集。该数据集包含19200帧图像和20498个问答对,涵盖了多种驾驶任务和输入类型。研究发现,VLMs在处理受损或缺失的视觉输入时,往往依赖于一般知识而非真实的视觉信息,导致安全隐患。为了解决这些问题,本文提出了改进的评估指标,强调视觉基础和多模态理解的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.05122', 'title': 'Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.05122', 'abstract': 'Most Large Vision-Language Models (LVLMs) to date are trained predominantly on English data, which makes them struggle to understand non-English input and fail to generate output in the desired target language. Existing efforts mitigate these issues by adding multilingual training data, but do so in a largely ad-hoc manner, lacking insight into how different training mixes tip the scale for different groups of languages. In this work, we present a comprehensive investigation into the training strategies for massively multilingual LVLMs. First, we conduct a series of multi-stage experiments spanning 13 downstream vision-language tasks and 43 languages, systematically examining: (1) the number of training languages that can be included without degrading English performance and (2) optimal language distributions of pre-training as well as (3) instruction-tuning data. Further, we (4) investigate how to improve multilingual text-in-image understanding, and introduce a new benchmark for the task. Surprisingly, our analysis reveals that one can (i) include as many as 100 training languages simultaneously (ii) with as little as 25-50\\% of non-English data, to greatly improve multilingual performance while retaining strong English performance. We further find that (iii) including non-English OCR data in pre-training and instruction-tuning is paramount for improving multilingual text-in-image understanding. Finally, we put all our findings together and train Centurio, a 100-language LVLM, offering state-of-the-art performance in an evaluation covering 14 tasks and 56 languages.', 'score': 13, 'issue_id': 1604, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '92d74f3bbeb4a400', 'authors': ['Gregor Geigle', 'Florian Schneider', 'Carolin Holtermann', 'Chris Biemann', 'Radu Timofte', 'Anne Lauscher', 'Goran Glavaš'], 'affiliations': ['Data Science Group, University of Hamburg', 'Language Technology Group', 'WüNLP, Computer Vision Lab, CAIDAS, University of Würzburg'], 'pdf_title_img': 'assets/pdf/title_img/2501.05122.jpg', 'data': {'categories': ['#machine_translation', '#multilingual', '#benchmark', '#low_resource'], 'emoji': '🌍', 'ru': {'title': 'Centurio: Прорыв в многоязычном визуально-языковом ИИ', 'desc': 'В статье описывается исследование стратегий обучения многоязычных крупномасштабных визуально-языковых моделей (LVLMs). Авторы проводят эксперименты на 13 задачах и 43 языках, изучая оптимальное распределение языков в данных для предобучения и инструктивной настройки. Они обнаруживают, что можно включить до 100 языков обучения, используя всего 25-50% неанглийских данных, значительно улучшив многоязычную производительность при сохранении высокой эффективности на английском. 
На основе полученных результатов авторы обучают Centurio - 100-язычную LVLM, демонстрирующую передовые результаты на 14 задачах и 56 языках.'}, 'en': {'title': 'Unlocking Multilingual Mastery in Vision-Language Models', 'desc': 'This paper investigates how to effectively train Large Vision-Language Models (LVLMs) on multiple languages, particularly focusing on improving their performance in non-English languages. The authors conduct experiments across various tasks and languages to determine the best strategies for including multilingual data without harming English performance. They discover that including up to 100 languages and using a smaller proportion of non-English data can enhance multilingual capabilities while maintaining strong English results. Additionally, they emphasize the importance of incorporating non-English OCR data to boost understanding of text within images, culminating in the development of Centurio, a 100-language LVLM with state-of-the-art performance.'}, 'zh': {'title': '提升多语言理解,Centurio引领新潮流', 'desc': '本文研究了大规模多语言视觉-语言模型(LVLM)的训练策略,特别关注如何提高模型对非英语输入的理解和输出能力。我们通过多阶段实验,分析了包含多种语言的训练数据对英语性能的影响,并探索了最佳的语言分布策略。研究发现,最多可以同时包含100种语言的训练数据,并且只需25-50%的非英语数据即可显著提升多语言性能。最后,我们结合所有发现,训练了Centurio,一个支持100种语言的LVLM,在14个任务和56种语言的评估中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.03489', 'title': 'Entropy-Guided Attention for Private LLMs', 'url': 'https://huggingface.co/papers/2501.03489', 'abstract': "The pervasiveness of proprietary language models has raised critical privacy concerns, necessitating advancements in private inference (PI), where computations are performed directly on encrypted data without revealing users' sensitive information. While PI offers a promising solution, its practical deployment is hindered by substantial communication and latency overheads, primarily stemming from nonlinear operations. To address this, we introduce an information-theoretic framework to characterize the role of nonlinearities in decoder-only language models, laying a principled foundation for optimizing transformer-architectures tailored to the demands of PI. By leveraging Shannon's entropy as a quantitative measure, we uncover the previously unexplored dual significance of nonlinearities: beyond ensuring training stability, they are crucial for maintaining attention head diversity. Specifically, we find that their removal triggers two critical failure modes: {\\em entropy collapse} in deeper layers that destabilizes training, and {\\em entropic overload} in earlier layers that leads to under-utilization of Multi-Head Attention's (MHA) representational capacity. We propose an entropy-guided attention mechanism paired with a novel entropy regularization technique to mitigate entropic overload. Additionally, we explore PI-friendly alternatives to layer normalization for preventing entropy collapse and stabilizing the training of LLMs with reduced-nonlinearities. Our study bridges the gap between information theory and architectural design, establishing entropy dynamics as a principled guide for developing efficient PI architectures. 
The code and implementation are available at https://github.com/Nandan91/entropy-guided-attention-llm.", 'score': 11, 'issue_id': 1597, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '18abcfb3fe1b209b', 'authors': ['Nandan Kumar Jha', 'Brandon Reagen'], 'affiliations': ['New York University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03489.jpg', 'data': {'categories': ['#security', '#inference', '#optimization', '#architecture', '#training', '#open_source'], 'emoji': '🔐', 'ru': {'title': 'Энтропия как ключ к конфиденциальным языковым моделям', 'desc': 'Статья рассматривает проблему конфиденциальности при использовании языковых моделей и предлагает решение через частное вычисление (PI). Авторы представляют информационно-теоретическую основу для оптимизации архитектур трансформеров под задачи PI, используя энтропию Шеннона как количественную меру. Исследование выявляет двойную роль нелинейностей в моделях: обеспечение стабильности обучения и поддержание разнообразия в механизме внимания. Предложен энтропийно-управляемый механизм внимания и новая техника регуляризации энтропии для улучшения эффективности PI-архитектур.'}, 'en': {'title': 'Optimizing Language Models for Privacy with Entropy Dynamics', 'desc': 'This paper addresses privacy concerns related to proprietary language models by focusing on private inference (PI), which allows computations on encrypted data. The authors introduce an information-theoretic framework to analyze the impact of nonlinearities in decoder-only language models, which are essential for optimizing transformer architectures for PI. They identify two critical issues caused by the removal of nonlinearities: entropy collapse in deeper layers and entropic overload in earlier layers, both of which affect training stability and attention mechanisms. To resolve these issues, the paper proposes an entropy-guided attention mechanism and explores alternatives to layer normalization, aiming to enhance the efficiency of PI architectures while maintaining model performance.'}, 'zh': {'title': '优化私密推理架构的熵动态', 'desc': '本论文探讨了在加密数据上进行私密推理(PI)时,非线性操作对解码器语言模型的影响。我们提出了一种信息论框架,帮助优化适合PI需求的变换器架构。研究发现,非线性不仅确保了训练的稳定性,还对注意力头的多样性至关重要。为了解决熵崩溃和熵过载问题,我们提出了一种基于熵的注意力机制和新的熵正则化技术。'}}}, {'id': 'https://huggingface.co/papers/2501.05040', 'title': 'SWE-Fixer: Training Open-Source LLMs for Effective and Efficient GitHub Issue Resolution', 'url': 'https://huggingface.co/papers/2501.05040', 'abstract': 'Large Language Models (LLMs) have demonstrated remarkable proficiency across a variety of complex tasks. One significant application of LLMs is in tackling software engineering challenges, particularly in resolving real-world tasks on GitHub by fixing code based on the issues reported by the users. However, many current approaches rely on proprietary LLMs, which limits reproducibility, accessibility, and transparency. The critical components of LLMs for addressing software engineering issues and how their capabilities can be effectively enhanced remain unclear. To address these challenges, we introduce SWE-Fixer, a novel open-source LLM designed to effectively and efficiently resolve GitHub issues. SWE-Fixer comprises two essential modules: a code file retrieval module and a code editing module. The retrieval module employs BM25 along with a lightweight LLM model to achieve coarse-to-fine file retrieval. Subsequently, the code editing module utilizes the other LLM model to generate patches for the identified files. 
Then, to mitigate the lack of publicly available datasets, we compile an extensive dataset that includes 110K GitHub issues along with their corresponding patches, and train the two modules of SWE-Fixer separately. We assess our approach on the SWE-Bench Lite and Verified benchmarks, achieving state-of-the-art performance among open-source models with scores of 23.3% and 30.2%, respectively. These outcomes highlight the efficacy of our approach. We will make our model, dataset, and code publicly available at https://github.com/InternLM/SWE-Fixer.', 'score': 8, 'issue_id': 1608, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '54d8f8a0fe5436c6', 'authors': ['Chengxing Xie', 'Bowen Li', 'Chang Gao', 'He Du', 'Wai Lam', 'Difan Zou', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong', 'The University of Hong Kong', 'Xidian University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05040.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#architecture', '#benchmark', '#training', '#science'], 'emoji': '🛠️', 'ru': {'title': 'Открытая языковая модель для эффективного решения проблем на GitHub', 'desc': 'SWE-Fixer - это новая модель с открытым исходным кодом для решения проблем на GitHub. Она состоит из модуля поиска файлов кода и модуля редактирования кода, использующих легковесные языковые модели. Авторы создали обширный датасет из 110 тысяч GitHub-issues с патчами для обучения модели. SWE-Fixer достигла лучших результатов среди моделей с открытым кодом на бенчмарках SWE-Bench Lite и Verified.'}, 'en': {'title': 'SWE-Fixer: Open-Source Solutions for GitHub Issues', 'desc': 'This paper presents SWE-Fixer, an open-source Large Language Model (LLM) specifically designed to address software engineering challenges on GitHub. It features two main components: a code file retrieval module that uses BM25 and a lightweight LLM for efficient file identification, and a code editing module that generates code patches using another LLM. The authors also created a comprehensive dataset of 110,000 GitHub issues and their corresponding patches to train the model effectively. SWE-Fixer achieves state-of-the-art performance on benchmark tests, demonstrating its potential to enhance accessibility and transparency in software engineering solutions.'}, 'zh': {'title': '开源LLM助力软件工程问题解决', 'desc': '大型语言模型(LLMs)在处理复杂任务方面表现出色,尤其是在软件工程领域。本文介绍了一种新颖的开源LLM,名为SWE-Fixer,旨在有效解决GitHub上的问题。SWE-Fixer包含两个主要模块:代码文件检索模块和代码编辑模块,前者使用BM25和轻量级LLM进行文件检索,后者生成代码补丁。通过构建包含11万个GitHub问题及其补丁的数据集,SWE-Fixer在开源模型中实现了领先的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.04377', 'title': 'On Computational Limits and Provably Efficient Criteria of Visual Autoregressive Models: A Fine-Grained Complexity Analysis', 'url': 'https://huggingface.co/papers/2501.04377', 'abstract': 'Recently, Visual Autoregressive (VAR) Models introduced a groundbreaking advancement in the field of image generation, offering a scalable approach through a coarse-to-fine "next-scale prediction" paradigm. However, the state-of-the-art algorithm of VAR models in [Tian, Jiang, Yuan, Peng and Wang, NeurIPS 2024] takes O(n^4) time, which is computationally inefficient. In this work, we analyze the computational limits and efficiency criteria of VAR Models through a fine-grained complexity lens. Our key contribution is identifying the conditions under which VAR computations can achieve sub-quadratic time complexity. 
Specifically, we establish a critical threshold for the norm of input matrices used in VAR attention mechanisms. Above this threshold, assuming the Strong Exponential Time Hypothesis (SETH) from fine-grained complexity theory, a sub-quartic time algorithm for VAR models is impossible. To substantiate our theoretical findings, we present efficient constructions leveraging low-rank approximations that align with the derived criteria. This work initiates the study of the computational efficiency of the VAR model from a theoretical perspective. Our technique will shed light on advancing scalable and efficient image generation in VAR frameworks.', 'score': 8, 'issue_id': 1597, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': 'be8a0f20db676680', 'authors': ['Yekun Ke', 'Xiaoyu Li', 'Yingyu Liang', 'Zhizhou Sha', 'Zhenmei Shi', 'Zhao Song'], 'affiliations': ['The Simons Institute for the Theory of Computing at UC Berkeley', 'The University of Hong Kong', 'Tsinghua University', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.04377.jpg', 'data': {'categories': ['#math', '#optimization', '#cv'], 'emoji': '🔬', 'ru': {'title': 'Преодоление вычислительных барьеров в VAR моделях', 'desc': 'Статья исследует вычислительные ограничения и критерии эффективности Визуальных Авторегрессионных (VAR) моделей с точки зрения тонкой теории сложности. Авторы определяют условия, при которых вычисления VAR могут достичь субквадратичной временной сложности. Они устанавливают критический порог для нормы входных матриц, используемых в механизмах внимания VAR, выше которого невозможен субкварцевый алгоритм времени для моделей VAR. Представлены эффективные конструкции, использующие аппроксимации низкого ранга, которые соответствуют выведенным критериям.'}, 'en': {'title': 'Unlocking Efficiency in Image Generation with VAR Models', 'desc': 'This paper explores the computational efficiency of Visual Autoregressive (VAR) Models, which are used for generating images. The authors identify that the current state-of-the-art VAR algorithm is computationally expensive, operating in O(n^4) time complexity. They establish conditions under which VAR computations can be optimized to achieve sub-quadratic time complexity, particularly focusing on the input matrix norms in the attention mechanisms. By applying low-rank approximations, the authors provide practical constructions that meet their theoretical criteria, paving the way for more efficient image generation techniques in VAR frameworks.'}, 'zh': {'title': '提升VAR模型的计算效率', 'desc': '最近,视觉自回归(VAR)模型在图像生成领域取得了突破性进展,采用粗到细的“下一个尺度预测”范式。然而,VAR模型的最新算法在计算上效率低下,时间复杂度为O(n^4)。本文通过细粒度复杂性分析,探讨了VAR模型的计算限制和效率标准。我们确定了VAR计算可以实现亚二次时间复杂度的条件,并提出了利用低秩近似的高效构造,以支持我们的理论发现。'}}}, {'id': 'https://huggingface.co/papers/2501.04828', 'title': 'Building Foundations for Natural Language Processing of Historical Turkish: Resources and Models', 'url': 'https://huggingface.co/papers/2501.04828', 'abstract': 'This paper introduces foundational resources and models for natural language processing (NLP) of historical Turkish, a domain that has remained underexplored in computational linguistics. We present the first named entity recognition (NER) dataset, HisTR and the first Universal Dependencies treebank, OTA-BOUN for a historical form of the Turkish language along with transformer-based models trained using these datasets for named entity recognition, dependency parsing, and part-of-speech tagging tasks. 
Additionally, we introduce Ottoman Text Corpus (OTC), a clean corpus of transliterated historical Turkish texts that spans a wide range of historical periods. Our experimental results show significant improvements in the computational analysis of historical Turkish, achieving promising results in tasks that require understanding of historical linguistic structures. They also highlight existing challenges, such as domain adaptation and language variations across time periods. All of the presented resources and models are made available at https://huggingface.co/bucolin to serve as a benchmark for future progress in historical Turkish NLP.', 'score': 6, 'issue_id': 1603, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '40fe69c40d907fc4', 'authors': ['Şaziye Betül Özateş', 'Tarık Emre Tıraş', 'Ece Elif Adak', 'Berat Doğan', 'Fatih Burak Karagöz', 'Efe Eren Genç', 'Esma F. Bilgin Taşdemir'], 'affiliations': ['Bogaziçi University', 'Medeniyet University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04828.jpg', 'data': {'categories': ['#dataset', '#data', '#low_resource', '#science', '#multilingual', '#benchmark'], 'emoji': '🏛️', 'ru': {'title': 'Прорыв в NLP для исторического турецкого языка', 'desc': 'Статья представляет первые ресурсы и модели для обработки естественного языка (NLP) исторического турецкого языка. Авторы создали первый датасет для распознавания именованных сущностей (NER) HisTR и первый Universal Dependencies тривбанк OTA-BOUN для исторической формы турецкого языка. Также были разработаны трансформерные модели для задач NER, синтаксического анализа и морфологической разметки. Дополнительно представлен Османский текстовый корпус (OTC) - очищенный корпус транслитерированных исторических турецких текстов разных периодов.'}, 'en': {'title': 'Unlocking Historical Turkish: New Resources for NLP', 'desc': 'This paper provides essential resources and models for processing historical Turkish language using natural language processing (NLP) techniques. It introduces the first named entity recognition (NER) dataset, HisTR, and the first Universal Dependencies treebank, OTA-BOUN, specifically for historical Turkish. The authors also present the Ottoman Text Corpus (OTC), a comprehensive collection of transliterated texts from various historical periods. The results demonstrate advancements in analyzing historical Turkish, while also addressing challenges like domain adaptation and linguistic variations over time.'}, 'zh': {'title': '推动历史土耳其语NLP的进步', 'desc': '本文介绍了历史土耳其语自然语言处理(NLP)的基础资源和模型,这是一个在计算语言学中尚未深入研究的领域。我们首次发布了命名实体识别(NER)数据集HisTR和历史土耳其语的Universal Dependencies树库OTA-BOUN,并基于这些数据集训练了用于命名实体识别、依存句法分析和词性标注任务的变换器模型。此外,我们还推出了奥斯曼文本语料库(OTC),这是一个涵盖多个历史时期的清晰转写历史土耳其语文本的语料库。实验结果显示,在历史土耳其语的计算分析中取得了显著进展,但也突显了领域适应和语言随时间变化等挑战。'}}}, {'id': 'https://huggingface.co/papers/2501.11425', 'title': 'Agent-R: Training Language Model Agents to Reflect via Iterative Self-Training', 'url': 'https://huggingface.co/papers/2501.11425', 'abstract': "Large Language Models (LLMs) agents are increasingly pivotal for addressing complex tasks in interactive environments. Existing work mainly focuses on enhancing performance through behavior cloning from stronger experts, yet such approaches often falter in real-world applications, mainly due to the inability to recover from errors. However, step-level critique data is difficult and expensive to collect. 
Automating and dynamically constructing self-critique datasets is thus crucial to empowering models with intelligent agent capabilities. In this work, we propose an iterative self-training framework, Agent-R, that enables language Agent to Reflect on the fly. Unlike traditional methods that reward or penalize actions based on correctness, Agent-R leverages MCTS to construct training data that recover correct trajectories from erroneous ones. A key challenge of agent reflection lies in the necessity for timely revision rather than waiting until the end of a rollout. To address this, we introduce a model-guided critique construction mechanism: the actor model identifies the first error step (within its current capability) in a failed trajectory. Starting from it, we splice it with the adjacent correct path, which shares the same parent node in the tree. This strategy enables the model to learn reflection based on its current policy, therefore yielding better learning efficiency. To further explore the scalability of this self-improvement paradigm, we investigate iterative refinement of both error correction capabilities and dataset construction. Our findings demonstrate that Agent-R continuously improves the model's ability to recover from errors and enables timely error correction. Experiments on three interactive environments show that Agent-R effectively equips agents to correct erroneous actions while avoiding loops, achieving superior performance compared to baseline methods (+5.59%).", 'score': 54, 'issue_id': 1798, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': '96d073b4606b0493', 'authors': ['Siyu Yuan', 'Zehui Chen', 'Zhiheng Xi', 'Junjie Ye', 'Zhengyin Du', 'Jiecao Chen'], 'affiliations': ['ByteDance', 'Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11425.jpg', 'data': {'categories': ['#reasoning', '#optimization', '#agents', '#training', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Самообучающиеся ИИ-агенты: исправление ошибок на лету', 'desc': 'Статья представляет новый метод обучения языковых агентов на основе искусственного интеллекта под названием Agent-R. Этот подход использует самообучение и самокритику для улучшения способности модели исправлять ошибки в процессе выполнения задач. Agent-R применяет метод Монте-Карло для построения дерева поиска (MCTS) для создания обучающих данных, которые помогают агенту восстанавливаться после ошибочных действий. Эксперименты показывают, что Agent-R значительно повышает производительность агентов в интерактивных средах по сравнению с базовыми методами.'}, 'en': {'title': 'Empowering Language Agents with Real-Time Self-Critique', 'desc': "This paper introduces Agent-R, an iterative self-training framework designed to enhance the performance of Large Language Models (LLMs) in interactive environments. Unlike traditional methods that rely on static feedback, Agent-R utilizes Monte Carlo Tree Search (MCTS) to dynamically create training data that helps models recover from mistakes in real-time. The framework focuses on timely error correction by identifying the first error in a trajectory and splicing it with a correct path, allowing the model to learn from its current policy. 
Experimental results show that Agent-R significantly improves the model's error recovery capabilities and overall performance, outperforming baseline methods by 5.59%."}, 'zh': {'title': 'Agent-R:实时反思,提升学习效率', 'desc': '大型语言模型(LLMs)在复杂任务的交互环境中变得越来越重要。现有研究主要通过模仿更强专家的行为来提升性能,但这种方法在实际应用中常常失败,主要是因为无法从错误中恢复。为了解决这个问题,我们提出了一种迭代自我训练框架Agent-R,使语言代理能够实时反思。Agent-R通过构建训练数据来纠正错误轨迹,从而提高模型的学习效率和错误恢复能力。'}}}, {'id': 'https://huggingface.co/papers/2501.11873', 'title': 'Demons in the Detail: On Implementing Load Balancing Loss for Training Specialized Mixture-of-Expert Models', 'url': 'https://huggingface.co/papers/2501.11873', 'abstract': 'This paper revisits the implementation of Load-balancing Loss (LBL) when training Mixture-of-Experts (MoEs) models. Specifically, LBL for MoEs is defined as N_E sum_{i=1}^{N_E} f_i p_i, where N_E is the total number of experts, f_i represents the frequency of expert i being selected, and p_i denotes the average gating score of the expert i. Existing MoE training frameworks usually employ the parallel training strategy so that f_i and the LBL are calculated within a micro-batch and then averaged across parallel groups. In essence, a micro-batch for training billion-scale LLMs normally contains very few sequences. So, the micro-batch LBL is almost at the sequence level, and the router is pushed to distribute the token evenly within each sequence. Under this strict constraint, even tokens from a domain-specific sequence (e.g., code) are uniformly routed to all experts, thereby inhibiting expert specialization. In this work, we propose calculating LBL using a global-batch to loose this constraint. Because a global-batch contains much more diverse sequences than a micro-batch, which will encourage load balance at the corpus level. Specifically, we introduce an extra communication step to synchronize f_i across micro-batches and then use it to calculate the LBL. Through experiments on training MoEs-based LLMs (up to 42.8B total parameters and 400B tokens), we surprisingly find that the global-batch LBL strategy yields excellent performance gains in both pre-training perplexity and downstream tasks. Our analysis reveals that the global-batch LBL also greatly improves the domain specialization of MoE experts.', 'score': 48, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '370d057fec504963', 'authors': ['Zihan Qiu', 'Zeyu Huang', 'Bo Zheng', 'Kaiyue Wen', 'Zekun Wang', 'Rui Men', 'Ivan Titov', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Group', 'Stanford University', 'University of Edinburgh'], 'pdf_title_img': 'assets/pdf/title_img/2501.11873.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training'], 'emoji': '⚖️', 'ru': {'title': 'Глобальный подход к балансировке нагрузки экспертов в MoE моделях', 'desc': 'Статья предлагает новый подход к реализации функции потерь балансировки нагрузки (LBL) при обучении моделей Mixture-of-Experts (MoE). Авторы предлагают вычислять LBL на уровне глобального батча, а не микро-батча, что позволяет ослабить ограничения на распределение токенов между экспертами. Эксперименты на крупномасштабных языковых моделях показывают, что этот метод улучшает перплексию при предобучении и результаты на задачах downstream. 
Анализ также демонстрирует улучшение специализации экспертов по доменам.'}, 'en': {'title': 'Enhancing Expert Specialization with Global-Batch Load-Balancing', 'desc': 'This paper focuses on improving the Load-balancing Loss (LBL) in training Mixture-of-Experts (MoEs) models. The authors highlight that traditional methods use micro-batches, which limit the diversity of sequences and hinder expert specialization. They propose a new approach that utilizes global-batches, allowing for a broader range of sequences and better load balancing across the entire dataset. Experimental results show that this global-batch LBL method significantly enhances model performance and expert specialization in large language models.'}, 'zh': {'title': '全局批次提升混合专家模型的负载均衡与专业化', 'desc': '本文重新审视了在训练混合专家模型(MoEs)时的负载均衡损失(LBL)实现。我们提出使用全局批次来计算LBL,以打破微批次的严格约束,从而在语料库层面上促进负载均衡。通过在训练中引入额外的通信步骤来同步专家选择频率,实验结果显示全局批次LBL策略在预训练困惑度和下游任务中均显著提升了性能。我们的分析表明,全局批次LBL还大大改善了MoE专家的领域专业化。'}}}, {'id': 'https://huggingface.co/papers/2501.12380', 'title': 'MMVU: Measuring Expert-Level Multi-Discipline Video Understanding', 'url': 'https://huggingface.co/papers/2501.12380', 'abstract': 'We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark for evaluating foundation models in video understanding. MMVU includes 3,000 expert-annotated questions spanning 27 subjects across four core disciplines: Science, Healthcare, Humanities & Social Sciences, and Engineering. Compared to prior benchmarks, MMVU features three key advancements. First, it challenges models to apply domain-specific knowledge and perform expert-level reasoning to analyze specialized-domain videos, moving beyond the basic visual perception typically assessed in current video benchmarks. Second, each example is annotated by human experts from scratch. We implement strict data quality controls to ensure the high quality of the dataset. Finally, each example is enriched with expert-annotated reasoning rationals and relevant domain knowledge, facilitating in-depth analysis. We conduct an extensive evaluation of 32 frontier multimodal foundation models on MMVU. The latest System-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest performance among the tested models. However, they still fall short of matching human expertise. Through in-depth error analyses and case studies, we offer actionable insights for future advancements in expert-level, knowledge-intensive video understanding for specialized domains.', 'score': 48, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'dcb04aaca349cc32', 'authors': ['Yilun Zhao', 'Lujing Xie', 'Haowei Zhang', 'Guo Gan', 'Yitao Long', 'Zhiyuan Hu', 'Tongyan Hu', 'Weiyuan Chen', 'Chuhan Li', 'Junyang Song', 'Zhijian Xu', 'Chengye Wang', 'Weifeng Pan', 'Ziyao Shangguan', 'Xiangru Tang', 'Zhenwen Liang', 'Yixin Liu', 'Chen Zhao', 'Arman Cohan'], 'affiliations': ['Yale NLP'], 'pdf_title_img': 'assets/pdf/title_img/2501.12380.jpg', 'data': {'categories': ['#multimodal', '#science', '#benchmark', '#video', '#healthcare', '#reasoning'], 'emoji': '🎓', 'ru': {'title': 'Новый рубеж в понимании видео: от базового восприятия к экспертному анализу', 'desc': 'Статья представляет MMVU - многодисциплинарный экспертный бенчмарк для оценки фундаментальных моделей в понимании видео. MMVU включает 3000 вопросов по 27 предметам в четырех основных дисциплинах, требующих применения специализированных знаний и экспертного анализа. 
Бенчмарк отличается высоким качеством данных, аннотированных экспертами, и включает обоснования и релевантные знания для каждого примера. Оценка 32 мультимодальных моделей на MMVU показала, что даже лучшие модели пока не достигают уровня человека-эксперта в этой задаче.'}, 'en': {'title': 'MMVU: Elevating Video Understanding to Expert Levels', 'desc': 'The paper presents MMVU, a new benchmark designed to evaluate foundation models specifically in video understanding across various expert domains. It includes 3,000 questions that require advanced reasoning and domain-specific knowledge, moving beyond simple visual recognition tasks. Each question is meticulously annotated by human experts, ensuring high data quality and providing reasoning rationales to enhance analysis. The evaluation of 32 advanced multimodal models reveals that while some perform well, they still do not reach the level of human expertise, highlighting areas for future improvement in this field.'}, 'zh': {'title': 'MMVU:视频理解的新标准', 'desc': '我们介绍了MMVU,这是一个全面的专家级多学科基准,用于评估基础模型在视频理解方面的表现。MMVU包含3000个专家注释的问题,涵盖科学、医疗、人文学科与社会科学和工程四个核心学科。与之前的基准相比,MMVU在三个关键方面有所改进,包括要求模型应用领域特定知识进行专家级推理,确保数据集的高质量,以及为每个示例提供专家注释的推理依据和相关领域知识。我们对32个前沿多模态基础模型在MMVU上的表现进行了广泛评估,发现最新的系统2能力模型o1和Gemini 2.0 Flash Thinking在测试模型中表现最佳,但仍未能达到人类专家的水平。'}}}, {'id': 'https://huggingface.co/papers/2501.12224', 'title': 'TokenVerse: Versatile Multi-concept Personalization in Token Modulation Space', 'url': 'https://huggingface.co/papers/2501.12224', 'abstract': "We present TokenVerse -- a method for multi-concept personalization, leveraging a pre-trained text-to-image diffusion model. Our framework can disentangle complex visual elements and attributes from as little as a single image, while enabling seamless plug-and-play generation of combinations of concepts extracted from multiple images. As opposed to existing works, TokenVerse can handle multiple images with multiple concepts each, and supports a wide-range of concepts, including objects, accessories, materials, pose, and lighting. Our work exploits a DiT-based text-to-image model, in which the input text affects the generation through both attention and modulation (shift and scale). We observe that the modulation space is semantic and enables localized control over complex concepts. Building on this insight, we devise an optimization-based framework that takes as input an image and a text description, and finds for each word a distinct direction in the modulation space. These directions can then be used to generate new images that combine the learned concepts in a desired configuration. We demonstrate the effectiveness of TokenVerse in challenging personalization settings, and showcase its advantages over existing methods. 
project's webpage in https://token-verse.github.io/", 'score': 31, 'issue_id': 1804, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '20dcd865e2d7bc5c', 'authors': ['Daniel Garibi', 'Shahar Yadin', 'Roni Paiss', 'Omer Tov', 'Shiran Zada', 'Ariel Ephrat', 'Tomer Michaeli', 'Inbar Mosseri', 'Tali Dekel'], 'affiliations': ['Google DeepMind', 'Technion', 'Tel Aviv University', 'Weizmann Institute'], 'pdf_title_img': 'assets/pdf/title_img/2501.12224.jpg', 'data': {'categories': ['#multimodal', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'Персонализация изображений с помощью семантического пространства модуляции', 'desc': 'TokenVerse - это метод многоконцептуальной персонализации, использующий предобученную модель диффузии текста в изображение. Он позволяет выделять сложные визуальные элементы и атрибуты даже из одного изображения, обеспечивая при этом возможность комбинировать концепты из нескольких изображений. TokenVerse использует модель DiT, где входной текст влияет на генерацию через внимание и модуляцию. Метод оптимизирует направления в пространстве модуляции для каждого слова, что позволяет генерировать новые изображения с желаемой комбинацией выученных концептов.'}, 'en': {'title': 'TokenVerse: Mastering Multi-Concept Image Personalization', 'desc': 'TokenVerse is a novel approach for personalizing images by using a pre-trained text-to-image diffusion model. It can separate and manipulate various visual elements from just one image, allowing for the creation of new images that combine concepts from multiple sources. Unlike previous methods, TokenVerse effectively manages multiple images with different concepts, covering a wide array of attributes such as objects, poses, and lighting. The framework utilizes a DiT-based model that enables precise control over image generation through semantic modulation, making it a powerful tool for complex personalization tasks.'}, 'zh': {'title': 'TokenVerse:多概念个性化的新方法', 'desc': 'TokenVerse是一种多概念个性化的方法,利用预训练的文本到图像扩散模型。该框架能够从单张图像中解耦复杂的视觉元素和属性,并支持从多张图像中提取概念的无缝组合生成。与现有方法不同,TokenVerse可以处理每张图像中包含多个概念的情况,并支持广泛的概念类型,包括物体、配件、材料、姿势和光照。我们的研究利用基于DiT的文本到图像模型,通过注意力和调制(偏移和缩放)来影响生成过程,从而实现对复杂概念的局部控制。'}}}, {'id': 'https://huggingface.co/papers/2501.12326', 'title': 'UI-TARS: Pioneering Automated GUI Interaction with Native Agents', 'url': 'https://huggingface.co/papers/2501.12326', 'abstract': 'This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks. Experiments demonstrate its superior performance: UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating perception, grounding, and GUI task execution. Notably, in the OSWorld benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, UI-TARS achieves 46.6, surpassing GPT-4o (34.5). 
UI-TARS incorporates several key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of GUI screenshots for context-aware understanding of UI elements and precise captioning; (2) Unified Action Modeling, which standardizes actions into a unified space across platforms and achieves precise grounding and interaction through large-scale action traces; (3) System-2 Reasoning, which incorporates deliberate reasoning into multi-step decision making, involving multiple reasoning patterns such as task decomposition, reflection thinking, milestone recognition, etc. (4) Iterative Training with Reflective Online Traces, which addresses the data bottleneck by automatically collecting, filtering, and reflectively refining new interaction traces on hundreds of virtual machines. Through iterative training and reflection tuning, UI-TARS continuously learns from its mistakes and adapts to unforeseen situations with minimal human intervention. We also analyze the evolution path of GUI agents to guide the further development of this domain.', 'score': 27, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '1f98d8f49b073983', 'authors': ['Yujia Qin', 'Yining Ye', 'Junjie Fang', 'Haoming Wang', 'Shihao Liang', 'Shizuo Tian', 'Junda Zhang', 'Jiahao Li', 'Yunxin Li', 'Shijue Huang', 'Wanjun Zhong', 'Kuanye Li', 'Jiale Yang', 'Yu Miao', 'Woyu Lin', 'Longxiang Liu', 'Xu Jiang', 'Qianli Ma', 'Jingyu Li', 'Xiaojun Xiao', 'Kai Cai', 'Chuang Li', 'Yaowei Zheng', 'Chaolin Jin', 'Chen Li', 'Xiao Zhou', 'Minchao Wang', 'Haoli Chen', 'Zhaojian Li', 'Haihua Yang', 'Haifeng Liu', 'Feng Lin', 'Tao Peng', 'Xin Liu', 'Guang Shi'], 'affiliations': ['ByteDance Seed', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12326.jpg', 'data': {'categories': ['#optimization', '#dataset', '#agents', '#training', '#reasoning'], 'emoji': '🖥️', 'ru': {'title': 'UI-TARS: Революция в мире GUI-агентов', 'desc': 'Статья представляет UI-TARS - модель графического агента, которая воспринимает только скриншоты и выполняет операции, подобные человеческим. UI-TARS превосходит существующие фреймворки агентов, достигая лучших результатов в более чем 10 бенчмарках для GUI-агентов. Модель включает в себя несколько ключевых инноваций: улучшенное восприятие, унифицированное моделирование действий, рассуждение по системе-2 и итеративное обучение с рефлексивными онлайн-трассами. UI-TARS постоянно учится на своих ошибках и адаптируется к непредвиденным ситуациям с минимальным вмешательством человека.'}, 'en': {'title': 'Revolutionizing GUI Interaction with UI-TARS: The End-to-End Agent Model', 'desc': 'UI-TARS is a novel GUI agent model that processes screenshots to perform tasks like a human would, using keyboard and mouse actions. Unlike existing models that rely on complex commercial frameworks and pre-defined prompts, UI-TARS operates end-to-end and shows superior performance in various benchmarks. It achieves state-of-the-art results in GUI task execution by utilizing enhanced perception, unified action modeling, and system-2 reasoning for better decision-making. 
Additionally, its iterative training approach allows it to learn from past interactions, improving its adaptability with minimal human input.'}, 'zh': {'title': 'UI-TARS:革新图形用户界面代理的全新模型', 'desc': '本文介绍了UI-TARS,这是一种原生的图形用户界面(GUI)代理模型,能够仅通过屏幕截图进行人类般的交互。与依赖复杂商业模型的现有代理框架不同,UI-TARS是一个端到端的模型,在多个GUI代理基准测试中表现优异,尤其在感知、定位和任务执行方面。UI-TARS通过增强感知、统一动作建模、系统-2推理和反思在线追踪等创新,显著提高了其性能。通过迭代训练和反思调优,UI-TARS能够不断学习并适应新的情况,减少对人类干预的需求。'}}}, {'id': 'https://huggingface.co/papers/2501.12368', 'title': 'InternLM-XComposer2.5-Reward: A Simple Yet Effective Multi-Modal Reward Model', 'url': 'https://huggingface.co/papers/2501.12368', 'abstract': 'Despite the promising performance of Large Vision Language Models (LVLMs) in visual understanding, they occasionally generate incorrect outputs. While reward models (RMs) with reinforcement learning or test-time scaling offer the potential for improving generation quality, a critical gap remains: publicly available multi-modal RMs for LVLMs are scarce, and the implementation details of proprietary models are often unclear. We bridge this gap with InternLM-XComposer2.5-Reward (IXC-2.5-Reward), a simple yet effective multi-modal reward model that aligns LVLMs with human preferences. To ensure the robustness and versatility of IXC-2.5-Reward, we set up a high-quality multi-modal preference corpus spanning text, image, and video inputs across diverse domains, such as instruction following, general understanding, text-rich documents, mathematical reasoning, and video understanding. IXC-2.5-Reward achieves excellent results on the latest multi-modal reward model benchmark and shows competitive performance on text-only reward model benchmarks. We further demonstrate three key applications of IXC-2.5-Reward: (1) Providing a supervisory signal for RL training. We integrate IXC-2.5-Reward with Proximal Policy Optimization (PPO) yields IXC-2.5-Chat, which shows consistent improvements in instruction following and multi-modal open-ended dialogue; (2) Selecting the best response from candidate responses for test-time scaling; and (3) Filtering outlier or noisy samples from existing image and video instruction tuning training data. To ensure reproducibility and facilitate further research, we have open-sourced all model weights and training recipes at https://github.com/InternLM/InternLM-XComposer', 'score': 20, 'issue_id': 1804, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'd51d195276c2215d', 'authors': ['Yuhang Zang', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Cao', 'Ziyu Liu', 'Shengyuan Ding', 'Shenxi Wu', 'Yubo Ma', 'Haodong Duan', 'Wenwei Zhang', 'Kai Chen', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Fudan University', 'Nanjing University', 'Nanyang Technological University', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.12368.jpg', 'data': {'categories': ['#rlhf', '#alignment', '#open_source', '#benchmark', '#training', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'Улучшение LVLM с помощью мультимодальной модели вознаграждения', 'desc': 'В статье представлена мультимодальная модель вознаграждения InternLM-XComposer2.5-Reward (IXC-2.5-Reward) для улучшения качества генерации больших визуально-языковых моделей (LVLM). Модель обучена на высококачественном наборе данных, охватывающем различные домены и типы входных данных. 
IXC-2.5-Reward показывает отличные результаты на бенчмарках мультимодальных и текстовых моделей вознаграждения. Авторы демонстрируют три ключевых применения модели: обучение с подкреплением, выбор лучшего ответа из кандидатов и фильтрация шумных данных.'}, 'en': {'title': 'Bridging the Gap in Multi-Modal Reward Models for LVLMs', 'desc': 'This paper introduces InternLM-XComposer2.5-Reward (IXC-2.5-Reward), a multi-modal reward model designed to enhance the performance of Large Vision Language Models (LVLMs) by aligning them with human preferences. The authors address the lack of publicly available multi-modal reward models by creating a comprehensive preference corpus that includes text, images, and videos across various domains. IXC-2.5-Reward demonstrates strong performance on multi-modal benchmarks and effectively supports reinforcement learning training, response selection, and data filtering. The model and its training methods are open-sourced to promote reproducibility and further research in the field.'}, 'zh': {'title': '提升视觉语言模型生成质量的多模态奖励模型', 'desc': '本文介绍了一种新的多模态奖励模型,名为InternLM-XComposer2.5-Reward(IXC-2.5-Reward),旨在提高大型视觉语言模型(LVLMs)的生成质量。该模型通过对文本、图像和视频等多种输入形式进行高质量的偏好学习,来对齐LVLMs与人类的偏好。IXC-2.5-Reward在最新的多模态奖励模型基准测试中表现优异,并在文本奖励模型基准测试中也展现了竞争力。我们还展示了IXC-2.5-Reward的三种关键应用,包括强化学习训练的监督信号、候选响应的最佳选择以及过滤噪声样本。'}}}, {'id': 'https://huggingface.co/papers/2501.11733', 'title': 'Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks', 'url': 'https://huggingface.co/papers/2501.11733', 'abstract': 'Smartphones have become indispensable in modern life, yet navigating complex tasks on mobile devices often remains frustrating. Recent advancements in large multimodal model (LMM)-based mobile agents have demonstrated the ability to perceive and act in mobile environments. However, current approaches face significant limitations: they fall short in addressing real-world human needs, struggle with reasoning-intensive and long-horizon tasks, and lack mechanisms to learn and improve from prior experiences. To overcome these challenges, we introduce Mobile-Agent-E, a hierarchical multi-agent framework capable of self-evolution through past experience. By hierarchical, we mean an explicit separation of high-level planning and low-level action execution. The framework comprises a Manager, responsible for devising overall plans by breaking down complex tasks into subgoals, and four subordinate agents--Perceptor, Operator, Action Reflector, and Notetaker--which handle fine-grained visual perception, immediate action execution, error verification, and information aggregation, respectively. Mobile-Agent-E also features a novel self-evolution module which maintains a persistent long-term memory comprising Tips and Shortcuts. Tips are general guidance and lessons learned from prior tasks on how to effectively interact with the environment. Shortcuts are reusable, executable sequences of atomic operations tailored for specific subroutines. The inclusion of Tips and Shortcuts facilitates continuous refinement in performance and efficiency. Alongside this framework, we introduce Mobile-Eval-E, a new benchmark featuring complex mobile tasks requiring long-horizon, multi-app interactions. Empirical results show that Mobile-Agent-E achieves a 22% absolute improvement over previous state-of-the-art approaches across three foundation model backbones. 
Project page: https://x-plug.github.io/MobileAgent.', 'score': 17, 'issue_id': 1798, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'a9cddb8786536def', 'authors': ['Zhenhailong Wang', 'Haiyang Xu', 'Junyang Wang', 'Xi Zhang', 'Ming Yan', 'Ji Zhang', 'Fei Huang', 'Heng Ji'], 'affiliations': ['Alibaba Group', 'University of Illinois Urbana-Champaign'], 'pdf_title_img': 'assets/pdf/title_img/2501.11733.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#optimization', '#agents', '#multimodal', '#long_context'], 'emoji': '📱', 'ru': {'title': 'Мобильный ИИ-ассистент с самообучением для сложных задач', 'desc': 'Статья представляет Mobile-Agent-E - иерархическую мультиагентную систему для выполнения сложных задач на мобильных устройствах. Система включает Менеджера для планирования и четыре подчиненных агента для восприятия, выполнения действий, проверки ошибок и агрегации информации. Ключевой особенностью является модуль самоэволюции с долговременной памятью, содержащей Подсказки и Ярлыки для улучшения производительности. Эмпирические результаты показывают значительное улучшение по сравнению с предыдущими подходами на новом бенчмарке Mobile-Eval-E.'}, 'en': {'title': 'Empowering Mobile Agents with Self-Evolution for Enhanced Task Performance', 'desc': 'This paper presents Mobile-Agent-E, a hierarchical multi-agent framework designed to enhance mobile task performance by learning from past experiences. The framework separates high-level planning from low-level execution, utilizing a Manager for task decomposition and four specialized agents for perception, action, error checking, and information management. A key feature is the self-evolution module, which incorporates a long-term memory of Tips and Shortcuts to improve task efficiency and effectiveness. Experimental results demonstrate that Mobile-Agent-E significantly outperforms existing methods, achieving a 22% improvement in complex mobile tasks.'}, 'zh': {'title': '智能手机任务执行的新突破', 'desc': '本论文介绍了一种名为Mobile-Agent-E的层次化多智能体框架,旨在提升智能手机上的任务执行能力。该框架通过将高层规划与低层执行明确分离,包含一个管理者和四个子代理,分别负责视觉感知、动作执行、错误验证和信息聚合。Mobile-Agent-E还引入了自我进化模块,利用长期记忆中的提示和捷径来不断优化性能。实验结果表明,该框架在复杂移动任务中相较于现有方法有22%的绝对提升。'}}}, {'id': 'https://huggingface.co/papers/2501.11223', 'title': 'Reasoning Language Models: A Blueprint', 'url': 'https://huggingface.co/papers/2501.11223', 'abstract': 'Reasoning language models (RLMs), also known as Large Reasoning Models (LRMs), such as OpenAI\'s o1 and o3, DeepSeek-V3, and Alibaba\'s QwQ, have redefined AI\'s problem-solving capabilities by extending large language models (LLMs) with advanced reasoning mechanisms. Yet, their high costs, proprietary nature, and complex architectures - uniquely combining Reinforcement Learning (RL), search heuristics, and LLMs - present accessibility and scalability challenges. To address these, we propose a comprehensive blueprint that organizes RLM components into a modular framework, based on a survey and analysis of all RLM works. This blueprint incorporates diverse reasoning structures (chains, trees, graphs, and nested forms), reasoning strategies (e.g., Monte Carlo Tree Search, Beam Search), RL concepts (policy, value models and others), and supervision schemes (Output-Based and Process-Based Supervision). We also provide detailed mathematical formulations and algorithmic specifications to simplify RLM implementation. 
By showing how schemes like LLaMA-Berry, QwQ, Journey Learning, and Graph of Thoughts fit as special cases, we demonstrate the blueprint\'s versatility and unifying potential. To illustrate its utility, we introduce x1, a modular implementation for rapid RLM prototyping and experimentation. Using x1 and a literature review, we provide key insights, such as multi-phase training for policy and value models, and the importance of familiar training distributions. Finally, we outline how RLMs can integrate with a broader LLM ecosystem, including tools and databases. Our work demystifies RLM construction, democratizes advanced reasoning capabilities, and fosters innovation, aiming to mitigate the gap between "rich AI" and "poor AI" by lowering barriers to RLM development and experimentation.', 'score': 16, 'issue_id': 1797, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'f554416ad9af3344', 'authors': ['Maciej Besta', 'Julia Barth', 'Eric Schreiber', 'Ales Kubicek', 'Afonso Catarino', 'Robert Gerstenberger', 'Piotr Nyczyk', 'Patrick Iff', 'Yueling Li', 'Sam Houliston', 'Tomasz Sternal', 'Marcin Copik', 'Grzegorz Kwaśniewski', 'Jürgen Müller', 'Łukasz Flis', 'Hannes Eberhard', 'Hubert Niewiadomski', 'Torsten Hoefler'], 'affiliations': ['BASF SE', 'Cledar', 'Cyfronet AGH', 'ETH Zurich'], 'pdf_title_img': 'assets/pdf/title_img/2501.11223.jpg', 'data': {'categories': ['#rl', '#math', '#training', '#survey', '#reasoning', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Демократизация искусственного интеллекта: модульный подход к созданию моделей рассуждений', 'desc': 'Статья представляет комплексный подход к созданию моделей рассуждений (RLM), объединяющих языковые модели с механизмами продвинутых рассуждений. Авторы предлагают модульную структуру, включающую различные стратегии рассуждений, концепции обучения с подкреплением и схемы обучения. Они демонстрируют применимость этой структуры на примере существующих моделей и представляют x1 - модульную реализацию для быстрого прототипирования RLM. Исследование направлено на демократизацию возможностей продвинутых рассуждений в ИИ и снижение барьеров для разработки RLM.'}, 'en': {'title': 'Democratizing Advanced Reasoning in AI', 'desc': 'This paper introduces a modular framework for Reasoning Language Models (RLMs), which enhance traditional Large Language Models (LLMs) with advanced reasoning capabilities. The authors address the challenges of high costs and complex architectures by organizing RLM components into a comprehensive blueprint that includes various reasoning structures and strategies. They provide mathematical formulations and algorithmic specifications to facilitate easier implementation of RLMs. Additionally, the paper presents x1, a tool for rapid prototyping, and discusses how RLMs can be integrated into the larger LLM ecosystem to promote accessibility and innovation in AI development.'}, 'zh': {'title': '简化推理语言模型,促进AI创新', 'desc': '推理语言模型(RLMs)通过结合强化学习、搜索启发式和大型语言模型(LLMs),重新定义了人工智能的解决问题能力。尽管它们具有强大的推理机制,但高成本和复杂架构使得其可访问性和可扩展性面临挑战。为了解决这些问题,我们提出了一个模块化框架,组织RLM组件,并提供详细的数学公式和算法规范,以简化RLM的实现。我们的工作旨在降低RLM开发和实验的门槛,促进创新,缩小“富有AI”和“贫穷AI”之间的差距。'}}}, {'id': 'https://huggingface.co/papers/2501.12202', 'title': 'Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation', 'url': 'https://huggingface.co/papers/2501.12202', 'abstract': 'We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets. 
This system includes two foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly aligns with a given condition image, laying a solid foundation for downstream applications. The texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant texture maps for either generated or hand-crafted meshes. Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production platform that simplifies the re-creation process of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models, including the open-source models and closed-source models in geometry details, condition alignment, texture quality, and etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps in the open-source 3D community for large-scale foundation generative models. The code and pre-trained weights of our models are available at: https://github.com/Tencent/Hunyuan3D-2', 'score': 14, 'issue_id': 1798, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'f95f069cba0bd83e', 'authors': ['Zibo Zhao', 'Zeqiang Lai', 'Qingxiang Lin', 'Yunfei Zhao', 'Haolin Liu', 'Shuhui Yang', 'Yifei Feng', 'Mingxin Yang', 'Sheng Zhang', 'Xianghui Yang', 'Huiwen Shi', 'Sicong Liu', 'Junta Wu', 'Yihang Lian', 'Fan Yang', 'Ruining Tang', 'Zebin He', 'Xinzhou Wang', 'Jian Liu', 'Xuhui Zuo', 'Zhuo Chen', 'Biwen Lei', 'Haohan Weng', 'Jing Xu', 'Yiling Zhu', 'Xinhai Liu', 'Lixin Xu', 'Changrong Hu', 'Tianyu Huang', 'Lifu Wang', 'Jihong Zhang', 'Meng Chen', 'Liang Dong', 'Yiwen Jia', 'Yulin Cai', 'Jiaao Yu', 'Yixuan Tang', 'Hao Zhang', 'Zheng Ye', 'Peng He', 'Runzhou Wu', 'Chao Zhang', 'Yonghao Tan', 'Jie Xiao', 'Yangyu Tao', 'Jianchen Zhu', 'Jinbao Xue', 'Kai Liu', 'Chongqing Zhao', 'Xinming Wu', 'Zhichao Hu', 'Lei Qin', 'Jianbing Peng', 'Zhan Li', 'Minghui Chen', 'Xipeng Zhang', 'Lin Niu', 'Paige Wang', 'Yingkai Wang', 'Haozhao Kuang', 'Zhongyi Fan', 'Xu Zheng', 'Weihao Zhuang', 'YingPing He', 'Tian Liu', 'Yong Yang', 'Di Wang', 'Yuhong Liu', 'Jie Jiang', 'Jingwei Huang', 'Chunchao Guo'], 'affiliations': ['Tencent'], 'pdf_title_img': 'assets/pdf/title_img/2501.12202.jpg', 'data': {'categories': ['#diffusion', '#open_source', '#3d'], 'emoji': '🎨', 'ru': {'title': 'Революция в 3D-генерации: от формы к текстуре', 'desc': 'Hunyuan3D 2.0 - это продвинутая система для создания трехмерных текстурированных объектов высокого разрешения. Она состоит из двух основных компонентов: модели генерации форм Hunyuan3D-DiT и модели синтеза текстур Hunyuan3D-Paint. Модель генерации форм основана на масштабируемом диффузионном трансформере и создает геометрию, соответствующую заданному изображению. Модель синтеза текстур, используя геометрические и диффузионные праймы, создает высококачественные текстурные карты для сгенерированных или созданных вручную мешей.'}, 'en': {'title': 'Revolutionizing 3D Asset Creation with Hunyuan3D 2.0', 'desc': 'Hunyuan3D 2.0 is a sophisticated system designed for creating high-quality 3D models with detailed textures. It consists of two main components: Hunyuan3D-DiT for generating 3D shapes and Hunyuan3D-Paint for applying textures. 
The shape model uses a flow-based diffusion transformer to ensure that the generated geometry matches the input conditions, while the texture model leverages geometric and diffusion principles to create vibrant textures. This system not only enhances the quality of 3D assets but also provides an accessible platform for users to create and animate their models easily.'}, 'zh': {'title': 'Hunyuan3D 2.0:高效生成高质量3D资产的系统', 'desc': 'Hunyuan3D 2.0 是一个先进的大规模 3D 合成系统,能够生成高分辨率的纹理 3D 资产。该系统包含两个基础组件:Hunyuan3D-DiT 形状生成模型和 Hunyuan3D-Paint 纹理合成模型。形状生成模型基于可扩展的流式扩散变换器,旨在创建与给定条件图像相匹配的几何形状。纹理合成模型则利用强大的几何和扩散先验,为生成或手工制作的网格生成高分辨率的生动纹理图。'}}}, {'id': 'https://huggingface.co/papers/2501.12375', 'title': 'Video Depth Anything: Consistent Depth Estimation for Super-Long Videos', 'url': 'https://huggingface.co/papers/2501.12375', 'abstract': 'Depth Anything has achieved remarkable success in monocular depth estimation with strong generalization ability. However, it suffers from temporal inconsistency in videos, hindering its practical applications. Various methods have been proposed to alleviate this issue by leveraging video generation models or introducing priors from optical flow and camera poses. Nonetheless, these methods are only applicable to short videos (< 10 seconds) and require a trade-off between quality and computational efficiency. We propose Video Depth Anything for high-quality, consistent depth estimation in super-long videos (over several minutes) without sacrificing efficiency. We base our model on Depth Anything V2 and replace its head with an efficient spatial-temporal head. We design a straightforward yet effective temporal consistency loss by constraining the temporal depth gradient, eliminating the need for additional geometric priors. The model is trained on a joint dataset of video depth and unlabeled images, similar to Depth Anything V2. Moreover, a novel key-frame-based strategy is developed for long video inference. Experiments show that our model can be applied to arbitrarily long videos without compromising quality, consistency, or generalization ability. Comprehensive evaluations on multiple video benchmarks demonstrate that our approach sets a new state-of-the-art in zero-shot video depth estimation. We offer models of different scales to support a range of scenarios, with our smallest model capable of real-time performance at 30 FPS.', 'score': 13, 'issue_id': 1798, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '00640fb6adcf39e3', 'authors': ['Sili Chen', 'Hengkai Guo', 'Shengnan Zhu', 'Feihu Zhang', 'Zilong Huang', 'Jiashi Feng', 'Bingyi Kang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.12375.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#small_models', '#video', '#cv', '#training'], 'emoji': '🎥', 'ru': {'title': 'Согласованная оценка глубины для сверхдлинных видео', 'desc': 'В статье представлен метод Video Depth Anything для оценки глубины в сверхдлинных видео с высоким качеством и временной согласованностью. Модель основана на Depth Anything V2 с новой пространственно-временной головой и использует эффективную функцию потерь для обеспечения временной согласованности. Предложенный подход позволяет обрабатывать видео произвольной длительности без ущерба для качества и обобщающей способности. 
Метод достигает наилучших результатов в задаче zero-shot оценки глубины видео на нескольких бенчмарках.'}, 'en': {'title': 'Achieving Consistent Depth Estimation in Long Videos', 'desc': 'This paper introduces Video Depth Anything, a model designed for accurate depth estimation in long videos, overcoming the limitations of previous methods that struggled with temporal consistency. The model builds on Depth Anything V2, enhancing it with a spatial-temporal head and a novel temporal consistency loss that focuses on the depth gradient over time. By training on a combined dataset of video depth and unlabeled images, the model achieves high-quality depth estimation without the need for complex geometric priors. The results demonstrate that Video Depth Anything can handle videos of any length while maintaining efficiency and setting new benchmarks in zero-shot video depth estimation.'}, 'zh': {'title': '超长视频深度估计的新突破', 'desc': '本文提出了一种名为Video Depth Anything的新模型,旨在解决单目深度估计在视频中的时间一致性问题。该模型能够在超长视频(超过几分钟)中实现高质量和一致性的深度估计,而不牺牲计算效率。我们通过设计一个简单有效的时间一致性损失,来约束时间深度梯度,从而避免了额外几何先验的需求。实验结果表明,该模型在多个视频基准测试中表现出色,设定了零-shot视频深度估计的新状态。'}}}, {'id': 'https://huggingface.co/papers/2501.10893', 'title': 'Learn-by-interact: A Data-Centric Framework for Self-Adaptive Agents in Realistic Environments', 'url': 'https://huggingface.co/papers/2501.10893', 'abstract': 'Autonomous agents powered by large language models (LLMs) have the potential to enhance human capabilities, assisting with digital tasks from sending emails to performing data analysis. The abilities of existing LLMs at such tasks are often hindered by the lack of high-quality agent data from the corresponding environments they interact with. We propose Learn-by-interact, a data-centric framework to adapt LLM agents to any given environments without human annotations. Learn-by-interact synthesizes trajectories of agent-environment interactions based on documentations, and constructs instructions by summarizing or abstracting the interaction histories, a process called backward construction. We assess the quality of our synthetic data by using them in both training-based scenarios and training-free in-context learning (ICL), where we craft innovative retrieval approaches optimized for agents. Extensive experiments on SWE-bench, WebArena, OSWorld and Spider2-V spanning across realistic coding, web, and desktop environments show the effectiveness of Learn-by-interact in various downstream agentic tasks -- baseline results are improved by up to 12.2% for ICL with Claude-3.5 and 19.5% for training with Codestral-22B. We further demonstrate the critical role of backward construction, which provides up to 14.0% improvement for training. Our ablation studies demonstrate the efficiency provided by our synthesized data in ICL and the superiority of our retrieval pipeline over alternative approaches like conventional retrieval-augmented generation (RAG). We expect that Learn-by-interact will serve as a foundation for agent data synthesis as LLMs are increasingly deployed at real-world environments.', 'score': 13, 'issue_id': 1798, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': 'b6ab4c9ac3809941', 'authors': ['Hongjin Su', 'Ruoxi Sun', 'Jinsung Yoon', 'Pengcheng Yin', 'Tao Yu', 'Sercan Ö. 
Arık'], 'affiliations': ['Google', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.10893.jpg', 'data': {'categories': ['#optimization', '#agents', '#synthetic', '#training', '#data', '#rag', '#dataset'], 'emoji': '🤖', 'ru': {'title': 'Обучение ИИ-агентов через синтетическое взаимодействие', 'desc': 'Статья представляет Learn-by-interact - фреймворк для адаптации агентов на основе больших языковых моделей (LLM) к различным средам без аннотаций человека. Метод синтезирует траектории взаимодействия агента со средой на основе документации и создает инструкции путем обобщения истории взаимодействий. Эксперименты показывают эффективность подхода в различных задачах, улучшая базовые результаты до 19.5% при обучении. Авторы демонстрируют критическую роль обратного конструирования и превосходство их метода над альтернативными подходами.'}, 'en': {'title': 'Empowering LLM Agents through Synthetic Interaction Data', 'desc': "This paper introduces Learn-by-interact, a framework designed to enhance the performance of large language model (LLM) agents in various environments without needing human-generated data. The framework generates synthetic data by simulating interactions between agents and their environments, using documentation to guide the process. A key innovation is the backward construction method, which summarizes interaction histories to create effective instructions for the agents. Experimental results show significant improvements in agent performance across multiple tasks, highlighting the framework's potential for real-world applications."}, 'zh': {'title': '通过交互学习,提升智能代理能力', 'desc': '本文提出了一种名为Learn-by-interact的数据中心框架,旨在使大型语言模型(LLMs)能够适应不同的环境,而无需人工标注。该框架通过文档生成代理与环境交互的轨迹,并通过总结或抽象交互历史来构建指令,这一过程称为反向构建。实验结果表明,Learn-by-interact在多种下游任务中显著提高了性能,尤其是在无监督学习和训练场景中。我们还展示了反向构建在训练中的重要性,进一步验证了合成数据的有效性和检索管道的优越性。'}}}, {'id': 'https://huggingface.co/papers/2501.08331', 'title': 'Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise', 'url': 'https://huggingface.co/papers/2501.08331', 'abstract': 'Generative modeling aims to transform random noise into structured outputs. In this work, we enhance video diffusion models by allowing motion control via structured latent noise sampling. This is achieved by just a change in data: we pre-process training videos to yield structured noise. Consequently, our method is agnostic to diffusion model design, requiring no changes to model architectures or training pipelines. Specifically, we propose a novel noise warping algorithm, fast enough to run in real time, that replaces random temporal Gaussianity with correlated warped noise derived from optical flow fields, while preserving the spatial Gaussianity. The efficiency of our algorithm enables us to fine-tune modern video diffusion base models using warped noise with minimal overhead, and provide a one-stop solution for a wide range of user-friendly motion control: local object motion control, global camera movement control, and motion transfer. The harmonization between temporal coherence and spatial Gaussianity in our warped noise leads to effective motion control while maintaining per-frame pixel quality. Extensive experiments and user studies demonstrate the advantages of our method, making it a robust and scalable approach for controlling motion in video diffusion models. Video results are available on our webpage: https://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow. 
Source code and model checkpoints are available on GitHub: https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow.', 'score': 11, 'issue_id': 1798, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'c48e19ef08e8d758', 'authors': ['Ryan Burgert', 'Yuancheng Xu', 'Wenqi Xian', 'Oliver Pilarski', 'Pascal Clausen', 'Mingming He', 'Li Ma', 'Yitong Deng', 'Lingxiao Li', 'Mohsen Mousavi', 'Michael Ryoo', 'Paul Debevec', 'Ning Yu'], 'affiliations': ['Eyeline Studios', 'Netflix', 'Stanford University', 'Stony Brook University', 'University of Maryland'], 'pdf_title_img': 'assets/pdf/title_img/2501.08331.jpg', 'data': {'categories': ['#diffusion', '#video', '#data'], 'emoji': '🎬', 'ru': {'title': 'Контроль движения в видео-диффузии через структурированный шум', 'desc': 'Исследователи предложили метод улучшения видео-диффузионных моделей путем изменения структуры шумовых данных при обучении. Они разработали алгоритм искажения шума в реальном времени, который сохраняет пространственную гауссовость, но вводит временную корреляцию на основе оптического потока. Этот подход позволяет контролировать движение в генерируемых видео без изменения архитектуры модели. Эксперименты показали эффективность метода для управления локальным движением объектов, глобальным движением камеры и переносом движения.'}, 'en': {'title': 'Transforming Noise into Motion: Enhanced Control in Video Diffusion Models', 'desc': 'This paper presents an improvement in video diffusion models by introducing a method for controlling motion through structured latent noise sampling. The authors propose a novel noise warping algorithm that modifies the training data to replace random noise with correlated noise based on optical flow, enhancing temporal coherence while maintaining spatial quality. This approach allows for real-time processing and fine-tuning of existing video diffusion models without altering their architecture or training methods. The results show that this method effectively enables various motion control tasks, making it a versatile tool for video generation applications.'}, 'zh': {'title': '运动控制的新方法:扭曲噪声的力量', 'desc': '生成建模的目标是将随机噪声转化为结构化输出。本文通过结构化潜在噪声采样增强视频扩散模型,实现了运动控制。我们提出了一种新颖的噪声扭曲算法,能够实时运行,并用光流场导出的相关扭曲噪声替代随机时间高斯噪声,同时保持空间高斯性。我们的算法高效性使得在现代视频扩散基础模型中使用扭曲噪声进行微调成为可能,提供了用户友好的运动控制解决方案。'}}}, {'id': 'https://huggingface.co/papers/2501.12273', 'title': 'Condor: Enhance LLM Alignment with Knowledge-Driven Data Synthesis and Refinement', 'url': 'https://huggingface.co/papers/2501.12273', 'abstract': 'The quality of Supervised Fine-Tuning (SFT) data plays a critical role in enhancing the conversational capabilities of Large Language Models (LLMs). However, as LLMs become more advanced, the availability of high-quality human-annotated SFT data has become a significant bottleneck, necessitating a greater reliance on synthetic training data. In this work, we introduce Condor, a novel two-stage synthetic data generation framework that incorporates World Knowledge Tree and Self-Reflection Refinement to produce high-quality SFT data at scale. Our experimental results demonstrate that a base model fine-tuned on only 20K Condor-generated samples achieves superior performance compared to counterparts. The additional refinement stage in Condor further enables iterative self-improvement for LLMs at various scales (up to 72B), validating the effectiveness of our approach. 
Furthermore, our investigation into the scaling for synthetic data in post-training reveals substantial unexplored potential for performance improvements, opening promising avenues for future research.', 'score': 11, 'issue_id': 1796, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '10499c8b820d5368', 'authors': ['Maosong Cao', 'Taolin Zhang', 'Mo Li', 'Chuyu Zhang', 'Yunxin Liu', 'Haodong Duan', 'Songyang Zhang', 'Kai Chen'], 'affiliations': ['Shanghai AI Laboratory', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12273.jpg', 'data': {'categories': ['#optimization', '#synthetic', '#data', '#dataset', '#training'], 'emoji': '🦅', 'ru': {'title': 'Condor: прорыв в создании синтетических данных для обучения языковых моделей', 'desc': 'В статье представлен Condor - новый фреймворк для генерации синтетических данных для обучения больших языковых моделей (LLM). Он использует дерево мировых знаний и самоанализ для создания высококачественных обучающих данных. Эксперименты показали, что модель, обученная на 20 тысячах сгенерированных Condor примеров, превосходит аналоги. Исследование также выявило потенциал для улучшения производительности LLM при масштабировании синтетических данных.'}, 'en': {'title': 'Unlocking LLM Potential with Synthetic Data Generation', 'desc': 'This paper addresses the challenge of obtaining high-quality Supervised Fine-Tuning (SFT) data for Large Language Models (LLMs). It presents Condor, a two-stage framework that generates synthetic training data using World Knowledge Tree and Self-Reflection Refinement techniques. The results show that models fine-tuned with just 20,000 samples from Condor outperform those trained with traditional methods. Additionally, the framework allows for iterative self-improvement, suggesting significant potential for enhancing LLM performance through synthetic data.'}, 'zh': {'title': '合成数据生成,提升对话能力的关键', 'desc': '本论文探讨了监督微调(SFT)数据的质量对大型语言模型(LLMs)对话能力的重要性。随着LLMs的进步,高质量的人类标注SFT数据变得稀缺,因此需要更多依赖合成训练数据。我们提出了一种名为Condor的两阶段合成数据生成框架,结合了世界知识树和自我反思精炼,以大规模生成高质量的SFT数据。实验结果表明,仅用20K个Condor生成的样本微调的基础模型,其性能优于其他模型,验证了我们方法的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.10687', 'title': 'EMO2: End-Effector Guided Audio-Driven Avatar Video Generation', 'url': 'https://huggingface.co/papers/2501.10687', 'abstract': 'In this paper, we propose a novel audio-driven talking head method capable of simultaneously generating highly expressive facial expressions and hand gestures. Unlike existing methods that focus on generating full-body or half-body poses, we investigate the challenges of co-speech gesture generation and identify the weak correspondence between audio features and full-body gestures as a key limitation. To address this, we redefine the task as a two-stage process. In the first stage, we generate hand poses directly from audio input, leveraging the strong correlation between audio signals and hand movements. In the second stage, we employ a diffusion model to synthesize video frames, incorporating the hand poses generated in the first stage to produce realistic facial expressions and body movements. Our experimental results demonstrate that the proposed method outperforms state-of-the-art approaches, such as CyberHost and Vlogger, in terms of both visual quality and synchronization accuracy. 
This work provides a new perspective on audio-driven gesture generation and a robust framework for creating expressive and natural talking head animations.', 'score': 9, 'issue_id': 1798, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': '13c0931101eb51eb', 'authors': ['Linrui Tian', 'Siqi Hu', 'Qi Wang', 'Bang Zhang', 'Liefeng Bo'], 'affiliations': ['Institute for Intelligent Computing, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10687.jpg', 'data': {'categories': ['#multimodal', '#audio', '#video', '#games', '#diffusion'], 'emoji': '🗣️', 'ru': {'title': 'Революция в анимации: от звука к выразительным жестам', 'desc': 'В статье предлагается новый метод создания говорящей головы на основе аудио, способный одновременно генерировать выразительные мимику и жесты рук. Авторы определяют задачу как двухэтапный процесс: сначала генерируются позы рук непосредственно из аудиовхода, затем применяется диффузионная модель для синтеза видеокадров. Экспериментальные результаты показывают, что предложенный метод превосходит современные подходы по качеству изображения и точности синхронизации. Работа предоставляет новый взгляд на генерацию жестов на основе аудио и надежную основу для создания выразительных и естественных анимаций говорящей головы.'}, 'en': {'title': 'Expressive Talking Heads: Bridging Audio and Gesture Generation', 'desc': 'This paper introduces a new method for creating talking head animations that are driven by audio. It focuses on generating both facial expressions and hand gestures, addressing the limitations of previous methods that often overlook the connection between audio and gestures. The approach is divided into two stages: first, it generates hand poses from audio signals, and then it uses a diffusion model to create video frames that combine these hand poses with realistic facial movements. The results show that this method is more effective than existing techniques, providing better visual quality and synchronization with the audio.'}, 'zh': {'title': '音频驱动的生动表情与手势生成新方法', 'desc': '本文提出了一种新颖的音频驱动的说话头方法,能够同时生成高度表现力的面部表情和手势。与现有方法不同,我们关注于共语手势生成的挑战,并识别音频特征与全身手势之间的弱对应关系。为了解决这个问题,我们将任务重新定义为两个阶段:第一阶段直接从音频输入生成手势,第二阶段使用扩散模型合成视频帧,结合第一阶段生成的手势,产生逼真的面部表情和身体动作。实验结果表明,该方法在视觉质量和同步精度方面优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.12390', 'title': 'GPS as a Control Signal for Image Generation', 'url': 'https://huggingface.co/papers/2501.12390', 'abstract': 'We show that the GPS tags contained in photo metadata provide a useful control signal for image generation. We train GPS-to-image models and use them for tasks that require a fine-grained understanding of how images vary within a city. In particular, we train a diffusion model to generate images conditioned on both GPS and text. The learned model generates images that capture the distinctive appearance of different neighborhoods, parks, and landmarks. We also extract 3D models from 2D GPS-to-image models through score distillation sampling, using GPS conditioning to constrain the appearance of the reconstruction from each viewpoint. Our evaluations suggest that our GPS-conditioned models successfully learn to generate images that vary based on location, and that GPS conditioning improves estimated 3D structure.', 'score': 9, 'issue_id': 1797, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '11d289e8a895bedd', 'authors': ['Chao Feng', 'Ziyang Chen', 'Aleksander Holynski', 'Alexei A. 
Efros', 'Andrew Owens'], 'affiliations': ['UC Berkeley', 'University of Michigan'], 'pdf_title_img': 'assets/pdf/title_img/2501.12390.jpg', 'data': {'categories': ['#synthetic', '#cv', '#multimodal', '#dataset', '#diffusion', '#3d'], 'emoji': '🗺️', 'ru': {'title': 'GPS-метки открывают новые горизонты в генерации изображений и 3D-моделировании', 'desc': 'Исследователи демонстрируют, как GPS-метки в метаданных фотографий могут использоваться для улучшения генерации изображений. Они обучают модели диффузии, генерирующие изображения на основе GPS-координат и текста, что позволяет точно отображать особенности различных районов и достопримечательностей. Авторы также извлекают 3D-модели из 2D GPS-моделей с помощью методики score distillation sampling. Результаты показывают, что GPS-обусловленные модели успешно генерируют изображения, варьирующиеся в зависимости от местоположения, и улучшают оценку 3D-структуры.'}, 'en': {'title': 'Harnessing GPS Data for Location-Aware Image Generation', 'desc': 'This paper explores the use of GPS data embedded in photo metadata as a control signal for generating images. The authors develop GPS-to-image models, particularly a diffusion model, that can create images based on both GPS coordinates and textual descriptions. The model effectively captures the unique characteristics of various urban environments, such as neighborhoods and landmarks. Additionally, they demonstrate the ability to extract 3D models from these images, enhancing the accuracy of 3D reconstructions by using GPS information to guide the process.'}, 'zh': {'title': '利用GPS标签生成城市图像的创新方法', 'desc': '本文展示了照片元数据中的GPS标签可以作为图像生成的有用控制信号。我们训练了GPS到图像的模型,并将其应用于需要细致理解城市中图像变化的任务。特别地,我们训练了一个扩散模型,生成同时依赖于GPS和文本的图像。评估结果表明,我们的GPS条件模型成功学习了基于位置生成变化图像,并且GPS条件改善了估计的3D结构。'}}}, {'id': 'https://huggingface.co/papers/2501.10057', 'title': 'MSTS: A Multimodal Safety Test Suite for Vision-Language Models', 'url': 'https://huggingface.co/papers/2501.10057', 'abstract': 'Vision-language models (VLMs), which process image and text inputs, are increasingly integrated into chat assistants and other consumer AI applications. Without proper safeguards, however, VLMs may give harmful advice (e.g. how to self-harm) or encourage unsafe behaviours (e.g. to consume drugs). Despite these clear hazards, little work so far has evaluated VLM safety and the novel risks created by multimodal inputs. To address this gap, we introduce MSTS, a Multimodal Safety Test Suite for VLMs. MSTS comprises 400 test prompts across 40 fine-grained hazard categories. Each test prompt consists of a text and an image that only in combination reveal their full unsafe meaning. With MSTS, we find clear safety issues in several open VLMs. We also find some VLMs to be safe by accident, meaning that they are safe because they fail to understand even simple test prompts. We translate MSTS into ten languages, showing non-English prompts to increase the rate of unsafe model responses. We also show models to be safer when tested with text only rather than multimodal prompts. 
Finally, we explore the automation of VLM safety assessments, finding even the best safety classifiers to be lacking.', 'score': 7, 'issue_id': 1802, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '05ea9cad57d3e1e6', 'authors': ['Paul Röttger', 'Giuseppe Attanasio', 'Felix Friedrich', 'Janis Goldzycher', 'Alicia Parrish', 'Rishabh Bhardwaj', 'Chiara Di Bonaventura', 'Roman Eng', 'Gaia El Khoury Geagea', 'Sujata Goswami', 'Jieun Han', 'Dirk Hovy', 'Seogyeong Jeong', 'Paloma Jeretič', 'Flor Miriam Plaza-del-Arco', 'Donya Rooein', 'Patrick Schramowski', 'Anastassia Shaitarova', 'Xudong Shen', 'Richard Willats', 'Andrea Zugarini', 'Bertie Vidgen'], 'affiliations': ['Bocconi University', 'CERTAIN', 'Clarkson University', 'Contextual AI', 'DFKI', 'Expert.ai', 'Google DeepMind', 'Hessian.AI', 'Imperial College London', 'Instituto de Telecomunicações', 'KAIST', 'Kings College London', 'Lawrence Berkeley National Laboratory', 'National University of Singapore', 'TU Darmstadt', 'University of Pennsylvania', 'University of Zurich', 'Walled AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.10057.jpg', 'data': {'categories': ['#security', '#dataset', '#benchmark', '#multimodal', '#ethics', '#multilingual'], 'emoji': '🔍', 'ru': {'title': 'Новый подход к оценке безопасности мультимодальных ИИ-моделей', 'desc': 'Статья представляет новый набор тестов MSTS для оценки безопасности мультимодальных моделей, работающих с изображениями и текстом. MSTS содержит 400 тестовых запросов в 40 категориях опасностей, где небезопасный смысл раскрывается только при сочетании текста и изображения. Исследование выявило проблемы безопасности в нескольких открытых мультимодальных моделях, а также показало, что некоторые модели безопасны случайно из-за непонимания даже простых запросов. Авторы также обнаружили, что модели менее безопасны при тестировании на других языках и с мультимодальными запросами по сравнению с только текстовыми.'}, 'en': {'title': 'Ensuring Safety in Vision-Language Models: A New Testing Approach', 'desc': 'This paper discusses the safety concerns associated with Vision-Language Models (VLMs) that combine image and text inputs. It introduces the Multimodal Safety Test Suite (MSTS), which includes 400 test prompts designed to evaluate the safety of VLMs across various hazard categories. The study reveals that many VLMs exhibit safety issues when processing multimodal inputs, while some are inadvertently safe due to their inability to comprehend simple prompts. Additionally, the research highlights the challenges in automating safety assessments for VLMs, indicating that even the most advanced safety classifiers have limitations.'}, 'zh': {'title': '确保视觉语言模型安全的关键测试', 'desc': '本文介绍了一种多模态安全测试套件(MSTS),用于评估视觉语言模型(VLMs)的安全性。MSTS包含400个测试提示,涵盖40个细分的危险类别,每个提示由文本和图像组合而成,以揭示其潜在的危险含义。研究发现,许多开放的VLM在安全性方面存在明显问题,而一些模型由于无法理解简单提示而意外地表现出安全性。此外,测试结果表明,单一文本提示的安全性高于多模态提示,且现有的安全分类器在自动化评估中仍存在不足。'}}}, {'id': 'https://huggingface.co/papers/2501.10573', 'title': 'The Geometry of Tokens in Internal Representations of Large Language Models', 'url': 'https://huggingface.co/papers/2501.10573', 'abstract': 'We investigate the relationship between the geometry of token embeddings and their role in the next token prediction within transformer models. 
An important aspect of this connection uses the notion of empirical measure, which encodes the distribution of token point clouds across transformer layers and drives the evolution of token representations in the mean-field interacting picture. We use metrics such as intrinsic dimension, neighborhood overlap, and cosine similarity to observationally probe these empirical measures across layers. To validate our approach, we compare these metrics to a dataset where the tokens are shuffled, which disrupts the syntactic and semantic structure. Our findings reveal a correlation between the geometric properties of token embeddings and the cross-entropy loss of next token predictions, implying that prompts with higher loss values have tokens represented in higher-dimensional spaces.', 'score': 5, 'issue_id': 1807, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '1b34301e721ccccd', 'authors': ['Karthik Viswanathan', 'Yuri Gardinazzi', 'Giada Panerai', 'Alberto Cazzaniga', 'Matteo Biagetti'], 'affiliations': ['Area Science Park, Trieste, Italy', 'University of Amsterdam, Amsterdam, the Netherlands', 'University of Trieste, Trieste, Italy'], 'pdf_title_img': 'assets/pdf/title_img/2501.10573.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#training', '#dataset', '#interpretability', '#data', '#architecture'], 'emoji': '🧮', 'ru': {'title': 'Геометрия вложений токенов раскрывает тайны предсказания в трансформерах', 'desc': 'Исследование посвящено связи между геометрией вложений токенов и их ролью в предсказании следующего токена в трансформерных моделях. Авторы используют понятие эмпирической меры для анализа распределения облаков точек токенов по слоям трансформера. Они применяют метрики, такие как внутренняя размерность, перекрытие окрестностей и косинусное сходство, для изучения этих эмпирических мер. Результаты показывают корреляцию между геометрическими свойствами вложений токенов и кросс-энтропийной функцией потерь при предсказании следующего токена.'}, 'en': {'title': 'Geometry Matters: Token Embeddings Shape Prediction Success', 'desc': 'This paper explores how the shape and arrangement of token embeddings affect the ability of transformer models to predict the next token in a sequence. It introduces the concept of empirical measure to analyze how token representations change across different layers of the model. By examining metrics like intrinsic dimension and cosine similarity, the authors investigate the geometric properties of these embeddings. The results show that tokens associated with higher prediction errors are represented in more complex, higher-dimensional spaces, highlighting the importance of geometry in language modeling.'}, 'zh': {'title': '标记嵌入几何与预测损失的关系', 'desc': '本文研究了在变换器模型中,标记嵌入的几何形状与下一个标记预测之间的关系。我们使用经验测度的概念来编码标记点云在变换器层中的分布,并驱动标记表示的演变。通过内在维度、邻域重叠和余弦相似度等指标,我们观察了这些经验测度在各层之间的变化。研究结果表明,标记嵌入的几何特性与下一个标记预测的交叉熵损失之间存在相关性,损失值较高的提示对应的标记在更高维空间中表示。'}}}, {'id': 'https://huggingface.co/papers/2501.11900', 'title': 'Panoramic Interests: Stylistic-Content Aware Personalized Headline Generation', 'url': 'https://huggingface.co/papers/2501.11900', 'abstract': "Personalized news headline generation aims to provide users with attention-grabbing headlines that are tailored to their preferences. Prevailing methods focus on user-oriented content preferences, but most of them overlook the fact that diverse stylistic preferences are integral to users' panoramic interests, leading to suboptimal personalization. 
In view of this, we propose a novel Stylistic-Content Aware Personalized Headline Generation (SCAPE) framework. SCAPE extracts both content and stylistic features from headlines with the aid of large language model (LLM) collaboration. It further adaptively integrates users' long- and short-term interests through a contrastive learning-based hierarchical fusion network. By incorporating the panoramic interests into the headline generator, SCAPE reflects users' stylistic-content preferences during the generation process. Extensive experiments on the real-world dataset PENS demonstrate the superiority of SCAPE over baselines.", 'score': 3, 'issue_id': 1805, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'af7a432a54575398', 'authors': ['Junhong Lian', 'Xiang Ao', 'Xinyu Liu', 'Yang Liu', 'Qing He'], 'affiliations': ['Institute of Computing Technology, Chinese Academy of Sciences', 'Key Lab of Intelligent Information Processing of Chinese Academy of Sciences (CAS)'], 'pdf_title_img': 'assets/pdf/title_img/2501.11900.jpg', 'data': {'categories': ['#multimodal', '#training', '#story_generation', '#dataset'], 'emoji': '📰', 'ru': {'title': 'SCAPE: персонализация заголовков с учетом стиля и содержания', 'desc': 'Эта статья представляет новый подход к генерации персонализированных заголовков новостей, называемый SCAPE. Фреймворк SCAPE учитывает как содержательные, так и стилистические предпочтения пользователей с помощью большой языковой модели. Он адаптивно интегрирует долгосрочные и краткосрочные интересы пользователей через иерархическую сеть слияния на основе контрастного обучения. Эксперименты на реальном датасете PENS демонстрируют превосходство SCAPE над базовыми методами.'}, 'en': {'title': 'Tailored Headlines: Merging Style and Content for Personalization', 'desc': "This paper introduces a new framework called SCAPE for generating personalized news headlines that cater to both content and stylistic preferences of users. Unlike previous methods that primarily focus on content, SCAPE recognizes the importance of diverse stylistic choices in enhancing personalization. The framework utilizes large language models to extract relevant features and employs a contrastive learning-based hierarchical fusion network to integrate users' interests over time. Experimental results on the PENS dataset show that SCAPE outperforms existing approaches in generating more appealing and tailored headlines."}, 'zh': {'title': '个性化标题生成的新视角:风格与内容的结合', 'desc': '个性化新闻标题生成旨在为用户提供吸引眼球的标题,符合他们的偏好。现有方法主要关注用户的内容偏好,但往往忽视了用户多样化的风格偏好,这导致个性化效果不佳。为此,我们提出了一种新颖的风格内容感知个性化标题生成框架(SCAPE)。SCAPE通过大型语言模型提取标题的内容和风格特征,并通过对比学习的层次融合网络自适应整合用户的长期和短期兴趣,从而在生成过程中反映用户的风格内容偏好。'}}}, {'id': 'https://huggingface.co/papers/2501.12389', 'title': 'Taming Teacher Forcing for Masked Autoregressive Video Generation', 'url': 'https://huggingface.co/papers/2501.12389', 'abstract': 'We introduce MAGI, a hybrid video generation framework that combines masked modeling for intra-frame generation with causal modeling for next-frame generation. Our key innovation, Complete Teacher Forcing (CTF), conditions masked frames on complete observation frames rather than masked ones (namely Masked Teacher Forcing, MTF), enabling a smooth transition from token-level (patch-level) to frame-level autoregressive generation. CTF significantly outperforms MTF, achieving a +23% improvement in FVD scores on first-frame conditioned video prediction. 
To address issues like exposure bias, we employ targeted training strategies, setting a new benchmark in autoregressive video generation. Experiments show that MAGI can generate long, coherent video sequences exceeding 100 frames, even when trained on as few as 16 frames, highlighting its potential for scalable, high-quality video generation.', 'score': 1, 'issue_id': 1813, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': '43a9c17394f0d637', 'authors': ['Deyu Zhou', 'Quan Sun', 'Yuang Peng', 'Kun Yan', 'Runpei Dong', 'Duomin Wang', 'Zheng Ge', 'Nan Duan', 'Xiangyu Zhang', 'Lionel M. Ni', 'Heung-Yeung Shum'], 'affiliations': ['HKUST', 'HKUST(GZ)', 'StepFun', 'THU', 'UIUC'], 'pdf_title_img': 'assets/pdf/title_img/2501.12389.jpg', 'data': {'categories': ['#training', '#video', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'MAGI: Революция в автоматической генерации видео', 'desc': 'MAGI - это гибридная система генерации видео, объединяющая маскированное моделирование для внутрикадровой генерации и каузальное моделирование для генерации следующего кадра. Ключевое нововведение - Complete Teacher Forcing (CTF), которое обусловливает маскированные кадры полными наблюдаемыми кадрами, а не маскированными. CTF значительно превосходит Masked Teacher Forcing (MTF), улучшая показатели FVD на 23% при прогнозировании видео на основе первого кадра. MAGI способна генерировать длинные, связные видеопоследовательности, превышающие 100 кадров, даже при обучении на всего 16 кадрах.'}, 'en': {'title': 'MAGI: Revolutionizing Video Generation with Complete Teacher Forcing', 'desc': 'MAGI is a new framework for generating videos that uses two main techniques: masked modeling for creating individual frames and causal modeling for predicting the next frame. The innovative approach called Complete Teacher Forcing (CTF) improves the process by using fully observed frames to guide the generation, rather than just partially masked frames. This method leads to a significant performance boost, as evidenced by a 23% increase in FVD scores compared to previous methods. Additionally, MAGI can produce long and coherent video sequences, demonstrating its effectiveness even with limited training data.'}, 'zh': {'title': 'MAGI:高效视频生成的新突破', 'desc': '本文介绍了一种名为MAGI的混合视频生成框架,它结合了掩码建模用于帧内生成和因果建模用于下一帧生成。我们提出的关键创新是完整教师强制(CTF),它基于完整观察帧而非掩码帧来条件化掩码帧,从而实现从标记级到帧级自回归生成的平滑过渡。CTF在第一帧条件视频预测中显著优于掩码教师强制(MTF),FVD分数提高了23%。实验表明,MAGI能够生成超过100帧的长时间连贯视频序列,即使在仅用16帧训练的情况下,也展现了其可扩展性和高质量视频生成的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.12206', 'title': 'Fixing Imbalanced Attention to Mitigate In-Context Hallucination of Large Vision-Language Model', 'url': 'https://huggingface.co/papers/2501.12206', 'abstract': 'Large Vision Language Models (LVLMs) have demonstrated remarkable capabilities in understanding and describing visual content, achieving state-of-the-art performance across various vision-language tasks. However, these models frequently exhibit hallucination behavior, where they generate descriptions containing objects or details absent in the input image. Our work investigates this phenomenon by analyzing attention patterns across transformer layers and heads, revealing that hallucinations often stem from progressive degradation of visual grounding in deeper layers. We propose a novel attention modification approach that combines selective token emphasis and head-specific modulation to maintain visual grounding throughout the generation process. 
Our method introduces two key components: (1) a dual-stream token selection mechanism that identifies and prioritizes both locally informative and spatially significant visual tokens, and (2) an attention head-specific modulation strategy that differentially amplifies visual information processing based on measured visual sensitivity of individual attention heads. Through extensive experimentation on the MSCOCO dataset, we demonstrate that our approach reduces hallucination rates by up to 62.3% compared to baseline models while maintaining comparable task performance. Our analysis reveals that selectively modulating tokens across attention heads with varying levels of visual sensitivity can significantly improve visual grounding without requiring model retraining.', 'score': 0, 'issue_id': 1812, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'd37fc59e414ab903', 'authors': ['Kazi Hasan Ibn Arif', 'Sajib Acharjee Dip', 'Khizar Hussain', 'Lang Zhang', 'Chris Thomas'], 'affiliations': ['Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.12206.jpg', 'data': {'categories': ['#multimodal', '#dataset', '#interpretability', '#architecture', '#cv', '#hallucinations'], 'emoji': '👁️', 'ru': {'title': 'Улучшение визуальной привязки для снижения галлюцинаций в LVLM', 'desc': 'Данная статья исследует проблему галлюцинаций в крупных визуально-языковых моделях (LVLM) при описании изображений. Авторы анализируют паттерны внимания в слоях трансформера и обнаруживают, что галлюцинации часто возникают из-за ослабления визуальной привязки в глубоких слоях. Предлагается новый подход модификации внимания, сочетающий выборочное усиление токенов и модуляцию головок внимания для сохранения визуальной привязки. Эксперименты показывают, что метод снижает уровень галлюцинаций на 62.3% по сравнению с базовыми моделями.'}, 'en': {'title': 'Enhancing Visual Grounding to Combat Hallucinations in LVLMs', 'desc': "This paper addresses the issue of hallucination in Large Vision Language Models (LVLMs), where the models generate incorrect descriptions that include non-existent objects. The authors analyze attention patterns in transformer layers to understand how visual grounding deteriorates in deeper layers, leading to these hallucinations. They propose a new method that enhances attention by focusing on important visual tokens and adjusting how different attention heads process visual information. Their experiments show that this approach can significantly reduce hallucination rates while keeping the model's performance on tasks intact."}, 'zh': {'title': '减少幻觉,提升视觉理解!', 'desc': '大型视觉语言模型(LVLMs)在理解和描述视觉内容方面表现出色,但它们常常会产生幻觉行为,即生成的描述中包含输入图像中不存在的对象或细节。我们的研究分析了变换器层和头部的注意力模式,发现幻觉通常源于深层次的视觉基础逐渐退化。我们提出了一种新的注意力修改方法,结合选择性标记强调和头部特定调制,以在生成过程中保持视觉基础。通过在MSCOCO数据集上的广泛实验,我们的方法将幻觉率降低了多达62.3%,同时保持了相似的任务性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02976', 'title': 'STAR: Spatial-Temporal Augmentation with Text-to-Video Models for Real-World Video Super-Resolution', 'url': 'https://huggingface.co/papers/2501.02976', 'abstract': 'Image diffusion models have been adapted for real-world video super-resolution to tackle over-smoothing issues in GAN-based methods. However, these models struggle to maintain temporal consistency, as they are trained on static images, limiting their ability to capture temporal dynamics effectively. Integrating text-to-video (T2V) models into video super-resolution for improved temporal modeling is straightforward. 
However, two key challenges remain: artifacts introduced by complex degradations in real-world scenarios, and compromised fidelity due to the strong generative capacity of powerful T2V models (e.g., CogVideoX-5B). To enhance the spatio-temporal quality of restored videos, we introduce STAR (Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution), a novel approach that leverages T2V models for real-world video super-resolution, achieving realistic spatial details and robust temporal consistency. Specifically, we introduce a Local Information Enhancement Module (LIEM) before the global attention block to enrich local details and mitigate degradation artifacts. Moreover, we propose a Dynamic Frequency (DF) Loss to reinforce fidelity, guiding the model to focus on different frequency components across diffusion steps. Extensive experiments demonstrate STAR outperforms state-of-the-art methods on both synthetic and real-world datasets.', 'score': 36, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '13ac412646c508f5', 'authors': ['Rui Xie', 'Yinhong Liu', 'Penghao Zhou', 'Chen Zhao', 'Jun Zhou', 'Kai Zhang', 'Zhenyu Zhang', 'Jian Yang', 'Zhenheng Yang', 'Ying Tai'], 'affiliations': ['ByteDance', 'Nanjing University', 'Southwest University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02976.jpg', 'data': {'categories': ['#cv', '#optimization', '#diffusion', '#multimodal', '#video'], 'emoji': '🎥', 'ru': {'title': 'Качественное суперразрешение видео с помощью T2V моделей', 'desc': 'Представлена новая методика STAR для суперразрешения видео в реальных условиях с использованием моделей text-to-video. Предложен модуль LIEM для улучшения локальных деталей и устранения артефактов деградации. Введена функция потерь Dynamic Frequency для усиления точности восстановления на разных частотах. Эксперименты показывают превосходство STAR над современными методами на синтетических и реальных датасетах.'}, 'en': {'title': 'Enhancing Video Quality with T2V Models for Real-World Super-Resolution', 'desc': 'This paper presents a new method called Spatial-Temporal Augmentation with T2V models for Real-world video super-resolution, which aims to improve video quality by addressing issues of over-smoothing and temporal consistency. Traditional image diffusion models struggle with video because they are designed for static images, leading to challenges in capturing motion dynamics. The proposed approach incorporates a Local Information Enhancement Module to enhance local details and reduce artifacts, along with a Dynamic Frequency Loss to maintain fidelity across different frequency components. 
Experimental results show that this method outperforms existing techniques in both synthetic and real-world scenarios, providing better spatial and temporal quality in restored videos.'}, 'zh': {'title': '提升视频超分辨率的时空一致性', 'desc': '本文提出了一种新方法,名为STAR,用于提高真实世界视频超分辨率的时空质量。该方法结合了文本到视频(T2V)模型,以解决传统生成对抗网络(GAN)方法中的过平滑问题。通过引入局部信息增强模块(LIEM)和动态频率损失(DF Loss),该方法能够有效改善视频的局部细节和时间一致性。实验结果表明,STAR在合成和真实世界数据集上均优于现有的最先进方法。'}}}, {'id': 'https://huggingface.co/papers/2501.03226', 'title': 'BoostStep: Boosting mathematical capability of Large Language Models via improved single-step reasoning', 'url': 'https://huggingface.co/papers/2501.03226', 'abstract': "Cutting-edge large language models (LLMs) demonstrate promising performance in solving complex math problems with a divide-and-conquer pipeline and the assistance of in-context learning (ICL) examples. However, their potential for improvement is limited by two critical problems within their ICL examples: granularity-mismatch and the ensuing negative-effect noise problem. Specifically, the LLMs are capable of the dividing process yet mostly failed by inaccurate reasoning within a few conquer steps, while the ICL examples retrieved in question-grained sometimes lack relevant steps for a specific challenging reasoning step. Further, this disconnect may hinder the correct reasoning due to its irrelevance. To this end, we focus on improving the reasoning quality within each step and present BoostStep. BoostStep aligns the granularity between the retrieving and reasoning on step grained, and provides highly related ICL examples for each reasoning step with a novel 'first-try' strategy. BoostStep provides more relevant examples than the coarse question-grained strategy, enhancing the model reasoning quality within each step steadily. BoostStep is a general and robust reasoning-enhancing method that not only improves standalone reasoning performance but also integrates seamlessly with Monte Carlo Tree Search methods (MCTS) to refine both candidate generation and decision-making. Quantitatively, it improves GPT-4o and Qwen2.5-Math-72B by 3.6% and 2.0% respectively on various mathematical benchmarks, and 7.5% gain combined with MCTS.", 'score': 21, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '94a01c7d4516c725', 'authors': ['Beichen Zhang', 'Yuhong Liu', 'Xiaoyi Dong', 'Yuhang Zang', 'Pan Zhang', 'Haodong Duan', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03226.jpg', 'data': {'categories': ['#training', '#optimization', '#math', '#reasoning'], 'emoji': '🧮', 'ru': {'title': 'BoostStep: Повышение точности рассуждений ИИ в решении математических задач', 'desc': 'Статья представляет метод BoostStep для улучшения решения сложных математических задач большими языковыми моделями. BoostStep решает проблемы несоответствия детализации и негативного шума в примерах обучения в контексте. Метод выравнивает гранулярность между извлечением и рассуждением на уровне шагов, предоставляя релевантные примеры для каждого шага рассуждения. 
BoostStep повышает качество рассуждений модели и может интегрироваться с методами поиска по дереву Монте-Карло для улучшения генерации кандидатов и принятия решений.'}, 'en': {'title': 'Boosting Reasoning Quality in Large Language Models with BoostStep', 'desc': "This paper introduces BoostStep, a method designed to enhance the reasoning quality of large language models (LLMs) when solving complex math problems. It addresses two main issues: granularity-mismatch and negative-effect noise in in-context learning (ICL) examples, which can lead to inaccurate reasoning. By aligning the granularity of retrieved examples with the specific reasoning steps required, BoostStep provides more relevant ICL examples, improving the model's performance. The method not only boosts standalone reasoning but also integrates effectively with Monte Carlo Tree Search (MCTS) to enhance decision-making processes."}, 'zh': {'title': '提升推理质量的BoostStep方法', 'desc': '这篇论文探讨了大型语言模型(LLMs)在解决复杂数学问题时的表现,特别是通过分而治之的策略和上下文学习(ICL)示例的辅助。研究发现,ICL示例中的粒度不匹配和负面噪声问题限制了模型的改进潜力。为了解决这些问题,论文提出了BoostStep方法,它通过对每个推理步骤的粒度进行对齐,提供更相关的ICL示例,从而提高推理质量。BoostStep不仅提升了独立推理的性能,还能与蒙特卡洛树搜索(MCTS)方法无缝集成,进一步优化候选生成和决策过程。'}}}, {'id': 'https://huggingface.co/papers/2501.03218', 'title': 'Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction', 'url': 'https://huggingface.co/papers/2501.03218', 'abstract': 'Active Real-time interaction with video LLMs introduces a new paradigm for human-computer interaction, where the model not only understands user intent but also responds while continuously processing streaming video on the fly. Unlike offline video LLMs, which analyze the entire video before answering questions, active real-time interaction requires three capabilities: 1) Perception: real-time video monitoring and interaction capturing. 2) Decision: raising proactive interaction in proper situations, 3) Reaction: continuous interaction with users. However, inherent conflicts exist among the desired capabilities. The Decision and Reaction require a contrary Perception scale and grain, and the autoregressive decoding blocks the real-time Perception and Decision during the Reaction. To unify the conflicted capabilities within a harmonious system, we present Dispider, a system that disentangles Perception, Decision, and Reaction. Dispider features a lightweight proactive streaming video processing module that tracks the video stream and identifies optimal moments for interaction. Once the interaction is triggered, an asynchronous interaction module provides detailed responses, while the processing module continues to monitor the video in the meantime. Our disentangled and asynchronous design ensures timely, contextually accurate, and computationally efficient responses, making Dispider ideal for active real-time interaction for long-duration video streams. Experiments show that Dispider not only maintains strong performance in conventional video QA tasks, but also significantly surpasses previous online models in streaming scenario responses, thereby validating the effectiveness of our architecture. 
The code and model are released at https://github.com/Mark12Ding/Dispider.', 'score': 20, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '1e9974be2d206516', 'authors': ['Rui Qian', 'Shuangrui Ding', 'Xiaoyi Dong', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.03218.jpg', 'data': {'categories': ['#long_context', '#video', '#optimization', '#architecture', '#interpretability'], 'emoji': '🎥', 'ru': {'title': 'Dispider: Интеллектуальное взаимодействие с видео в реальном времени', 'desc': 'Статья представляет систему Dispider для активного взаимодействия с видео в реальном времени с использованием языковых моделей. Система разделяет процессы восприятия, принятия решений и реакции, что позволяет эффективно обрабатывать потоковое видео и взаимодействовать с пользователем. Dispider использует легковесный модуль обработки видео для отслеживания потока и определения оптимальных моментов для взаимодействия. Асинхронная архитектура обеспечивает своевременные и точные ответы при длительной обработке видеопотоков.'}, 'en': {'title': 'Dispider: Real-time Interaction Redefined for Video LLMs', 'desc': 'This paper introduces Dispider, a system designed for active real-time interaction with video using large language models (LLMs). Unlike traditional offline models, Dispider can process video streams continuously while engaging with users, requiring three key capabilities: Perception, Decision, and Reaction. The system addresses conflicts between these capabilities by disentangling them, allowing for efficient monitoring and interaction without lag. Experimental results demonstrate that Dispider outperforms previous models in streaming scenarios, providing timely and contextually relevant responses during long-duration video interactions.'}, 'zh': {'title': '主动实时交互的新范式', 'desc': '本论文介绍了一种名为Dispider的系统,旨在实现视频大语言模型的主动实时交互。该系统通过分离感知、决策和反应三个能力,解决了实时交互中的固有冲突。Dispider具备轻量级的流媒体处理模块,能够实时监控视频流并识别最佳交互时机。实验结果表明,Dispider在传统视频问答任务中表现优异,并在流媒体场景响应上显著超越了之前的在线模型。'}}}, {'id': 'https://huggingface.co/papers/2501.02157', 'title': 'Personalized Graph-Based Retrieval for Large Language Models', 'url': 'https://huggingface.co/papers/2501.02157', 'abstract': 'As large language models (LLMs) evolve, their ability to deliver personalized and context-aware responses offers transformative potential for improving user experiences. Existing personalization approaches, however, often rely solely on user history to augment the prompt, limiting their effectiveness in generating tailored outputs, especially in cold-start scenarios with sparse data. To address these limitations, we propose Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG), a framework that leverages user-centric knowledge graphs to enrich personalization. By directly integrating structured user knowledge into the retrieval process and augmenting prompts with user-relevant context, PGraphRAG enhances contextual understanding and output quality. We also introduce the Personalized Graph-based Benchmark for Text Generation, designed to evaluate personalized text generation tasks in real-world settings where user history is sparse or unavailable. 
Experimental results show that PGraphRAG significantly outperforms state-of-the-art personalization methods across diverse tasks, demonstrating the unique advantages of graph-based retrieval for personalization.', 'score': 16, 'issue_id': 1527, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '65e3736cfc1e3295', 'authors': ['Steven Au', 'Cameron J. Dimacali', 'Ojasmitha Pedirappagari', 'Namyong Park', 'Franck Dernoncourt', 'Yu Wang', 'Nikos Kanakaris', 'Hanieh Deilamsalehy', 'Ryan A. Rossi', 'Nesreen K. Ahmed'], 'affiliations': ['Adobe Research', 'Cisco AI Research', 'Meta AI', 'University of California Santa Cruz', 'University of Oregon', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02157.jpg', 'data': {'categories': ['#rag', '#optimization', '#graphs', '#multimodal', '#benchmark', '#games'], 'emoji': '🕸️', 'ru': {'title': 'Графы знаний на службе персонализации языковых моделей', 'desc': 'Статья представляет новый подход к персонализации ответов больших языковых моделей (LLM) под названием PGraphRAG. В отличие от существующих методов, полагающихся на историю пользователя, PGraphRAG использует ориентированные на пользователя графы знаний для обогащения контекста. Этот метод улучшает понимание контекста и качество генерируемых ответов, особенно в сценариях с ограниченными данными о пользователе. Экспериментальные результаты показывают, что PGraphRAG превосходит современные методы персонализации в различных задачах.'}, 'en': {'title': 'Revolutionizing Personalization with Graph-based Retrieval', 'desc': "This paper introduces a new framework called Personalized Graph-based Retrieval-Augmented Generation (PGraphRAG) that enhances the personalization of large language models (LLMs). Unlike traditional methods that depend only on user history, PGraphRAG utilizes user-centric knowledge graphs to provide richer context for generating responses. By integrating structured user information into the retrieval process, it improves the model's understanding and the quality of its outputs, especially in situations where user data is limited. The authors also present a benchmark for evaluating personalized text generation, showing that PGraphRAG outperforms existing methods in various tasks."}, 'zh': {'title': '个性化图谱提升生成质量', 'desc': '随着大型语言模型的发展,它们在提供个性化和上下文感知的响应方面展现出巨大的潜力。现有的个性化方法通常仅依赖用户历史数据来增强提示,这在数据稀疏的冷启动场景中效果有限。为了解决这些问题,我们提出了个性化图谱检索增强生成(PGraphRAG)框架,利用以用户为中心的知识图谱来丰富个性化。实验结果表明,PGraphRAG在多种任务中显著优于现有的个性化方法,展示了基于图谱的检索在个性化中的独特优势。'}}}, {'id': 'https://huggingface.co/papers/2501.02497', 'title': 'Test-time Computing: from System-1 Thinking to System-2 Thinking', 'url': 'https://huggingface.co/papers/2501.02497', 'abstract': "The remarkable performance of the o1 model in complex reasoning demonstrates that test-time computing scaling can further unlock the model's potential, enabling powerful System-2 thinking. However, there is still a lack of comprehensive surveys for test-time computing scaling. We trace the concept of test-time computing back to System-1 models. In System-1 models, test-time computing addresses distribution shifts and improves robustness and generalization through parameter updating, input modification, representation editing, and output calibration. In System-2 models, it enhances the model's reasoning ability to solve complex problems through repeated sampling, self-correction, and tree search. 
We organize this survey according to the trend of System-1 to System-2 thinking, highlighting the key role of test-time computing in the transition from System-1 models to weak System-2 models, and then to strong System-2 models. We also point out a few possible future directions.", 'score': 15, 'issue_id': 1528, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': '7d9414c60fe7701d', 'authors': ['Yixin Ji', 'Juntao Li', 'Hai Ye', 'Kaixin Wu', 'Jia Xu', 'Linjian Mo', 'Min Zhang'], 'affiliations': ['Ant Group', 'Department of Computer Science, National University of Singapore', 'School of Computer Science and Technology, Soochow University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02497.jpg', 'data': {'categories': ['#reasoning', '#math', '#survey', '#training'], 'emoji': '🧠', 'ru': {'title': 'Масштабирование вычислений: путь к мышлению System-2', 'desc': 'Эта статья рассматривает масштабирование вычислений во время тестирования для улучшения производительности моделей машинного обучения. Авторы прослеживают эволюцию этой концепции от моделей System-1 до моделей System-2. В работе описываются различные методы, такие как обновление параметров, модификация входных данных и древовидный поиск. Исследование подчеркивает ключевую роль вычислений во время тестирования в переходе от моделей System-1 к сильным моделям System-2.'}, 'en': {'title': 'Unlocking Model Potential: The Power of Test-Time Computing', 'desc': 'This paper explores the concept of test-time computing scaling and its impact on machine learning models, particularly in enhancing reasoning capabilities. It distinguishes between System-1 models, which focus on improving robustness and generalization through techniques like parameter updating and output calibration, and System-2 models, which utilize methods such as repeated sampling and self-correction for complex problem-solving. The authors trace the evolution from System-1 to System-2 thinking, emphasizing how test-time computing plays a crucial role in this transition. Additionally, the paper identifies potential future research directions in this area.'}, 'zh': {'title': '测试时计算:从系统-1到强系统-2的关键转变', 'desc': '这篇论文探讨了测试时计算扩展对机器学习模型的影响,特别是在复杂推理中的应用。作者指出,测试时计算可以通过参数更新、输入修改、表示编辑和输出校准来提高模型的鲁棒性和泛化能力。对于系统-2模型,测试时计算通过重复采样、自我修正和树搜索来增强模型的推理能力。论文还强调了测试时计算在从系统-1模型向弱系统-2模型再到强系统-2模型转变中的关键作用,并提出了一些未来的研究方向。'}}}, {'id': 'https://huggingface.co/papers/2501.02045', 'title': 'METAGENE-1: Metagenomic Foundation Model for Pandemic Monitoring', 'url': 'https://huggingface.co/papers/2501.02045', 'abstract': 'We pretrain METAGENE-1, a 7-billion-parameter autoregressive transformer model, which we refer to as a metagenomic foundation model, on a novel corpus of diverse metagenomic DNA and RNA sequences comprising over 1.5 trillion base pairs. This dataset is sourced from a large collection of human wastewater samples, processed and sequenced using deep metagenomic (next-generation) sequencing methods. Unlike genomic models that focus on individual genomes or curated sets of specific species, the aim of METAGENE-1 is to capture the full distribution of genomic information present within this wastewater, to aid in tasks relevant to pandemic monitoring and pathogen detection. We carry out byte-pair encoding (BPE) tokenization on our dataset, tailored for metagenomic sequences, and then pretrain our model. 
In this paper, we first detail the pretraining dataset, tokenization strategy, and model architecture, highlighting the considerations and design choices that enable the effective modeling of metagenomic data. We then show results of pretraining this model on our metagenomic dataset, providing details about our losses, system metrics, and training stability over the course of pretraining. Finally, we demonstrate the performance of METAGENE-1, which achieves state-of-the-art results on a set of genomic benchmarks and new evaluations focused on human-pathogen detection and genomic sequence embedding, showcasing its potential for public health applications in pandemic monitoring, biosurveillance, and early detection of emerging health threats.', 'score': 12, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '60a3568f555ed60f', 'authors': ['Ollie Liu', 'Sami Jaghouar', 'Johannes Hagemann', 'Shangshang Wang', 'Jason Wiemels', 'Jeff Kaufman', 'Willie Neiswanger'], 'affiliations': ['Nucleic Acid Observatory', 'Prime Intellect', 'University of Southern California'], 'pdf_title_img': 'assets/pdf/title_img/2501.02045.jpg', 'data': {'categories': ['#benchmark', '#data', '#training', '#architecture', '#science', '#dataset', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'METAGENE-1: Метагеномная модель для мониторинга здоровья населения', 'desc': 'METAGENE-1 - это автореграссивная трансформерная модель с 7 миллиардами параметров, обученная на разнообразных метагеномных последовательностях ДНК и РНК. Модель создана для анализа геномной информации из образцов сточных вод с целью мониторинга пандемий и обнаружения патогенов. Авторы описывают процесс предобучения, включая токенизацию и архитектуру модели, а также демонстрируют результаты на различных геномных задачах. METAGENE-1 показывает высокую эффективность в обнаружении патогенов человека и встраивании геномных последовательностей, что открывает перспективы для применения в общественном здравоохранении.'}, 'en': {'title': 'Unlocking Metagenomics: METAGENE-1 for Pandemic Preparedness', 'desc': 'The paper introduces METAGENE-1, a large autoregressive transformer model designed for metagenomic data analysis. It is pretrained on a vast dataset of metagenomic DNA and RNA sequences derived from human wastewater, totaling over 1.5 trillion base pairs. The model aims to enhance pandemic monitoring and pathogen detection by capturing the diverse genomic information present in wastewater samples. The authors detail their tokenization strategy and model architecture, demonstrating that METAGENE-1 achieves state-of-the-art performance in genomic benchmarks and applications related to public health.'}, 'zh': {'title': 'METAGENE-1:元基因组基础模型助力公共卫生监测', 'desc': '我们预训练了METAGENE-1,这是一个拥有70亿参数的自回归变换器模型,称为元基因组基础模型。该模型在一个包含超过1.5万亿碱基对的多样化元基因组DNA和RNA序列的新数据集上进行训练,这些数据来自大量人类废水样本。METAGENE-1的目标是捕捉废水中存在的基因组信息的完整分布,以帮助进行疫情监测和病原体检测。我们展示了该模型在元基因组数据集上的预训练结果,证明其在公共卫生应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.02690', 'title': 'GS-DiT: Advancing Video Generation with Pseudo 4D Gaussian Fields through Efficient Dense 3D Point Tracking', 'url': 'https://huggingface.co/papers/2501.02690', 'abstract': '4D video control is essential in video generation as it enables the use of sophisticated lens techniques, such as multi-camera shooting and dolly zoom, which are currently unsupported by existing methods. 
Training a video Diffusion Transformer (DiT) directly to control 4D content requires expensive multi-view videos. Inspired by Monocular Dynamic novel View Synthesis (MDVS) that optimizes a 4D representation and renders videos according to different 4D elements, such as camera pose and object motion editing, we bring pseudo 4D Gaussian fields to video generation. Specifically, we propose a novel framework that constructs a pseudo 4D Gaussian field with dense 3D point tracking and renders the Gaussian field for all video frames. Then we finetune a pretrained DiT to generate videos following the guidance of the rendered video, dubbed as GS-DiT. To boost the training of the GS-DiT, we also propose an efficient Dense 3D Point Tracking (D3D-PT) method for the pseudo 4D Gaussian field construction. Our D3D-PT outperforms SpatialTracker, the state-of-the-art sparse 3D point tracking method, in accuracy and accelerates the inference speed by two orders of magnitude. During the inference stage, GS-DiT can generate videos with the same dynamic content while adhering to different camera parameters, addressing a significant limitation of current video generation models. GS-DiT demonstrates strong generalization capabilities and extends the 4D controllability of Gaussian splatting to video generation beyond just camera poses. It supports advanced cinematic effects through the manipulation of the Gaussian field and camera intrinsics, making it a powerful tool for creative video production. Demos are available at https://wkbian.github.io/Projects/GS-DiT/.', 'score': 11, 'issue_id': 1530, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'b4c147a2637166a8', 'authors': ['Weikang Bian', 'Zhaoyang Huang', 'Xiaoyu Shi', 'Yijin Li', 'Fu-Yun Wang', 'Hongsheng Li'], 'affiliations': ['Avolution AI', 'Centre for Perceptual and Interactive Intelligence', 'Multimedia Laboratory, The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.02690.jpg', 'data': {'categories': ['#video', '#games', '#diffusion', '#3d'], 'emoji': '🎥', 'ru': {'title': 'Революция в генерации видео: 4D-контроль с помощью гауссовых полей', 'desc': 'Эта статья представляет инновационный подход к генерации видео с 4D-контролем, используя псевдо-4D гауссовы поля и модель Diffusion Transformer (DiT). Авторы предлагают метод Dense 3D Point Tracking (D3D-PT) для эффективного построения гауссовых полей, превосходящий существующие решения по точности и скорости. Разработанная система GS-DiT позволяет генерировать видео с одинаковым динамическим содержанием, но с разными параметрами камеры, что открывает новые возможности для создания кинематографических эффектов. Метод демонстрирует сильные обобщающие способности и расширяет возможности 4D-контроля в генерации видео.'}, 'en': {'title': 'Revolutionizing Video Generation with 4D Control', 'desc': 'This paper introduces a new method for generating videos that can be controlled in four dimensions (4D), which includes both camera movement and object motion. The authors propose a framework called GS-DiT that utilizes pseudo 4D Gaussian fields to enhance video generation, allowing for advanced cinematic effects. They also present a Dense 3D Point Tracking (D3D-PT) technique that improves the accuracy and speed of tracking 3D points compared to existing methods. 
Overall, GS-DiT enables the creation of dynamic videos with flexible camera parameters, significantly advancing the capabilities of video generation models.'}, 'zh': {'title': '伪4D高斯场:视频生成的新突破', 'desc': '本论文提出了一种新颖的框架,利用伪4D高斯场进行视频生成,以支持复杂的镜头技术。我们通过密集的3D点跟踪构建伪4D高斯场,并为所有视频帧渲染该高斯场。为了提升GS-DiT的训练效果,我们还提出了一种高效的密集3D点跟踪方法,显著提高了准确性和推理速度。GS-DiT能够在不同的相机参数下生成具有相同动态内容的视频,扩展了视频生成的4D可控性,成为创意视频制作的强大工具。'}}}, {'id': 'https://huggingface.co/papers/2501.03059', 'title': 'Through-The-Mask: Mask-based Motion Trajectories for Image-to-Video Generation', 'url': 'https://huggingface.co/papers/2501.03059', 'abstract': "We consider the task of Image-to-Video (I2V) generation, which involves transforming static images into realistic video sequences based on a textual description. While recent advancements produce photorealistic outputs, they frequently struggle to create videos with accurate and consistent object motion, especially in multi-object scenarios. To address these limitations, we propose a two-stage compositional framework that decomposes I2V generation into: (i) An explicit intermediate representation generation stage, followed by (ii) A video generation stage that is conditioned on this representation. Our key innovation is the introduction of a mask-based motion trajectory as an intermediate representation, that captures both semantic object information and motion, enabling an expressive but compact representation of motion and semantics. To incorporate the learned representation in the second stage, we utilize object-level attention objectives. Specifically, we consider a spatial, per-object, masked-cross attention objective, integrating object-specific prompts into corresponding latent space regions and a masked spatio-temporal self-attention objective, ensuring frame-to-frame consistency for each object. We evaluate our method on challenging benchmarks with multi-object and high-motion scenarios and empirically demonstrate that the proposed method achieves state-of-the-art results in temporal coherence, motion realism, and text-prompt faithfulness. Additionally, we introduce \\benchmark, a new challenging benchmark for single-object and multi-object I2V generation, and demonstrate our method's superiority on this benchmark. Project page is available at https://guyyariv.github.io/TTM/.", 'score': 10, 'issue_id': 1532, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': '4f24667b663efb7d', 'authors': ['Guy Yariv', 'Yuval Kirstain', 'Amit Zohar', 'Shelly Sheynin', 'Yaniv Taigman', 'Yossi Adi', 'Sagie Benaim', 'Adam Polyak'], 'affiliations': ['FAIR, Meta', 'GenAI, Meta', 'The Hebrew University of Jerusalem'], 'pdf_title_img': 'assets/pdf/title_img/2501.03059.jpg', 'data': {'categories': ['#video', '#multimodal', '#benchmark'], 'emoji': '🎬', 'ru': {'title': 'Генерация реалистичных видео из статичных изображений с помощью масок траекторий движения', 'desc': 'Статья представляет новый подход к генерации видео из изображений (I2V) на основе текстового описания. Авторы предлагают двухэтапную композиционную модель, которая сначала генерирует промежуточное представление в виде маски траектории движения объектов. Затем это представление используется для генерации видео с применением объектно-ориентированных целевых функций внимания. 
Эксперименты показывают, что предложенный метод достигает лучших результатов по временной согласованности, реалистичности движения и соответствию текстовому описанию.'}, 'en': {'title': 'Transforming Images into Realistic Videos with Motion Precision', 'desc': 'This paper addresses the challenge of generating videos from static images using textual descriptions, known as Image-to-Video (I2V) generation. The authors propose a two-stage framework that first creates an intermediate representation to capture object semantics and motion, followed by a video generation stage that utilizes this representation. A key innovation is the use of a mask-based motion trajectory, which helps maintain accurate object motion and consistency across frames. The method is evaluated against challenging benchmarks and shows superior performance in terms of motion realism and coherence, while also introducing a new benchmark for I2V generation.'}, 'zh': {'title': '图像到视频生成的新突破', 'desc': '本文探讨了图像到视频(I2V)生成的任务,即根据文本描述将静态图像转换为逼真的视频序列。尽管近期的进展能够生成照片级真实感的输出,但在多物体场景中,视频的物体运动准确性和一致性仍然存在挑战。为了解决这些问题,我们提出了一种两阶段的组合框架,首先生成明确的中间表示,然后基于该表示生成视频。我们的创新在于引入了一种基于掩码的运动轨迹作为中间表示,能够捕捉语义物体信息和运动,从而实现运动和语义的紧凑而富有表现力的表示。'}}}, {'id': 'https://huggingface.co/papers/2501.03006', 'title': 'TransPixar: Advancing Text-to-Video Generation with Transparency', 'url': 'https://huggingface.co/papers/2501.03006', 'abstract': 'Text-to-video generative models have made significant strides, enabling diverse applications in entertainment, advertising, and education. However, generating RGBA video, which includes alpha channels for transparency, remains a challenge due to limited datasets and the difficulty of adapting existing models. Alpha channels are crucial for visual effects (VFX), allowing transparent elements like smoke and reflections to blend seamlessly into scenes. We introduce TransPixar, a method to extend pretrained video models for RGBA generation while retaining the original RGB capabilities. TransPixar leverages a diffusion transformer (DiT) architecture, incorporating alpha-specific tokens and using LoRA-based fine-tuning to jointly generate RGB and alpha channels with high consistency. By optimizing attention mechanisms, TransPixar preserves the strengths of the original RGB model and achieves strong alignment between RGB and alpha channels despite limited training data. Our approach effectively generates diverse and consistent RGBA videos, advancing the possibilities for VFX and interactive content creation.', 'score': 8, 'issue_id': 1527, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'e85e5fa9a03d5d04', 'authors': ['Luozhou Wang', 'Yijun Li', 'Zhifei Chen', 'Jui-Hsien Wang', 'Zhifei Zhang', 'He Zhang', 'Zhe Lin', 'Yingcong Chen'], 'affiliations': ['Adobe Research', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.03006.jpg', 'data': {'categories': ['#optimization', '#architecture', '#training', '#diffusion', '#video'], 'emoji': '🎬', 'ru': {'title': 'TransPixar: Прорыв в генерации RGBA-видео для визуальных эффектов', 'desc': 'TransPixar - это новый метод генерации RGBA-видео, расширяющий возможности предобученных видеомоделей. Он использует архитектуру диффузионного трансформера (DiT) и токены, специфичные для альфа-канала, для совместной генерации RGB и альфа-каналов с высокой согласованностью. Метод применяет тонкую настройку на основе LoRA и оптимизирует механизмы внимания для сохранения сильных сторон исходной RGB-модели. 
TransPixar эффективно генерирует разнообразные и согласованные RGBA-видео, открывая новые возможности для создания визуальных эффектов и интерактивного контента.'}, 'en': {'title': 'TransPixar: Bridging RGB and Alpha for Enhanced Video Generation', 'desc': 'This paper presents TransPixar, a novel method for generating RGBA videos, which include transparency information crucial for visual effects. The challenge lies in the limited datasets and the need to adapt existing models to handle alpha channels effectively. TransPixar utilizes a diffusion transformer architecture and incorporates alpha-specific tokens, allowing it to generate both RGB and alpha channels simultaneously. By optimizing attention mechanisms and employing LoRA-based fine-tuning, TransPixar achieves high consistency between RGB and alpha outputs, enhancing the quality of video generation for applications in VFX and interactive media.'}, 'zh': {'title': 'TransPixar:生成高质量RGBA视频的新方法', 'desc': '本文介绍了一种名为TransPixar的方法,旨在生成包含透明通道的RGBA视频。传统的视频生成模型在处理透明效果时面临挑战,TransPixar通过扩展预训练模型来解决这一问题。该方法利用扩散变换器架构,结合特定的透明通道标记,并通过LoRA微调实现RGB和透明通道的高一致性生成。最终,TransPixar在有限的数据集上优化了注意力机制,成功生成多样且一致的RGBA视频,推动了视觉特效和互动内容创作的可能性。'}}}, {'id': 'https://huggingface.co/papers/2501.01790', 'title': 'Ingredients: Blending Custom Photos with Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.01790', 'abstract': 'This paper presents a powerful framework to customize video creations by incorporating multiple specific identity (ID) photos, with video diffusion Transformers, referred to as Ingredients. Generally, our method consists of three primary modules: (i) a facial extractor that captures versatile and precise facial features for each human ID from both global and local perspectives; (ii) a multi-scale projector that maps face embeddings into the contextual space of image query in video diffusion transformers; (iii) an ID router that dynamically combines and allocates multiple ID embedding to the corresponding space-time regions. Leveraging a meticulously curated text-video dataset and a multi-stage training protocol, Ingredients demonstrates superior performance in turning custom photos into dynamic and personalized video content. Qualitative evaluations highlight the advantages of proposed method, positioning it as a significant advancement toward more effective generative video control tools in Transformer-based architecture, compared to existing methods. The data, code, and model weights are publicly available at: https://github.com/feizc/Ingredients.', 'score': 6, 'issue_id': 1528, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': 'dd1ccebdd2fcf276', 'authors': ['Zhengcong Fei', 'Debang Li', 'Di Qiu', 'Changqian Yu', 'Mingyuan Fan'], 'affiliations': ['Kunlun Inc. Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01790.jpg', 'data': {'categories': ['#open_source', '#training', '#architecture', '#video', '#dataset', '#diffusion', '#multimodal'], 'emoji': '🎬', 'ru': {'title': 'Персонализированное видео из фотографий: новый уровень контроля в генеративных моделях', 'desc': 'Статья представляет новый метод под названием Ingredients для создания персонализированных видео с использованием нескольких фотографий конкретных людей. Метод состоит из трех основных модулей: экстрактора лицевых признаков, многомасштабного проектора и маршрутизатора идентификаторов. 
Ingredients использует тщательно подобранный набор данных текст-видео и многоэтапный протокол обучения для достижения превосходных результатов. Качественная оценка показывает преимущества предложенного метода по сравнению с существующими подходами в области генеративного контроля видео на основе архитектуры Transformer.'}, 'en': {'title': 'Transforming Photos into Personalized Videos with Ingredients', 'desc': 'This paper introduces a novel framework called Ingredients for creating personalized videos using multiple identity photos. It employs a facial extractor to accurately capture facial features, a multi-scale projector to integrate these features into video diffusion transformers, and an ID router to manage the allocation of identity embeddings across different time and space regions in the video. The framework is trained on a carefully selected text-video dataset, enhancing its ability to generate dynamic video content from custom images. The results show that Ingredients outperforms existing methods, marking a significant step forward in generative video control using Transformer architectures.'}, 'zh': {'title': '个性化视频创作的新突破', 'desc': '本文提出了一种强大的框架,通过结合多个特定身份照片,定制视频创作,称为Ingredients。该方法主要由三个模块组成:面部提取器、多个尺度投影器和身份路由器,分别用于提取面部特征、映射面部嵌入和动态分配身份嵌入。通过精心策划的文本-视频数据集和多阶段训练协议,Ingredients在将自定义照片转化为动态个性化视频内容方面表现出色。定性评估显示,该方法在基于Transformer的架构中,相较于现有方法,显著提升了生成视频控制工具的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.02576', 'title': 'DepthMaster: Taming Diffusion Models for Monocular Depth Estimation', 'url': 'https://huggingface.co/papers/2501.02576', 'abstract': "Monocular depth estimation within the diffusion-denoising paradigm demonstrates impressive generalization ability but suffers from low inference speed. Recent methods adopt a single-step deterministic paradigm to improve inference efficiency while maintaining comparable performance. However, they overlook the gap between generative and discriminative features, leading to suboptimal results. In this work, we propose DepthMaster, a single-step diffusion model designed to adapt generative features for the discriminative depth estimation task. First, to mitigate overfitting to texture details introduced by generative features, we propose a Feature Alignment module, which incorporates high-quality semantic features to enhance the denoising network's representation capability. Second, to address the lack of fine-grained details in the single-step deterministic framework, we propose a Fourier Enhancement module to adaptively balance low-frequency structure and high-frequency details. We adopt a two-stage training strategy to fully leverage the potential of the two modules. In the first stage, we focus on learning the global scene structure with the Feature Alignment module, while in the second stage, we exploit the Fourier Enhancement module to improve the visual quality. Through these efforts, our model achieves state-of-the-art performance in terms of generalization and detail preservation, outperforming other diffusion-based methods across various datasets. 
Our project page can be found at https://indu1ge.github.io/DepthMaster_page.", 'score': 5, 'issue_id': 1536, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'a8429b95ef4eb7b7', 'authors': ['Ziyang Song', 'Zerong Wang', 'Bo Li', 'Hao Zhang', 'Ruijie Zhu', 'Li Liu', 'Peng-Tao Jiang', 'Tianzhu Zhang'], 'affiliations': ['School of Information Science and Technology, University of Science and Technology of China (USTC), Hefei 230026, P.R.China', 'vivo Mobile Communication Co., Ltd., Hangzhou 310030, P.R.China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02576.jpg', 'data': {'categories': ['#optimization', '#training', '#diffusion', '#cv', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'DepthMaster: Однопроходная диффузионная модель для точной оценки глубины с улучшенной генерализацией', 'desc': 'DepthMaster - это однопроходная диффузионная модель для монокулярной оценки глубины. Она использует модуль выравнивания признаков для улучшения представления семантических особенностей и модуль улучшения Фурье для балансировки низкочастотной структуры и высокочастотных деталей. Модель обучается в два этапа: сначала фокусируется на глобальной структуре сцены, затем улучшает визуальное качество. DepthMaster превосходит другие диффузионные методы по обобщающей способности и сохранению деталей на различных наборах данных.'}, 'en': {'title': 'DepthMaster: Bridging Generative and Discriminative Depth Estimation', 'desc': 'This paper introduces DepthMaster, a single-step diffusion model aimed at improving monocular depth estimation. It addresses the inefficiencies of previous methods by integrating a Feature Alignment module to enhance the representation of semantic features and reduce overfitting to textures. Additionally, a Fourier Enhancement module is proposed to balance low-frequency structures with high-frequency details, ensuring finer depth estimation. The two-stage training strategy allows the model to first learn global scene structures and then refine visual quality, resulting in state-of-the-art performance across various datasets.'}, 'zh': {'title': 'DepthMaster:提升深度估计的单步扩散模型', 'desc': '本文提出了一种名为DepthMaster的单步扩散模型,用于单目深度估计。该模型通过特征对齐模块和傅里叶增强模块,优化生成特征以适应判别性深度估计任务。特征对齐模块增强了去噪网络的表示能力,而傅里叶增强模块则平衡了低频结构和高频细节。通过两阶段训练策略,DepthMaster在泛化能力和细节保留方面达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.01830', 'title': 'Auto-RT: Automatic Jailbreak Strategy Exploration for Red-Teaming Large Language Models', 'url': 'https://huggingface.co/papers/2501.01830', 'abstract': 'Automated red-teaming has become a crucial approach for uncovering vulnerabilities in large language models (LLMs). However, most existing methods focus on isolated safety flaws, limiting their ability to adapt to dynamic defenses and uncover complex vulnerabilities efficiently. To address this challenge, we propose Auto-RT, a reinforcement learning framework that automatically explores and optimizes complex attack strategies to effectively uncover security vulnerabilities through malicious queries. Specifically, we introduce two key mechanisms to reduce exploration complexity and improve strategy optimization: 1) Early-terminated Exploration, which accelerates exploration by focusing on high-potential attack strategies; and 2) Progressive Reward Tracking algorithm with intermediate downgrade models, which dynamically refines the search trajectory toward successful vulnerability exploitation. 
Extensive experiments across diverse LLMs demonstrate that, by significantly improving exploration efficiency and automatically optimizing attack strategies, Auto-RT detects a broader range of vulnerabilities, achieving a faster detection speed and 16.63\\% higher success rates compared to existing methods.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-03', 'pub_date_card': {'ru': '3 января', 'en': 'January 3', 'zh': '1月3日'}, 'hash': '5b08b81c52ec8da8', 'authors': ['Yanjiang Liu', 'Shuhen Zhou', 'Yaojie Lu', 'Huijia Zhu', 'Weiqiang Wang', 'Hongyu Lin', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Ant Group', 'Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences, Beijing, China', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.01830.jpg', 'data': {'categories': ['#security', '#rl', '#rlhf'], 'emoji': '🛡️', 'ru': {'title': 'Auto-RT: Умная защита больших языковых моделей', 'desc': 'Авторы представляют Auto-RT - фреймворк на основе обучения с подкреплением для автоматизированного поиска уязвимостей в больших языковых моделях (LLM). Система использует механизмы раннего прекращения исследования и прогрессивного отслеживания наград для оптимизации стратегий атак. Auto-RT превосходит существующие методы, обнаруживая более широкий спектр уязвимостей с большей скоростью и на 16.63% более высоким уровнем успеха. Этот подход позволяет эффективно выявлять сложные уязвимости в LLM через вредоносные запросы.'}, 'en': {'title': 'Auto-RT: Revolutionizing Vulnerability Detection in LLMs', 'desc': 'This paper presents Auto-RT, a reinforcement learning framework designed to enhance automated red-teaming for large language models (LLMs). Unlike traditional methods that target isolated safety flaws, Auto-RT efficiently uncovers complex vulnerabilities by optimizing attack strategies through malicious queries. It introduces two innovative mechanisms: Early-terminated Exploration to prioritize promising attack strategies, and Progressive Reward Tracking to refine the search process dynamically. Experimental results show that Auto-RT significantly improves exploration efficiency and detection success rates, outperforming existing approaches.'}, 'zh': {'title': '自动化红队:高效发现语言模型漏洞的利器', 'desc': '自动化红队技术在发现大型语言模型(LLMs)中的漏洞方面变得至关重要。现有方法大多集中于孤立的安全缺陷,限制了其适应动态防御和高效发现复杂漏洞的能力。为了解决这个问题,我们提出了Auto-RT,一个强化学习框架,能够自动探索和优化复杂的攻击策略,通过恶意查询有效发现安全漏洞。我们的实验表明,Auto-RT显著提高了探索效率和攻击策略的自动优化,检测到更广泛的漏洞,检测速度更快,成功率提高了16.63%。'}}}, {'id': 'https://huggingface.co/papers/2501.02506', 'title': 'ToolHop: A Query-Driven Benchmark for Evaluating Large Language Models in Multi-Hop Tool Use', 'url': 'https://huggingface.co/papers/2501.02506', 'abstract': 'Effective evaluation of multi-hop tool use is critical for analyzing the understanding, reasoning, and function-calling capabilities of large language models (LLMs). However, progress has been hindered by a lack of reliable evaluation datasets. To address this, we present ToolHop, a dataset comprising 995 user queries and 3,912 associated tools, specifically designed for rigorous evaluation of multi-hop tool use. ToolHop ensures diverse queries, meaningful interdependencies, locally executable tools, detailed feedback, and verifiable answers through a novel query-driven data construction approach that includes tool creation, document refinement, and code generation. 
We evaluate 14 LLMs across five model families (i.e., LLaMA3.1, Qwen2.5, Gemini1.5, Claude3.5, and GPT), uncovering significant challenges in handling multi-hop tool-use scenarios. The leading model, GPT-4o, achieves an accuracy of 49.04%, underscoring substantial room for improvement. Further analysis reveals variations in tool-use strategies for various families, offering actionable insights to guide the development of more effective approaches. Code and data can be found in https://huggingface.co/bytedance-research/ToolHop.', 'score': 5, 'issue_id': 1529, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'f785173226e5f9fc', 'authors': ['Junjie Ye', 'Zhengyin Du', 'Xuesong Yao', 'Weijian Lin', 'Yufei Xu', 'Zehui Chen', 'Zaiyuan Wang', 'Sining Zhu', 'Zhiheng Xi', 'Siyu Yuan', 'Tao Gui', 'Qi Zhang', 'Xuanjing Huang', 'Jiechao Chen'], 'affiliations': ['ByteDance', 'Institute of Modern Languages and Linguistics, Fudan University', 'School of Computer Science, Fudan University', 'School of Data Science, Fudan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.02506.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#dataset', '#optimization'], 'emoji': '🛠️', 'ru': {'title': 'ToolHop: новый стандарт для оценки многоэтапного использования инструментов в LLM', 'desc': 'Статья представляет новый набор данных ToolHop для оценки многоэтапного использования инструментов большими языковыми моделями (LLM). ToolHop содержит 995 пользовательских запросов и 3912 связанных инструментов, обеспечивая разнообразие запросов, взаимозависимости и возможность локального выполнения. Авторы оценили 14 LLM из пяти семейств моделей, выявив значительные трудности в обработке сценариев многоэтапного использования инструментов. Лучшая модель, GPT-4o, достигла точности 49.04%, что указывает на большой потенциал для улучшения.'}, 'en': {'title': 'ToolHop: Advancing Multi-Hop Tool Use Evaluation for LLMs', 'desc': 'This paper introduces ToolHop, a new dataset designed to evaluate how well large language models (LLMs) can use multiple tools in a single task. It includes 995 user queries and 3,912 tools, focusing on diverse and interdependent queries that can be executed locally. The authors tested 14 different LLMs, revealing that even the best-performing model, GPT-4o, only achieved 49.04% accuracy, indicating significant challenges in multi-hop tool use. The findings highlight different strategies employed by various model families, providing insights for future improvements in LLM capabilities.'}, 'zh': {'title': 'ToolHop:多跳工具使用的有效评估数据集', 'desc': '本文介绍了ToolHop数据集,该数据集包含995个用户查询和3912个相关工具,旨在有效评估大型语言模型(LLMs)在多跳工具使用中的理解、推理和功能调用能力。通过新颖的查询驱动数据构建方法,ToolHop确保了查询的多样性、工具的局部可执行性和可验证的答案。我们对14个不同模型(如LLaMA3.1、Qwen2.5等)进行了评估,发现它们在处理多跳工具使用场景时面临显著挑战。尽管GPT-4o模型的准确率为49.04%,但仍有很大的改进空间,分析还揭示了不同模型家族在工具使用策略上的差异,为未来的研究提供了有价值的见解。'}}}, {'id': 'https://huggingface.co/papers/2501.02423', 'title': 'Scaling Laws for Floating Point Quantization Training', 'url': 'https://huggingface.co/papers/2501.02423', 'abstract': 'Low-precision training is considered an effective strategy for reducing both training and downstream inference costs. Previous scaling laws for precision mainly focus on integer quantization, which pay less attention to the constituents in floating-point quantization and thus cannot well fit the LLM losses in this scenario. 
In contrast, while floating-point quantization training is more commonly implemented in production, the research on it has been relatively superficial. In this paper, we thoroughly explore the effects of floating-point quantization targets, exponent bits, mantissa bits, and the calculation granularity of the scaling factor in floating-point quantization training performance of LLM models. While presenting an accurate floating-point quantization unified scaling law, we also provide valuable suggestions for the community: (1) Exponent bits contribute slightly more to the model performance than mantissa bits. We provide the optimal exponent-mantissa bit ratio for different bit numbers, which is available for future reference by hardware manufacturers; (2) We discover the formation of the critical data size in low-precision LLM training. Too much training data exceeding the critical data size will inversely bring in degradation of LLM performance; (3) The optimal floating-point quantization precision is directly proportional to the computational power, but within a wide computational power range, we estimate that the best cost-performance precision lies between 4-8 bits.', 'score': 4, 'issue_id': 1537, 'pub_date': '2025-01-05', 'pub_date_card': {'ru': '5 января', 'en': 'January 5', 'zh': '1月5日'}, 'hash': 'be6872257cb9a129', 'authors': ['Xingwu Sun', 'Shuaipeng Li', 'Ruobing Xie', 'Weidong Han', 'Kan Wu', 'Zhen Yang', 'Yixing Li', 'An Wang', 'Shuai Li', 'Jinbao Xue', 'Yu Cheng', 'Yangyu Tao', 'Zhanhui Kang', 'Chengzhong Xu', 'Di Wang', 'Jie Jiang'], 'affiliations': ['Tencent Hunyuan', 'The Chinese University of Hong Kong', 'Tokyo Institute of Technology', 'University of Macau'], 'pdf_title_img': 'assets/pdf/title_img/2501.02423.jpg', 'data': {'categories': ['#training', '#optimization', '#inference'], 'emoji': '🧮', 'ru': {'title': 'Оптимизация точности вычислений в обучении языковых моделей', 'desc': 'Статья исследует влияние квантования с плавающей запятой на обучение больших языковых моделей (LLM). Авторы анализируют роль экспоненциальных и мантиссных битов, а также размера обучающих данных в производительности моделей. Они представляют унифицированный закон масштабирования для квантования с плавающей запятой и дают рекомендации по оптимальному соотношению битов и размеру данных. Исследование показывает, что оптимальная точность квантования находится в диапазоне 4-8 бит для широкого спектра вычислительных мощностей.'}, 'en': {'title': 'Optimizing Floating-Point Quantization for Better LLM Performance', 'desc': 'This paper investigates the impact of floating-point quantization on the training performance of large language models (LLMs). It highlights that previous research primarily focused on integer quantization, neglecting the nuances of floating-point quantization. The authors establish a unified scaling law for floating-point quantization and provide insights on the optimal ratio of exponent to mantissa bits, emphasizing that exponent bits have a greater influence on model performance. 
Additionally, they identify a critical data size threshold, beyond which performance may degrade, and suggest that the best precision for cost-performance lies between 4-8 bits, depending on computational power.'}, 'zh': {'title': '低精度训练:优化浮点量化的关键', 'desc': '低精度训练被认为是降低训练和推理成本的有效策略。以往的研究主要集中在整数量化上,而对浮点量化的研究相对较少,导致无法很好地适应大语言模型的损失情况。本文深入探讨了浮点量化训练中目标、指数位、尾数位和缩放因子的计算粒度对大语言模型性能的影响,并提出了统一的浮点量化缩放法则。研究结果表明,指数位对模型性能的贡献略高于尾数位,并发现了低精度训练中的关键数据大小。'}}}, {'id': 'https://huggingface.co/papers/2501.02832', 'title': 'Samba-asr state-of-the-art speech recognition leveraging structured state-space models', 'url': 'https://huggingface.co/papers/2501.02832', 'abstract': 'We propose Samba ASR, the first state-of-the-art Automatic Speech Recognition (ASR) model leveraging the novel Mamba architecture as both encoder and decoder, built on the foundation of state-space models (SSMs). Unlike transformer-based ASR models, which rely on self-attention mechanisms to capture dependencies, Samba ASR effectively models both local and global temporal dependencies using efficient state-space dynamics, achieving remarkable performance gains. By addressing the limitations of transformers, such as quadratic scaling with input length and difficulty in handling long-range dependencies, Samba ASR achieves superior accuracy and efficiency. Experimental results demonstrate that Samba ASR surpasses existing open-source transformer-based ASR models across various standard benchmarks, establishing it as the new state of the art in ASR. Extensive evaluations on benchmark datasets show significant improvements in Word Error Rate (WER), with competitive performance even in low-resource scenarios. Furthermore, the computational efficiency and parameter optimization of the Mamba architecture make Samba ASR a scalable and robust solution for diverse ASR tasks. Our contributions include: A new Samba ASR architecture demonstrating the superiority of SSMs over transformer-based models for speech sequence processing. A comprehensive evaluation on public benchmarks showcasing state-of-the-art performance. An analysis of computational efficiency, robustness to noise, and sequence generalization. This work highlights the viability of Mamba SSMs as a transformer-free alternative for efficient and accurate ASR. By leveraging state-space modeling advancements, Samba ASR sets a new benchmark for ASR performance and future research.', 'score': 4, 'issue_id': 1530, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'ed3c4a6192d0c5f9', 'authors': ['Syed Abdul Gaffar Shakhadri', 'Kruthika KR', 'Kartik Basavaraj Angadi'], 'affiliations': ['SandLogic Technologies Pvt Ltd'], 'pdf_title_img': 'assets/pdf/title_img/2501.02832.jpg', 'data': {'categories': ['#audio', '#architecture', '#benchmark', '#low_resource', '#open_source'], 'emoji': '🎙️', 'ru': {'title': 'Samba ASR: революция в распознавании речи с помощью моделей пространства состояний', 'desc': 'Представлена модель Samba ASR - первая современная система автоматического распознавания речи, использующая архитектуру Mamba в качестве энкодера и декодера на основе моделей пространства состояний (SSM). В отличие от трансформерных моделей, Samba ASR эффективно моделирует локальные и глобальные временные зависимости, достигая значительных улучшений производительности. Экспериментальные результаты показывают, что Samba ASR превосходит существующие модели с открытым исходным кодом на основе трансформеров по различным стандартным показателям. 
Модель демонстрирует значительное снижение показателя Word Error Rate (WER) и высокую эффективность даже при ограниченных ресурсах.'}, 'en': {'title': 'Samba ASR: Redefining Speech Recognition with State-Space Models', 'desc': 'Samba ASR is a groundbreaking Automatic Speech Recognition model that utilizes the innovative Mamba architecture, which functions as both the encoder and decoder. This model departs from traditional transformer-based approaches by employing state-space models (SSMs) to effectively capture both local and global temporal dependencies, leading to enhanced performance. By overcoming the challenges associated with transformers, such as their inefficiency with long input sequences, Samba ASR achieves superior accuracy and efficiency in recognizing speech. Extensive testing shows that Samba ASR not only outperforms existing transformer-based models but also excels in low-resource environments, making it a robust solution for various ASR applications.'}, 'zh': {'title': 'Samba ASR:超越变换器的语音识别新标杆', 'desc': '我们提出了Samba ASR,这是第一个利用新型Mamba架构作为编码器和解码器的最先进自动语音识别(ASR)模型。与基于变换器的ASR模型不同,Samba ASR通过高效的状态空间动态建模局部和全局时间依赖关系,从而实现显著的性能提升。该模型克服了变换器在处理长距离依赖和输入长度的平方扩展等方面的局限性,展现出更高的准确性和效率。实验结果表明,Samba ASR在多个标准基准测试中超越了现有的开源变换器ASR模型,确立了其在ASR领域的新标杆。'}}}, {'id': 'https://huggingface.co/papers/2501.00912', 'title': 'AutoPresent: Designing Structured Visuals from Scratch', 'url': 'https://huggingface.co/papers/2501.00912', 'abstract': "Designing structured visuals such as presentation slides is essential for communicative needs, necessitating both content creation and visual planning skills. In this work, we tackle the challenge of automated slide generation, where models produce slide presentations from natural language (NL) instructions. We first introduce the SlidesBench benchmark, the first benchmark for slide generation with 7k training and 585 testing examples derived from 310 slide decks across 10 domains. SlidesBench supports evaluations that are (i)reference-based to measure similarity to a target slide, and (ii)reference-free to measure the design quality of generated slides alone. We benchmark end-to-end image generation and program generation methods with a variety of models, and find that programmatic methods produce higher-quality slides in user-interactable formats. Built on the success of program generation, we create AutoPresent, an 8B Llama-based model trained on 7k pairs of instructions paired with code for slide generation, and achieve results comparable to the closed-source model GPT-4o. We further explore iterative design refinement where the model is tasked to self-refine its own output, and we found that this process improves the slide's quality. 
We hope that our work will provide a basis for future work on generating structured visuals.", 'score': 3, 'issue_id': 1539, 'pub_date': '2025-01-01', 'pub_date_card': {'ru': '1 января', 'en': 'January 1', 'zh': '1月1日'}, 'hash': 'ea7b88fcc0a2025b', 'authors': ['Jiaxin Ge', 'Zora Zhiruo Wang', 'Xuhui Zhou', 'Yi-Hao Peng', 'Sanjay Subramanian', 'Qinyue Tan', 'Maarten Sap', 'Alane Suhr', 'Daniel Fried', 'Graham Neubig', 'Trevor Darrell'], 'affiliations': ['Carnegie Mellon University', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.00912.jpg', 'data': {'categories': ['#dataset', '#story_generation', '#training', '#benchmark', '#multimodal'], 'emoji': '🎯', 'ru': {'title': 'Автоматизация создания презентаций: от текста к структурированным визуальным материалам', 'desc': 'Эта статья представляет новый бенчмарк SlidesBench для автоматической генерации слайдов презентаций на основе текстовых инструкций. Авторы сравнивают методы генерации изображений и программного кода, обнаружив преимущество последнего подхода. Они создают модель AutoPresent на базе Llama для генерации кода слайдов, достигающую результатов, сопоставимых с GPT-4. Исследователи также изучают итеративное улучшение дизайна слайдов с помощью самооптимизации модели.'}, 'en': {'title': 'Automating Slide Generation with Advanced Models', 'desc': 'This paper addresses the challenge of creating automated slide presentations from natural language instructions. It introduces the SlidesBench benchmark, which includes a large dataset for training and testing slide generation models. The authors evaluate various methods, finding that programmatic approaches yield higher-quality slides. They also present AutoPresent, a model that competes with advanced models like GPT-4o, and demonstrate that iterative design refinement enhances the quality of generated slides.'}, 'zh': {'title': '自动生成高质量演示幻灯片的未来', 'desc': '本研究旨在自动生成演示幻灯片,解决内容创作和视觉规划的挑战。我们首次引入SlidesBench基准,包含7000个训练样本和585个测试样本,涵盖10个领域的310个幻灯片集。通过对比不同模型的图像生成和程序生成方法,我们发现程序生成方法在用户交互格式中生成的幻灯片质量更高。基于程序生成的成功,我们开发了AutoPresent模型,并通过自我优化过程进一步提升幻灯片的质量。'}}}, {'id': 'https://huggingface.co/papers/2501.03225', 'title': 'Automated Generation of Challenging Multiple-Choice Questions for Vision Language Model Evaluation', 'url': 'https://huggingface.co/papers/2501.03225', 'abstract': 'The rapid development of vision language models (VLMs) demands rigorous and reliable evaluation. However, current visual question answering (VQA) benchmarks often depend on open-ended questions, making accurate evaluation difficult due to the variability in natural language responses. To address this, we introduce AutoConverter, an agentic framework that automatically converts these open-ended questions into multiple-choice format, enabling objective evaluation while reducing the costly question creation process. Our experiments demonstrate that AutoConverter can generate correct and challenging multiple-choice questions, with VLMs demonstrating consistently similar or lower accuracy on these questions compared to human-created ones. Using AutoConverter, we construct VMCBench, a benchmark created by transforming 20 existing VQA datasets into a unified multiple-choice format, totaling 9,018 questions. 
We comprehensively evaluate 33 state-of-the-art VLMs on VMCBench, setting a new standard for scalable, consistent, and reproducible VLM evaluation.', 'score': 1, 'issue_id': 1542, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'aa212f5e596ed0e6', 'authors': ['Yuhui Zhang', 'Yuchang Su', 'Yiming Liu', 'Xiaohan Wang', 'James Burgess', 'Elaine Sui', 'Chenyu Wang', 'Josiah Aklilu', 'Alejandro Lozano', 'Anjiang Wei', 'Ludwig Schmidt', 'Serena Yeung-Levy'], 'affiliations': ['MIT', 'Stanford University', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.03225.jpg', 'data': {'categories': ['#interpretability', '#agents', '#benchmark', '#cv', '#survey', '#games', '#optimization'], 'emoji': '🔄', 'ru': {'title': 'Автоматизация оценки моделей машинного зрения и языка', 'desc': 'Исследователи представили AutoConverter - агентную систему для автоматического преобразования открытых вопросов в вопросы с множественным выбором для оценки моделей машинного зрения и языка (VLM). Эта система позволяет объективно оценивать VLM, избегая сложностей, связанных с вариативностью естественно-языковых ответов. На основе AutoConverter был создан бенчмарк VMCBench, включающий 9018 вопросов из 20 существующих наборов данных для визуальных вопросов и ответов (VQA). VMCBench был использован для всесторонней оценки 33 современных VLM, устанавливая новый стандарт масштабируемой и воспроизводимой оценки таких моделей.'}, 'en': {'title': 'Transforming VQA for Objective Evaluation with AutoConverter', 'desc': 'This paper presents AutoConverter, a framework designed to improve the evaluation of vision language models (VLMs) by converting open-ended visual question answering (VQA) questions into a multiple-choice format. This transformation allows for more objective assessments of VLM performance, addressing the challenges posed by the variability of natural language responses. The authors demonstrate that VLMs perform similarly or worse on these newly generated questions compared to those created by humans, indicating the rigor of the new benchmark. Additionally, they introduce VMCBench, a comprehensive dataset that standardizes 20 existing VQA datasets into a unified multiple-choice format, facilitating scalable and reproducible evaluations of VLMs.'}, 'zh': {'title': '自动化评估视觉语言模型的新标准', 'desc': '随着视觉语言模型(VLMs)的快速发展,评估这些模型的准确性变得尤为重要。现有的视觉问答(VQA)基准往往依赖开放式问题,这使得评估变得困难,因为自然语言回答的多样性很大。为了解决这个问题,我们提出了AutoConverter,这是一种自动将开放式问题转换为多项选择格式的框架,从而实现客观评估并减少问题创建的成本。通过使用AutoConverter,我们构建了VMCBench,这是一个将20个现有VQA数据集转化为统一多项选择格式的基准,包含9,018个问题,全面评估了33个最先进的VLMs,设定了可扩展、一致和可重复的VLM评估新标准。'}}}, {'id': 'https://huggingface.co/papers/2501.05874', 'title': 'VideoRAG: Retrieval-Augmented Generation over Video Corpus', 'url': 'https://huggingface.co/papers/2501.05874', 'abstract': 'Retrieval-Augmented Generation (RAG) is a powerful strategy to address the issue of generating factually incorrect outputs in foundation models by retrieving external knowledge relevant to queries and incorporating it into their generation process. However, existing RAG approaches have primarily focused on textual information, with some recent advancements beginning to consider images, and they largely overlook videos, a rich source of multimodal knowledge capable of representing events, processes, and contextual details more effectively than any other modality. 
While a few recent studies explore the integration of videos in the response generation process, they either predefine query-associated videos without retrieving them according to queries, or convert videos into the textual descriptions without harnessing their multimodal richness. To tackle these, we introduce VideoRAG, a novel framework that not only dynamically retrieves relevant videos based on their relevance with queries but also utilizes both visual and textual information of videos in the output generation. Further, to operationalize this, our method revolves around the recent advance of Large Video Language Models (LVLMs), which enable the direct processing of video content to represent it for retrieval and seamless integration of the retrieved videos jointly with queries. We experimentally validate the effectiveness of VideoRAG, showcasing that it is superior to relevant baselines.', 'score': 39, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'a6a86d4d49a42b4d', 'authors': ['Soyeong Jeong', 'Kangsan Kim', 'Jinheon Baek', 'Sung Ju Hwang'], 'affiliations': ['DeepAuto.ai', 'KAIST'], 'pdf_title_img': 'assets/pdf/title_img/2501.05874.jpg', 'data': {'categories': ['#multimodal', '#rag', '#interpretability', '#hallucinations', '#video'], 'emoji': '🎥', 'ru': {'title': 'VideoRAG: Обогащение генерации ответов с помощью видеоконтента', 'desc': 'VideoRAG - это новая система для улучшения генерации ответов с использованием видеоконтента. В отличие от существующих подходов, она динамически извлекает релевантные видео и использует как визуальную, так и текстовую информацию из них. VideoRAG основан на Больших Видеоязыковых Моделях (LVLM), которые позволяют напрямую обрабатывать видеоконтент. Экспериментальные результаты показывают превосходство VideoRAG над существующими методами.'}, 'en': {'title': 'Enhancing Generation with Dynamic Video Retrieval', 'desc': "This paper presents VideoRAG, a new framework that enhances the Retrieval-Augmented Generation (RAG) approach by incorporating video content into the generation process. Unlike previous methods that primarily focused on text or predefined videos, VideoRAG dynamically retrieves relevant videos based on the user's query. It leverages both visual and textual information from the videos, allowing for a richer and more accurate output generation. The framework utilizes Large Video Language Models (LVLMs) to effectively process and integrate video content, demonstrating superior performance compared to existing methods."}, 'zh': {'title': '视频检索增强生成:提升多模态知识的利用', 'desc': '检索增强生成(RAG)是一种强大的策略,用于解决基础模型生成事实不准确输出的问题。现有的RAG方法主要集中在文本信息上,最近的一些进展开始考虑图像,但大多数忽视了视频这一丰富的多模态知识源。我们提出了VideoRAG框架,它不仅根据查询动态检索相关视频,还利用视频的视觉和文本信息进行输出生成。实验结果验证了VideoRAG的有效性,显示其优于相关基线。'}}}, {'id': 'https://huggingface.co/papers/2501.03841', 'title': 'OmniManip: Towards General Robotic Manipulation via Object-Centric Interaction Primitives as Spatial Constraints', 'url': 'https://huggingface.co/papers/2501.03841', 'abstract': "The development of general robotic systems capable of manipulating in unstructured environments is a significant challenge. While Vision-Language Models(VLM) excel in high-level commonsense reasoning, they lack the fine-grained 3D spatial understanding required for precise manipulation tasks. Fine-tuning VLM on robotic datasets to create Vision-Language-Action Models(VLA) is a potential solution, but it is hindered by high data collection costs and generalization issues. 
To address these challenges, we propose a novel object-centric representation that bridges the gap between VLM's high-level reasoning and the low-level precision required for manipulation. Our key insight is that an object's canonical space, defined by its functional affordances, provides a structured and semantically meaningful way to describe interaction primitives, such as points and directions. These primitives act as a bridge, translating VLM's commonsense reasoning into actionable 3D spatial constraints. In this context, we introduce a dual closed-loop, open-vocabulary robotic manipulation system: one loop for high-level planning through primitive resampling, interaction rendering and VLM checking, and another for low-level execution via 6D pose tracking. This design ensures robust, real-time control without requiring VLM fine-tuning. Extensive experiments demonstrate strong zero-shot generalization across diverse robotic manipulation tasks, highlighting the potential of this approach for automating large-scale simulation data generation.", 'score': 37, 'issue_id': 1628, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c2dc8cc20b9b990a', 'authors': ['Mingjie Pan', 'Jiyao Zhang', 'Tianshu Wu', 'Yinghao Zhao', 'Wenlong Gao', 'Hao Dong'], 'affiliations': ['AgiBot', 'CFCS, School of CS, Peking University', 'PKU-AgiBot Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.03841.jpg', 'data': {'categories': ['#agents', '#reasoning', '#robotics', '#3d', '#transfer_learning', '#agi'], 'emoji': '🤖', 'ru': {'title': 'Объектно-ориентированный подход к роботизированной манипуляции с использованием VLM', 'desc': 'Статья представляет новый подход к робототехнике, объединяющий возможности моделей визуального языка (VLM) с точным 3D-пониманием, необходимым для манипуляций. Авторы предлагают объектно-ориентированное представление, использующее каноническое пространство объекта для описания примитивов взаимодействия. Система включает два цикла: планирование высокого уровня с использованием VLM и низкоуровневое выполнение с отслеживанием 6D-позы. Эксперименты показывают сильную обобщающую способность в различных задачах робототехнической манипуляции.'}, 'en': {'title': 'Bridging High-Level Reasoning and Low-Level Manipulation in Robotics', 'desc': "This paper addresses the challenge of enabling robots to manipulate objects in unpredictable environments by enhancing Vision-Language Models (VLM) with a new approach. The authors propose a Vision-Language-Action Model (VLA) that utilizes an object-centric representation, focusing on an object's canonical space defined by its functional affordances. This representation helps translate high-level reasoning from VLM into specific 3D spatial actions needed for manipulation tasks. 
The proposed dual closed-loop system allows for effective planning and execution without the need for extensive fine-tuning, demonstrating strong performance in various robotic tasks."}, 'zh': {'title': '打破高层推理与低层操作的壁垒', 'desc': '本论文探讨了在非结构化环境中操作的通用机器人系统的开发挑战。虽然视觉-语言模型(VLM)在高层次的常识推理方面表现出色,但缺乏精细的三维空间理解能力。我们提出了一种新颖的以对象为中心的表示方法,旨在弥合VLM的高层推理与操作所需的低层精度之间的差距。通过引入双闭环、开放词汇的机器人操作系统,我们实现了高效的实时控制,且无需对VLM进行微调。'}}}, {'id': 'https://huggingface.co/papers/2501.06186', 'title': 'LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs', 'url': 'https://huggingface.co/papers/2501.06186', 'abstract': "Reasoning is a fundamental capability for solving complex multi-step problems, particularly in visual contexts where sequential step-wise understanding is essential. Existing approaches lack a comprehensive framework for evaluating visual reasoning and do not emphasize step-wise problem-solving. To this end, we propose a comprehensive framework for advancing step-by-step visual reasoning in large language models (LMMs) through three key contributions. First, we introduce a visual reasoning benchmark specifically designed to evaluate multi-step reasoning tasks. The benchmark presents a diverse set of challenges with eight different categories ranging from complex visual perception to scientific reasoning with over 4k reasoning steps in total, enabling robust evaluation of LLMs' abilities to perform accurate and interpretable visual reasoning across multiple steps. Second, we propose a novel metric that assesses visual reasoning quality at the granularity of individual steps, emphasizing both correctness and logical coherence. The proposed metric offers deeper insights into reasoning performance compared to traditional end-task accuracy metrics. Third, we present a new multimodal visual reasoning model, named LlamaV-o1, trained using a multi-step curriculum learning approach, where tasks are progressively organized to facilitate incremental skill acquisition and problem-solving. The proposed LlamaV-o1 is designed for multi-step reasoning and learns step-by-step through a structured training paradigm. Extensive experiments show that our LlamaV-o1 outperforms existing open-source models and performs favorably against close-source proprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an average score of 67.3 with an absolute gain of 3.8\\% across six benchmarks while being 5 times faster during inference scaling. Our benchmark, model, and code are publicly available.", 'score': 31, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '40e1a0d2c562cda5', 'authors': ['Omkar Thawakar', 'Dinura Dissanayake', 'Ketan More', 'Ritesh Thawkar', 'Ahmed Heakl', 'Noor Ahsan', 'Yuhao Li', 'Mohammed Zumri', 'Jean Lahoud', 'Rao Muhammad Anwer', 'Hisham Cholakkal', 'Ivan Laptev', 'Mubarak Shah', 'Fahad Shahbaz Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linköping University', 'Mohamed bin Zayed University of AI', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2501.06186.jpg', 'data': {'categories': ['#cv', '#benchmark', '#training', '#multimodal', '#open_source', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к совершенному визуальному рассуждению', 'desc': 'Статья представляет комплексный подход к улучшению пошагового визуального рассуждения в больших языковых моделях (LLM). 
Авторы вводят новый бенчмарк для оценки многошаговых задач визуального рассуждения и метрику для оценки качества рассуждения на уровне отдельных шагов. Они также предлагают новую мультимодальную модель визуального рассуждения LlamaV-o1, обученную с использованием подхода многоступенчатого куррикулярного обучения. Эксперименты показывают, что LlamaV-o1 превосходит существующие модели с открытым исходным кодом и демонстрирует хорошие результаты по сравнению с проприетарными моделями.'}, 'en': {'title': 'Advancing Step-by-Step Visual Reasoning in LLMs', 'desc': "This paper introduces a new framework to enhance visual reasoning in large language models (LLMs) by focusing on step-by-step problem-solving. It presents a visual reasoning benchmark with over 4,000 reasoning steps across eight categories, allowing for thorough evaluation of LLMs' multi-step reasoning capabilities. Additionally, a novel metric is proposed to assess the quality of visual reasoning at each step, providing insights beyond traditional accuracy measures. The authors also introduce LlamaV-o1, a multimodal model trained with a curriculum learning approach, which shows significant performance improvements over existing models."}, 'zh': {'title': '提升视觉推理能力的全新框架', 'desc': '本论文提出了一种新的框架,旨在提升大型语言模型(LLMs)在视觉推理中的逐步推理能力。我们设计了一个视觉推理基准,包含多达4000个推理步骤,涵盖复杂的视觉感知和科学推理等八个类别,以便全面评估模型的推理能力。我们还提出了一种新颖的度量标准,专注于逐步推理的正确性和逻辑一致性,提供比传统的任务准确率更深入的洞察。最后,我们介绍了名为LlamaV-o1的多模态视觉推理模型,通过逐步课程学习的方法进行训练,显著提升了推理性能。'}}}, {'id': 'https://huggingface.co/papers/2501.05510', 'title': 'OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding?', 'url': 'https://huggingface.co/papers/2501.05510', 'abstract': 'Temporal Awareness, the ability to reason dynamically based on the timestamp when a question is raised, is the key distinction between offline and online video LLMs. Unlike offline models, which rely on complete videos for static, post hoc analysis, online models process video streams incrementally and dynamically adapt their responses based on the timestamp at which the question is posed. Despite its significance, temporal awareness has not been adequately evaluated in existing benchmarks. To fill this gap, we present OVO-Bench (Online-VideO-Benchmark), a novel video benchmark that emphasizes the importance of timestamps for advanced online video understanding capability benchmarking. OVO-Bench evaluates the ability of video LLMs to reason and respond to events occurring at specific timestamps under three distinct scenarios: (1) Backward tracing: trace back to past events to answer the question. (2) Real-time understanding: understand and respond to events as they unfold at the current timestamp. (3) Forward active responding: delay the response until sufficient future information becomes available to answer the question accurately. OVO-Bench comprises 12 tasks, featuring 644 unique videos and approximately 2,800 human-curated fine-grained meta-annotations with precise timestamps. We combine automated generation pipelines with human curation. With these high-quality samples, we further developed an evaluation pipeline to systematically query video LLMs along the video timeline. Evaluations of nine Video-LLMs reveal that, despite advancements on traditional benchmarks, current models struggle with online video understanding, showing a significant gap compared to human agents. We hope OVO-Bench will drive progress in video LLMs and inspire future research in online video reasoning. 
Our benchmark and code can be accessed at https://github.com/JoeLeelyf/OVO-Bench.', 'score': 26, 'issue_id': 1631, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '6f833a01519603d5', 'authors': ['Yifei Li', 'Junbo Niu', 'Ziyang Miao', 'Chunjiang Ge', 'Yuanhang Zhou', 'Qihao He', 'Xiaoyi Dong', 'Haodong Duan', 'Shuangrui Ding', 'Rui Qian', 'Pan Zhang', 'Yuhang Zang', 'Yuhang Cao', 'Conghui He', 'Jiaqi Wang'], 'affiliations': ['Beihang University', 'Communication University of China', 'SenseTime Group', 'Shanghai Artificial Intelligence Laboratory', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05510.jpg', 'data': {'categories': ['#benchmark', '#survey', '#video', '#reasoning'], 'emoji': '⏱️', 'ru': {'title': 'Временная осведомленность как ключ к онлайн-анализу видео для LLM', 'desc': 'Статья представляет новый бенчмарк OVO-Bench для оценки способности видео-LLM моделей к онлайн-анализу видео с учетом временных меток. Бенчмарк включает 12 задач, 644 уникальных видео и около 2800 мета-аннотаций с точными временными метками. OVO-Bench оценивает три сценария: обратное отслеживание, понимание в реальном времени и активное реагирование на будущие события. Результаты тестирования девяти видео-LLM моделей показывают значительное отставание от человеческих возможностей в онлайн-анализе видео.'}, 'en': {'title': 'Enhancing Online Video Understanding with Temporal Awareness', 'desc': 'This paper introduces OVO-Bench, a new benchmark designed to evaluate the temporal awareness of online video language models (LLMs). Unlike offline models that analyze complete videos, online models must dynamically respond to questions based on the specific timestamp of the inquiry. OVO-Bench assesses video LLMs through three scenarios: backward tracing, real-time understanding, and forward active responding, using a dataset of 644 videos and 2,800 meta-annotations. The findings indicate that current video LLMs still lag behind human performance in understanding and reasoning about events in real-time video streams.'}, 'zh': {'title': '提升视频理解能力的时间意识基准', 'desc': '本文提出了OVO-Bench,这是一个新的视频基准,旨在评估视频大语言模型(LLMs)在时间意识方面的能力。时间意识是指模型根据提问时的时间戳动态推理的能力,这与传统的离线模型不同,后者依赖于完整视频进行静态分析。OVO-Bench包含12个任务,使用644个独特视频和约2800个精细的元注释,强调了时间戳在在线视频理解中的重要性。通过对九个视频LLMs的评估,结果显示当前模型在在线视频理解方面仍存在显著差距,远不及人类代理。'}}}, {'id': 'https://huggingface.co/papers/2501.05727', 'title': 'Enabling Scalable Oversight via Self-Evolving Critic', 'url': 'https://huggingface.co/papers/2501.05727', 'abstract': "Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. 
Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3\\% improvement on critique-correction and error identification benchmarks. Our analysis reveals that SCRIT's performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component.", 'score': 17, 'issue_id': 1626, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '5a9e3b95b6aa1312', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc., Beijing, China', 'Shenzhen Research Institute of Big Data, Shenzhen, China', 'The Chinese University of Hong Kong, Shenzhen, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.05727.jpg', 'data': {'categories': ['#training', '#benchmark', '#optimization', '#rlhf', '#synthetic'], 'emoji': '🔬', 'ru': {'title': 'SCRIT: Самосовершенствующийся критик для LLM', 'desc': 'SCRIT - это новая система для улучшения способностей больших языковых моделей (LLM) к самокритике без внешнего надзора. Она использует синтетические данные, созданные с помощью самокритика на основе контрастного обучения и механизма самопроверки. Реализованная на базе Qwen2.5-72B-Instruct, SCRIT демонстрирует значительное улучшение в задачах критики-коррекции и идентификации ошибок. Анализ показывает, что производительность SCRIT растет с увеличением объема данных и размера модели.'}, 'en': {'title': 'Empowering LLMs with Self-Evolving Critique', 'desc': 'This paper addresses the challenge of providing effective feedback for Large Language Models (LLMs) in tasks where human evaluation is difficult. It introduces SCRIT (Self-evolving CRITic), a framework that enhances the critique capabilities of LLMs without relying on external supervision. SCRIT utilizes synthetic data generated by a contrastive-based self-critic and incorporates a self-validation mechanism to ensure the quality of critiques. The results show that SCRIT significantly improves critique-correction and error identification benchmarks, demonstrating its effectiveness as LLMs scale in size and data.'}, 'zh': {'title': '自我进化,提升批评能力!', 'desc': '尽管大型语言模型(LLMs)表现出色,但在可扩展监督方面面临挑战,特别是在难以进行人类评估的任务中。本文提出了SCRIT(自我进化批评者)框架,旨在提升模型的自我批评能力。SCRIT通过对比自我批评生成合成数据,并利用自我验证机制确保批评质量,从而实现自我改进。实验结果表明,SCRIT在批评纠正和错误识别基准上提高了10.3%的性能,且其表现随着数据和模型规模的增加而提升。'}}}, {'id': 'https://huggingface.co/papers/2501.05452', 'title': 'ReFocus: Visual Editing as a Chain of Thought for Structured Image Understanding', 'url': 'https://huggingface.co/papers/2501.05452', 'abstract': 'Structured image understanding, such as interpreting tables and charts, requires strategically refocusing across various structures and texts within an image, forming a reasoning sequence to arrive at the final answer. However, current multimodal large language models (LLMs) lack this multihop selective attention capability. In this work, we introduce ReFocus, a simple yet effective framework that equips multimodal LLMs with the ability to generate "visual thoughts" by performing visual editing on the input image through code, shifting and refining their visual focuses. Specifically, ReFocus enables multimodal LLMs to generate Python codes to call tools and modify the input image, sequentially drawing boxes, highlighting sections, and masking out areas, thereby enhancing the visual reasoning process. 
We experiment upon a wide range of structured image understanding tasks involving tables and charts. ReFocus largely improves performance on all tasks over GPT-4o without visual editing, yielding an average gain of 11.0% on table tasks and 6.8% on chart tasks. We present an in-depth analysis of the effects of different visual edits, and reasons why ReFocus can improve the performance without introducing additional information. Further, we collect a 14k training set using ReFocus, and prove that such visual chain-of-thought with intermediate information offers better supervision than standard VQA data, reaching an 8.0% average gain over the same model trained with QA pairs and 2.6% over CoT.', 'score': 7, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '28a63b60414f99da', 'authors': ['Xingyu Fu', 'Minqian Liu', 'Zhengyuan Yang', 'John Corring', 'Yijuan Lu', 'Jianwei Yang', 'Dan Roth', 'Dinei Florencio', 'Cha Zhang'], 'affiliations': ['Microsoft', 'University of Pennsylvania', 'Virginia Tech'], 'pdf_title_img': 'assets/pdf/title_img/2501.05452.jpg', 'data': {'categories': ['#multimodal', '#interpretability', '#dataset', '#reasoning', '#training', '#cv'], 'emoji': '🔍', 'ru': {'title': 'ReFocus: Улучшение визуального понимания LLM через управляемое редактирование изображений', 'desc': "Статья представляет ReFocus - фреймворк, который наделяет мультимодальные большие языковые модели (LLM) способностью генерировать 'визуальные мысли' путем редактирования входного изображения с помощью кода. ReFocus позволяет LLM последовательно рисовать рамки, выделять секции и маскировать области, улучшая процесс визуального рассуждения. Эксперименты показывают значительное улучшение производительности на задачах понимания структурированных изображений, таких как таблицы и диаграммы. Авторы также доказывают, что визуальная цепочка рассуждений с промежуточной информацией обеспечивает лучшее обучение, чем стандартные данные VQA."}, 'en': {'title': 'Enhancing Visual Reasoning with ReFocus', 'desc': "This paper presents ReFocus, a framework designed to enhance the capabilities of multimodal large language models (LLMs) in structured image understanding tasks, such as interpreting tables and charts. ReFocus allows these models to generate 'visual thoughts' by performing visual edits on input images, which helps them focus on relevant areas and improve their reasoning processes. The framework enables the generation of Python code to manipulate images, such as drawing boxes and highlighting sections, which significantly boosts performance on various tasks. Experimental results show that ReFocus achieves notable improvements over existing models, demonstrating the effectiveness of visual editing in enhancing visual reasoning without adding new information."}, 'zh': {'title': 'ReFocus:提升多模态模型的视觉推理能力', 'desc': '本论文提出了一种名为ReFocus的框架,旨在提升多模态大语言模型在结构化图像理解任务中的表现。ReFocus通过生成Python代码对输入图像进行视觉编辑,使模型能够逐步调整视觉焦点,从而形成更有效的推理过程。实验结果表明,ReFocus在表格和图表任务上显著提高了性能,平均提升分别为11.0%和6.8%。此外,研究还表明,使用ReFocus生成的视觉链式思维提供了比标准问答数据更好的监督效果。'}}}, {'id': 'https://huggingface.co/papers/2501.04698', 'title': 'ConceptMaster: Multi-Concept Video Customization on Diffusion Transformer Models Without Test-Time Tuning', 'url': 'https://huggingface.co/papers/2501.04698', 'abstract': 'Text-to-video generation has made remarkable advancements through diffusion models. However, Multi-Concept Video Customization (MCVC) remains a significant challenge. 
We identify two key challenges in this task: 1) the identity decoupling problem, where directly adopting existing customization methods inevitably mix attributes when handling multiple concepts simultaneously, and 2) the scarcity of high-quality video-entity pairs, which is crucial for training such a model that represents and decouples various concepts well. To address these challenges, we introduce ConceptMaster, an innovative framework that effectively tackles the critical issues of identity decoupling while maintaining concept fidelity in customized videos. Specifically, we introduce a novel strategy of learning decoupled multi-concept embeddings that are injected into the diffusion models in a standalone manner, which effectively guarantees the quality of customized videos with multiple identities, even for highly similar visual concepts. To further overcome the scarcity of high-quality MCVC data, we carefully establish a data construction pipeline, which enables systematic collection of precise multi-concept video-entity data across diverse concepts. A comprehensive benchmark is designed to validate the effectiveness of our model from three critical dimensions: concept fidelity, identity decoupling ability, and video generation quality across six different concept composition scenarios. Extensive experiments demonstrate that our ConceptMaster significantly outperforms previous approaches for this task, paving the way for generating personalized and semantically accurate videos across multiple concepts.', 'score': 6, 'issue_id': 1631, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '6e82dc0b883c447a', 'authors': ['Yuzhou Huang', 'Ziyang Yuan', 'Quande Liu', 'Qiulin Wang', 'Xintao Wang', 'Ruimao Zhang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai'], 'affiliations': ['Kuaishou Technology', 'Sun Yat-sen University', 'The Chinese University of Hong Kong, Shenzhen', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04698.jpg', 'data': {'categories': ['#diffusion', '#benchmark', '#data', '#video', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'ConceptMaster: новый уровень персонализации в генерации видео', 'desc': 'Статья представляет ConceptMaster - новую систему для генерации видео с множественными персонализированными концептами. Авторы решают проблему смешивания атрибутов при одновременной работе с несколькими концептами, предлагая метод обучения раздельных мультиконцептуальных эмбеддингов. Для преодоления нехватки качественных данных разработан специальный конвейер сбора видео-сущностных пар. Эксперименты показывают превосходство ConceptMaster над существующими подходами в точности концептов, способности разделения идентичностей и качестве генерации видео.'}, 'en': {'title': 'Mastering Multi-Concept Video Customization with ConceptMaster', 'desc': "This paper presents ConceptMaster, a new framework for Multi-Concept Video Customization (MCVC) that addresses two main challenges: identity decoupling and the lack of high-quality video-entity pairs. The identity decoupling problem arises when existing methods mix attributes from different concepts, leading to poor customization results. ConceptMaster introduces a novel approach to learn decoupled multi-concept embeddings, which are integrated into diffusion models to ensure high-quality video outputs with distinct identities. 
Additionally, the authors establish a data construction pipeline to systematically gather diverse multi-concept video-entity data, and they validate their model's effectiveness through comprehensive benchmarks across various scenarios."}, 'zh': {'title': 'ConceptMaster:多概念视频定制的新突破', 'desc': '本文介绍了一种名为ConceptMaster的创新框架,旨在解决多概念视频定制中的身份解耦问题和高质量视频实体对的稀缺性。我们提出了一种新的学习策略,通过独立注入解耦的多概念嵌入到扩散模型中,从而保证定制视频的质量。为了克服高质量MCVC数据的不足,我们建立了一个数据构建管道,系统性地收集多概念视频实体数据。实验结果表明,ConceptMaster在概念保真度、身份解耦能力和视频生成质量等方面显著优于之前的方法。'}}}, {'id': 'https://huggingface.co/papers/2501.05707', 'title': 'Multiagent Finetuning: Self Improvement with Diverse Reasoning Chains', 'url': 'https://huggingface.co/papers/2501.05707', 'abstract': 'Large language models (LLMs) have achieved remarkable performance in recent years but are fundamentally limited by the underlying training data. To improve models beyond the training data, recent works have explored how LLMs can be used to generate synthetic data for autonomous self-improvement. However, successive steps of self-improvement can reach a point of diminishing returns. In this work, we propose a complementary approach towards self-improvement where finetuning is applied to a multiagent society of language models. A group of language models, all starting from the same base model, are independently specialized by updating each one using data generated through multiagent interactions among the models. By training each model on independent sets of data, we illustrate how this approach enables specialization across models and diversification over the set of models. As a result, our overall system is able to preserve diverse reasoning chains and autonomously improve over many more rounds of fine-tuning than single-agent self-improvement methods. We quantitatively illustrate the efficacy of the approach across a wide suite of reasoning tasks.', 'score': 5, 'issue_id': 1629, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '3d75785114d08414', 'authors': ['Vighnesh Subramaniam', 'Yilun Du', 'Joshua B. Tenenbaum', 'Antonio Torralba', 'Shuang Li', 'Igor Mordatch'], 'affiliations': ['Google Deepmind', 'Harvard University', 'MIT CSAIL', 'Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05707.jpg', 'data': {'categories': ['#synthetic', '#reasoning', '#training', '#agents'], 'emoji': '🤖', 'ru': {'title': 'Мультиагентное обучение: новый путь к улучшению языковых моделей', 'desc': 'Эта статья представляет новый подход к улучшению больших языковых моделей (LLM) с помощью мультиагентного обучения. Авторы предлагают создать группу моделей, которые взаимодействуют друг с другом для генерации синтетических данных. Каждая модель специализируется на своем наборе данных, что позволяет сохранить разнообразие рассуждений. Этот метод показывает лучшие результаты по сравнению с одноагентными подходами к самоулучшению на различных задачах рассуждения.'}, 'en': {'title': 'Empowering Language Models through Multiagent Self-Improvement', 'desc': 'This paper discusses a new method for improving large language models (LLMs) by using a multiagent system. Instead of relying solely on the original training data, the authors propose that multiple LLMs can interact and generate their own synthetic data, which they then use to fine-tune themselves. This approach allows each model to specialize in different areas, leading to a more diverse set of reasoning capabilities. 
The results show that this multiagent fine-tuning method can enhance performance over many iterations, surpassing traditional single-agent self-improvement techniques.'}, 'zh': {'title': '多智能体模型的自我改进新方法', 'desc': '大型语言模型(LLMs)在最近几年取得了显著的性能,但其根本上受到训练数据的限制。为了超越训练数据,最近的研究探索了如何利用LLMs生成合成数据以实现自主自我改进。本文提出了一种补充的方法,通过在多智能体语言模型的社会中进行微调,来实现自我改进。通过独立训练每个模型,利用模型之间的多智能体交互生成的数据,我们展示了这种方法如何实现模型的专业化和多样化,从而在多个微调轮次中保持多样的推理链。'}}}, {'id': 'https://huggingface.co/papers/2501.04961', 'title': 'Demystifying Domain-adaptive Post-training for Financial LLMs', 'url': 'https://huggingface.co/papers/2501.04961', 'abstract': 'Domain-adaptive post-training of large language models (LLMs) has emerged as a promising approach for specialized domains such as medicine and finance. However, significant challenges remain in identifying optimal adaptation criteria and training strategies across varying data and model configurations. To address these challenges, we introduce FINDAP, a systematic and fine-grained investigation into domain-adaptive post-training of LLMs for the finance domain. Our approach begins by identifying the core capabilities required for the target domain and designing a comprehensive evaluation suite aligned with these needs. We then analyze the effectiveness of key post-training stages, including continual pretraining, instruction tuning, and preference alignment. Building on these insights, we propose an effective training recipe centered on a novel preference data distillation method, which leverages process signals from a generative reward model. The resulting model, Llama-Fin, achieves state-of-the-art performance across a wide range of financial tasks. Our analysis also highlights how each post-training stage contributes to distinct capabilities, uncovering specific challenges and effective solutions, providing valuable insights for domain adaptation of LLMs. Project page: https://github.com/SalesforceAIResearch/FinDap', 'score': 4, 'issue_id': 1642, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ade3590d1cc29d47', 'authors': ['Zixuan Ke', 'Yifei Ming', 'Xuan-Phi Nguyen', 'Caiming Xiong', 'Shafiq Joty'], 'affiliations': ['Salesforce AI Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.04961.jpg', 'data': {'categories': ['#optimization', '#rlhf', '#healthcare', '#transfer_learning', '#training'], 'emoji': '💹', 'ru': {'title': 'Оптимизация LLM для финансов: от анализа до совершенства', 'desc': 'Статья представляет FINDAP - систематический подход к доменно-адаптивному постобучению больших языковых моделей (LLM) для финансовой сферы. Авторы разработали комплексный набор оценок, анализирующий эффективность ключевых этапов постобучения, включая продолжающееся предобучение, инструктивную настройку и выравнивание предпочтений. Предложен эффективный рецепт обучения, основанный на новом методе дистилляции данных предпочтений. Результирующая модель Llama-Fin достигает передовых результатов в широком спектре финансовых задач.'}, 'en': {'title': 'FINDAP: Tailoring LLMs for Finance Excellence', 'desc': 'This paper presents FINDAP, a method for improving large language models (LLMs) specifically for the finance sector through domain-adaptive post-training. It identifies essential capabilities needed for financial tasks and creates a tailored evaluation suite to measure these capabilities. The study examines various post-training techniques, such as continual pretraining and instruction tuning, to determine their effectiveness. 
Ultimately, the authors introduce Llama-Fin, a model that utilizes a novel preference data distillation method, achieving top performance in financial applications while providing insights into the adaptation process.'}, 'zh': {'title': '金融领域的智能适应训练', 'desc': '本文介绍了一种针对金融领域的大型语言模型(LLM)进行领域自适应后训练的方法,称为FINDAP。我们首先识别目标领域所需的核心能力,并设计了与这些需求相一致的综合评估套件。接着,我们分析了关键后训练阶段的有效性,包括持续预训练、指令调优和偏好对齐。最终,我们提出了一种基于新颖偏好数据蒸馏方法的有效训练方案,所得到的模型Llama-Fin在多种金融任务中达到了最先进的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.06187', 'title': 'Multi-subject Open-set Personalization in Video Generation', 'url': 'https://huggingface.co/papers/2501.06187', 'abstract': 'Video personalization methods allow us to synthesize videos with specific concepts such as people, pets, and places. However, existing methods often focus on limited domains, require time-consuming optimization per subject, or support only a single subject. We present Video Alchemist - a video model with built-in multi-subject, open-set personalization capabilities for both foreground objects and background, eliminating the need for time-consuming test-time optimization. Our model is built on a new Diffusion Transformer module that fuses each conditional reference image and its corresponding subject-level text prompt with cross-attention layers. Developing such a large model presents two main challenges: dataset and evaluation. First, as paired datasets of reference images and videos are extremely hard to collect, we sample selected video frames as reference images and synthesize a clip of the target video. However, while models can easily denoise training videos given reference frames, they fail to generalize to new contexts. To mitigate this issue, we design a new automatic data construction pipeline with extensive image augmentations. Second, evaluating open-set video personalization is a challenge in itself. To address this, we introduce a personalization benchmark that focuses on accurate subject fidelity and supports diverse personalization scenarios. Finally, our extensive experiments show that our method significantly outperforms existing personalization methods in both quantitative and qualitative evaluations.', 'score': 4, 'issue_id': 1631, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'fcf16f5f8fe9047a', 'authors': ['Tsai-Shien Chen', 'Aliaksandr Siarohin', 'Willi Menapace', 'Yuwei Fang', 'Kwot Sin Lee', 'Ivan Skorokhodov', 'Kfir Aberman', 'Jun-Yan Zhu', 'Ming-Hsuan Yang', 'Sergey Tulyakov'], 'affiliations': ['CMU', 'Snap Inc.', 'UC Merced'], 'pdf_title_img': 'assets/pdf/title_img/2501.06187.jpg', 'data': {'categories': ['#diffusion', '#synthetic', '#benchmark', '#data', '#optimization', '#video', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'Универсальная персонализация видео без длительной оптимизации', 'desc': 'Статья представляет Video Alchemist - новую модель для персонализации видео с возможностью работы с несколькими объектами. Модель использует новый модуль Diffusion Transformer, который объединяет условные референсные изображения и текстовые промпты. Авторы разработали автоматический конвейер для создания данных с обширными аугментациями изображений. 
Также был создан новый бенчмарк для оценки персонализации видео в открытом наборе.'}, 'en': {'title': 'Revolutionizing Video Personalization with Video Alchemist', 'desc': "The paper introduces Video Alchemist, a novel video personalization model that allows for the synthesis of videos featuring multiple subjects without the need for extensive optimization. It utilizes a Diffusion Transformer module that integrates reference images and text prompts through cross-attention layers, enabling effective personalization for both foreground and background elements. The authors tackle challenges related to dataset creation by employing a new automatic data construction pipeline with image augmentations, which helps improve generalization to new contexts. Additionally, they propose a personalization benchmark to evaluate the model's performance in diverse scenarios, demonstrating that Video Alchemist outperforms existing methods in both quantitative and qualitative assessments."}, 'zh': {'title': '视频个性化的新突破', 'desc': '视频个性化方法可以合成特定概念的视频,如人物、宠物和地点。然而,现有方法通常只关注有限的领域,且每个主题需要耗时的优化,或者仅支持单一主题。我们提出了视频炼金术师(Video Alchemist),这是一种具有内置多主题、开放集个性化能力的视频模型,能够处理前景物体和背景,消除了耗时的测试时间优化需求。我们的模型基于新的扩散变换器模块,结合条件参考图像和相应的主题级文本提示,通过交叉注意力层进行融合。'}}}, {'id': 'https://huggingface.co/papers/2501.05542', 'title': 'Infecting Generative AI With Viruses', 'url': 'https://huggingface.co/papers/2501.05542', 'abstract': 'This study demonstrates a novel approach to testing the security boundaries of Vision-Large Language Model (VLM/ LLM) using the EICAR test file embedded within JPEG images. We successfully executed four distinct protocols across multiple LLM platforms, including OpenAI GPT-4o, Microsoft Copilot, Google Gemini 1.5 Pro, and Anthropic Claude 3.5 Sonnet. The experiments validated that a modified JPEG containing the EICAR signature could be uploaded, manipulated, and potentially executed within LLM virtual workspaces. Key findings include: 1) consistent ability to mask the EICAR string in image metadata without detection, 2) successful extraction of the test file using Python-based manipulation within LLM environments, and 3) demonstration of multiple obfuscation techniques including base64 encoding and string reversal. This research extends Microsoft Research\'s "Penetration Testing Rules of Engagement" framework to evaluate cloud-based generative AI and LLM security boundaries, particularly focusing on file handling and execution capabilities within containerized environments.', 'score': 4, 'issue_id': 1630, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ac21f1bae807486e', 'authors': ['David Noever', 'Forrest McKee'], 'affiliations': ['PeopleTec, Inc., Huntsville, AL'], 'pdf_title_img': 'assets/pdf/title_img/2501.05542.jpg', 'data': {'categories': ['#cv', '#benchmark', '#data', '#security'], 'emoji': '🛡️', 'ru': {'title': 'Новые горизонты в тестировании безопасности VLM/LLM с помощью EICAR', 'desc': 'Это исследование демонстрирует новый подход к тестированию границ безопасности моделей типа Vision-Large Language Model (VLM/LLM) с использованием тестового файла EICAR, встроенного в изображения JPEG. Эксперименты проводились на нескольких платформах LLM, включая OpenAI GPT-4, Microsoft Copilot, Google Gemini 1.5 Pro и Anthropic Claude 3.5 Sonnet. Ключевые результаты включают успешную маскировку строки EICAR в метаданных изображения, извлечение тестового файла с помощью Python в среде LLM и демонстрацию различных методов обфускации. 
Исследование расширяет рамки оценки безопасности облачных генеративных ИИ и LLM, особенно в отношении обработки файлов и возможностей выполнения в контейнеризированных средах.'}, 'en': {'title': 'Testing Security Boundaries of LLMs with EICAR in JPEGs', 'desc': 'This paper presents a new method for testing the security limits of Vision-Large Language Models (VLMs/LLMs) by embedding the EICAR test file in JPEG images. The authors conducted experiments on various LLM platforms, revealing that modified JPEGs containing the EICAR signature could be uploaded and manipulated without detection. They demonstrated the ability to extract the EICAR file using Python scripts and employed several obfuscation techniques to hide the EICAR string. This research enhances existing security frameworks by focusing on the file handling and execution capabilities of cloud-based generative AI systems.'}, 'zh': {'title': '测试大型语言模型的安全边界新方法', 'desc': '本研究展示了一种新颖的方法,用于测试视觉大型语言模型(VLM/LLM)的安全边界,使用嵌入在JPEG图像中的EICAR测试文件。我们在多个LLM平台上成功执行了四种不同的协议,包括OpenAI GPT-4o、Microsoft Copilot、Google Gemini 1.5 Pro和Anthropic Claude 3.5 Sonnet。实验验证了修改后的JPEG图像可以在LLM虚拟工作区中上传、操控并可能执行。研究的关键发现包括:在图像元数据中无检测地掩盖EICAR字符串、在LLM环境中成功提取测试文件,以及展示多种混淆技术,如base64编码和字符串反转。'}}}, {'id': 'https://huggingface.co/papers/2501.14249', 'title': "Humanity's Last Exam", 'url': 'https://huggingface.co/papers/2501.14249', 'abstract': "Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai.", 'score': 29, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '4d614974221756d3', 'authors': ['Long Phan', 'Alice Gatti', 'Ziwen Han', 'Nathaniel Li', 'Josephina Hu', 'Hugh Zhang', 'Sean Shi', 'Michael Choi', 'Anish Agrawal', 'Arnav Chopra', 'Adam Khoja', 'Ryan Kim', 'Jason Hausenloy', 'Oliver Zhang', 'Mantas Mazeika', 'Daron Anderson', 'Tung Nguyen', 'Mobeen Mahmood', 'Fiona Feng', 'Steven Y. Feng', 'Haoran Zhao', 'Michael Yu', 'Varun Gangal', 'Chelsea Zou', 'Zihan Wang', 'Jessica P. Wang', 'Pawan Kumar', 'Oleksandr Pokutnyi', 'Robert Gerbicz', 'Serguei Popov', 'John-Clark Levin', 'Mstyslav Kazakov', 'Johannes Schmitt', 'Geoff Galgon', 'Alvaro Sanchez', 'Yongki Lee', 'Will Yeadon', 'Scott Sauers', 'Marc Roth', 'Chidozie Agu', 'Søren Riis', 'Fabian Giska', 'Saiteja Utpala', 'Zachary Giboney', 'Gashaw M. 
Goshu', 'Joan of Arc Xavier', 'Sarah-Jane Crowson', 'Mohinder Maheshbhai Naiya', 'Noah Burns', 'Lennart Finke', 'Zerui Cheng', 'Hyunwoo Park', 'Francesco Fournier-Facio', 'John Wydallis', 'Mark Nandor', 'Ankit Singh', 'Tim Gehrunger', 'Jiaqi Cai', 'Ben McCarty', 'Darling Duclosel', 'Jungbae Nam', 'Jennifer Zampese', 'Ryan G. Hoerr', 'Aras Bacho', 'Gautier Abou Loume', 'Abdallah Galal', 'Hangrui Cao', 'Alexis C Garretson', 'Damien Sileo', 'Qiuyu Ren', 'Doru Cojoc', 'Pavel Arkhipov', 'Usman Qazi', 'Lianghui Li', 'Sumeet Motwani', 'Christian Schroeder de Witt', 'Edwin Taylor', 'Johannes Veith', 'Eric Singer', 'Taylor D. Hartman', 'Paolo Rissone', 'Jaehyeok Jin', 'Jack Wei Lun Shi', 'Chris G. Willcocks', 'Joshua Robinson', 'Aleksandar Mikov', 'Ameya Prabhu', 'Longke Tang', 'Xavier Alapont', 'Justine Leon Uro', 'Kevin Zhou', 'Emily de Oliveira Santos', 'Andrey Pupasov Maksimov', 'Edward Vendrow', 'Kengo Zenitani', 'Julien Guillod', 'Yuqi Li', 'Joshua Vendrow', 'Vladyslav Kuchkin', 'Ng Ze-An', 'Pierre Marion', 'Denis Efremov', 'Jayson Lynch', 'Kaiqu Liang', 'Andrew Gritsevskiy', 'Dakotah Martinez', 'Ben Pageler', 'Nick Crispino', 'Dimitri Zvonkine', 'Natanael Wildner Fraga', 'Saeed Soori', 'Ori Press', 'Henry Tang', 'Julian Salazar', 'Sean R. Green', 'Lina Brüssel', 'Moon Twayana', 'Aymeric Dieuleveut', 'T. Ryan Rogers', 'Wenjin Zhang', 'Bikun Li', 'Jinzhou Yang', 'Arun Rao', 'Gabriel Loiseau', 'Mikhail Kalinin', 'Marco Lukas', 'Ciprian Manolescu', 'Subrata Mishra', 'Ariel Ghislain Kemogne Kamdoum', 'Tobias Kreiman', 'Tad Hogg', 'Alvin Jin', 'Carlo Bosio', 'Gongbo Sun', 'Brian P Coppola', 'Tim Tarver', 'Haline Heidinger', 'Rafael Sayous', 'Stefan Ivanov', 'Joseph M Cavanagh', 'Jiawei Shen', 'Joseph Marvin Imperial', 'Philippe Schwaller', 'Shaipranesh Senthilkuma', 'Andres M Bran', 'Ali Dehghan', 'Andres Algaba', 'Brecht Verbeken', 'David Noever', 'Ragavendran P V', 'Lisa Schut', 'Ilia Sucholutsky', 'Evgenii Zheltonozhskii', 'Derek Lim', 'Richard Stanley', 'Shankar Sivarajan', 'Tong Yang', 'John Maar', 'Julian Wykowski', 'Martí Oller', 'Jennifer Sandlin', 'Anmol Sahu', 'Yuzheng Hu', 'Sara Fish', 'Nasser Heydari', 'Archimedes Apronti', 'Kaivalya Rawal', 'Tobias Garcia Vilchis', 'Yuexuan Zu', 'Martin Lackner', 'James Koppel', 'Jeremy Nguyen', 'Daniil S. 
Antonenko', 'Steffi Chern', 'Bingchen Zhao', 'Pierrot Arsene', 'Alan Goldfarb', 'Sergey Ivanov', 'Rafał Poświata', 'Chenguang Wang', 'Daofeng Li', 'Donato Crisostomi', 'Andrea Achilleos', 'Benjamin Myklebust', 'Archan Sen', 'David Perrella', 'Nurdin Kaparov', 'Mark H Inlow', 'Allen Zang', 'Elliott Thornley', 'Daniil Orel', 'Vladislav Poritski', 'Shalev Ben-David', 'Zachary Berger', 'Parker Whitfill', 'Michael Foster', 'Daniel Munro', 'Linh Ho', 'Dan Bar Hava', 'Aleksey Kuchkin', 'Robert Lauff', 'David Holmes', 'Frank Sommerhage', 'Keith Schneider', 'Zakayo Kazibwe', 'Nate Stambaugh', 'Mukhwinder Singh', 'Ilias Magoulas', 'Don Clarke', 'Dae Hyun Kim', 'Felipe Meneguitti Dias', 'Veit Elser', 'Kanu Priya Agarwal', 'Victor Efren Guadarrama Vilchis', 'Immo Klose', 'Christoph Demian', 'Ujjwala Anantheswaran', 'Adam Zweiger', 'Guglielmo Albani', 'Jeffery Li', 'Nicolas Daans', 'Maksim Radionov', 'Václav Rozhoň', 'Ziqiao Ma', 'Christian Stump', 'Mohammed Berkani', 'Jacob Platnick', 'Volodymyr Nevirkovets', 'Luke Basler', 'Marco Piccardo', 'Ferenc Jeanplong', 'Niv Cohen', 'Josef Tkadlec', 'Paul Rosu', 'Piotr Padlewski', 'Stanislaw Barzowski', 'Kyle Montgomery', 'Aline Menezes', 'Arkil Patel', 'Zixuan Wang', 'Jamie Tucker-Foltz', 'Jack Stade', 'Tom Goertzen', 'Fereshteh Kazemi', 'Jeremiah Milbauer', 'John Arnold Ambay', 'Abhishek Shukla', 'Yan Carlos Leyva Labrador', 'Alan Givré', 'Hew Wolff', 'Vivien Rossbach', 'Muhammad Fayez Aziz', 'Younesse Kaddar', 'Yanxu Chen', 'Robin Zhang', 'Jiayi Pan', 'Antonio Terpin', 'Niklas Muennighoff', 'Hailey Schoelkopf', 'Eric Zheng', 'Avishy Carmi', 'Adam Jones', 'Jainam Shah', 'Ethan D. L. Brown', 'Kelin Zhu', 'Max Bartolo', 'Richard Wheeler', 'Andrew Ho', 'Shaul Barkan', 'Jiaqi Wang', 'Martin Stehberger', 'Egor Kretov', 'Kaustubh Sridhar', 'Zienab EL-Wasif', 'Anji Zhang', 'Daniel Pyda', 'Joanna Tam', 'David M. Cunningham', 'Vladimir Goryachev', 'Demosthenes Patramanis', 'Michael Krause', 'Andrew Redenti', 'Daniel Bugas', 'David Aldous', 'Jesyin Lai', 'Shannon Coleman', 'Mohsen Bahaloo', 'Jiangnan Xu', 'Sangwon Lee', 'Sandy Zhao', 'Ning Tang', 'Michael K. Cohen', 'Micah Carroll', 'Orr Paradise', 'Jan Hendrik Kirchner', 'Stefan Steinerberger', 'Maksym Ovchynnikov', 'Jason O. Matos', 'Adithya Shenoy', 'Benedito Alves de Oliveira Junior', 'Michael Wang', 'Yuzhou Nie', 'Paolo Giordano', 'Philipp Petersen', 'Anna Sztyber-Betley', 'Priti Shukla', 'Jonathan Crozier', 'Antonella Pinto', 'Shreyas Verma', 'Prashant Joshi', 'Zheng-Xin Yong', 'Allison Tee', 'Jérémy Andréoletti', 'Orion Weller', 'Raghav Singhal', 'Gang Zhang', 'Alexander Ivanov', 'Seri Khoury', 'Hamid Mostaghimi', 'Kunvar Thaman', 'Qijia Chen', 'Tran Quoc Khánh', 'Jacob Loader', 'Stefano Cavalleri', 'Hannah Szlyk', 'Zachary Brown', 'Jonathan Roberts', 'William Alley', 'Kunyang Sun', 'Ryan Stendall', 'Max Lamparth', 'Anka Reuel', 'Ting Wang', 'Hanmeng Xu', 'Sreenivas Goud Raparthi', 'Pablo Hernández-Cámara', 'Freddie Martin', 'Dmitry Malishev', 'Thomas Preu', 'Tomek Korbak', 'Marcus Abramovitch', 'Dominic Williamson', 'Ziye Chen', 'Biró Bálint', 'M Saiful Bari', 'Peyman Kassani', 'Zihao Wang', 'Behzad Ansarinejad', 'Laxman Prasad Goswami', 'Yewen Sun', 'Hossam Elgnainy', 'Daniel Tordera', 'George Balabanian', 'Earth Anderson', 'Lynna Kvistad', 'Alejandro José Moyano', 'Rajat Maheshwari', 'Ahmad Sakor', 'Murat Eron', 'Isaac C. McAlister', 'Javier Gimenez', 'Innocent Enyekwe', 'Andrew Favre D. 
O.', 'Shailesh Shah', 'Xiaoxiang Zhou', 'Firuz Kamalov', 'Ronald Clark', 'Sherwin Abdoli', 'Tim Santens', 'Khalida Meer', 'Harrison K Wang', 'Kalyan Ramakrishnan', 'Evan Chen', 'Alessandro Tomasiello', 'G. Bruno De Luca', 'Shi-Zhuo Looi', 'Vinh-Kha Le', 'Noam Kolt', 'Niels Mündler', 'Avi Semler', 'Emma Rodman', 'Jacob Drori', 'Carl J Fossum', 'Milind Jagota', 'Ronak Pradeep', 'Honglu Fan', 'Tej Shah', 'Jonathan Eicher', 'Michael Chen', 'Kushal Thaman', 'William Merrill', 'Carter Harris', 'Jason Gross', 'Ilya Gusev', 'Asankhaya Sharma', 'Shashank Agnihotri', 'Pavel Zhelnov', 'Siranut Usawasutsakorn', 'Mohammadreza Mofayezi', 'Sergei Bogdanov', 'Alexander Piperski', 'Marc Carauleanu', 'David K. Zhang', 'Dylan Ler', 'Roman Leventov', 'Ignat Soroko', 'Thorben Jansen', 'Pascal Lauer', 'Joshua Duersch', 'Vage Taamazyan', 'Wiktor Morak', 'Wenjie Ma', 'William Held', 'Tran Đuc Huy', 'Ruicheng Xian', 'Armel Randy Zebaze', 'Mohanad Mohamed', 'Julian Noah Leser', 'Michelle X Yuan', 'Laila Yacar', 'Johannes Lengler', 'Hossein Shahrtash', 'Edson Oliveira', 'Joseph W. Jackson', 'Daniel Espinosa Gonzalez', 'Andy Zou', 'Muthu Chidambaram', 'Timothy Manik', 'Hector Haffenden', 'Dashiell Stander', 'Ali Dasouqi', 'Alexander Shen', 'Emilien Duc', 'Bita Golshani', 'David Stap', 'Mikalai Uzhou', 'Alina Borisovna Zhidkovskaya', 'Lukas Lewark', 'Mátyás Vincze', 'Dustin Wehr', 'Colin Tang', 'Zaki Hossain', 'Shaun Phillips', 'Jiang Muzhen', 'Fredrik Ekström', 'Angela Hammon', 'Oam Patel', 'Nicolas Remy', 'Faraz Farhidi', 'George Medley', 'Forough Mohammadzadeh', 'Madellene Peñaflor', 'Haile Kassahun', 'Alena Friedrich', 'Claire Sparrow', 'Taom Sakal', 'Omkar Dhamane', 'Ali Khajegili Mirabadi', 'Eric Hallman', 'Mike Battaglia', 'Mohammad Maghsoudimehrabani', 'Hieu Hoang', 'Alon Amit', 'Dave Hulbert', 'Roberto Pereira', 'Simon Weber', 'Stephen Mensah', 'Nathan Andre', 'Anton Peristyy', 'Chris Harjadi', 'Himanshu Gupta', 'Stephen Malina', 'Samuel Albanie', 'Will Cai', 'Mustafa Mehkary', 'Frank Reidegeld', 'Anna-Katharina Dick', 'Cary Friday', 'Jasdeep Sidhu', 'Wanyoung Kim', 'Mariana Costa', 'Hubeyb Gurdogan', 'Brian Weber', 'Harsh Kumar', 'Tong Jiang', 'Arunim Agarwal', 'Chiara Ceconello', 'Warren S. Vaz', 'Chao Zhuang', 'Haon Park', 'Andrew R. Tawfeek', 'Daattavya Aggarwal', 'Michael Kirchhof', 'Linjie Dai', 'Evan Kim', 'Johan Ferret', 'Yuzhou Wang', 'Minghao Yan', 'Krzysztof Burdzy', 'Lixin Zhang', 'Antonio Franca', 'Diana T. Pham', 'Kang Yong Loh', 'Joshua Robinson', 'Shreen Gul', 'Gunjan Chhablani', 'Zhehang Du', 'Adrian Cosma', 'Colin White', 'Robin Riblet', 'Prajvi Saxena', 'Jacob Votava', 'Vladimir Vinnikov', 'Ethan Delaney', 'Shiv Halasyamani', 'Syed M. Shahid', 'Jean-Christophe Mourrat', 'Lavr Vetoshkin', 'Renas Bacho', 'Vincent Ginis', 'Aleksandr Maksapetyan', 'Florencia de la Rosa', 'Xiuyu Li', 'Guillaume Malod', 'Leon Lang', 'Julien Laurendeau', 'Fatimah Adesanya', 'Julien Portier', 'Lawrence Hollom', 'Victor Souza', 'Yuchen Anna Zhou', 'Yiğit Yalın', 'Gbenga Daniel Obikoya', 'Luca Arnaboldi', 'Rai', 'Filippo Bigi', 'Kaniuar Bacho', 'Pierre Clavier', 'Gabriel Recchia', 'Mara Popescu', 'Nikita Shulga', 'Ngefor Mildred Tanwie', 'Thomas C. H. 
Lux', 'Ben Rank', 'Colin Ni', 'Alesia Yakimchyk', 'Huanxu', 'Liu', 'Olle Häggström', 'Emil Verkama', 'Himanshu Narayan', 'Hans Gundlach', 'Leonor Brito-Santana', 'Brian Amaro', 'Vivek Vajipey', 'Rynaa Grover', 'Yiyang Fan', 'Gabriel Poesia Reis e Silva', 'Linwei Xin', 'Yosi Kratish', 'Jakub Łucki', 'Wen-Ding Li', 'Justin Xu', 'Kevin Joseph Scaria', 'Freddie Vargus', 'Farzad Habibi', 'Long', 'Lian', 'Emanuele Rodolà', 'Jules Robins', 'Vincent Cheng', 'Declan Grabb', 'Ida Bosio', 'Tony Fruhauff', 'Ido Akov', 'Eve J. Y. Lo', 'Hao Qi', 'Xi Jiang', 'Ben Segev', 'Jingxuan Fan', 'Sarah Martinson', 'Erik Y. Wang', 'Kaylie Hausknecht', 'Michael P. Brenner', 'Mao Mao', 'Yibo Jiang', 'Xinyu Zhang', 'David Avagian', 'Eshawn Jessica Scipio', 'Muhammad Rehan Siddiqi', 'Alon Ragoler', 'Justin Tan', 'Deepakkumar Patil', 'Rebeka Plecnik', 'Aaron Kirtland', 'Roselynn Grace Montecillo', 'Stephane Durand', 'Omer Faruk Bodur', 'Zahra Adoul', 'Mohamed Zekry', 'Guillaume Douville', 'Ali Karakoc', 'Tania C. B. Santos', 'Samir Shamseldeen', 'Loukmane Karim', 'Anna Liakhovitskaia', 'Nate Resman', 'Nicholas Farina', 'Juan Carlos Gonzalez', 'Gabe Maayan', 'Sarah Hoback', 'Rodrigo De Oliveira Pena', 'Glen Sherman', 'Hodjat Mariji', 'Rasoul Pouriamanesh', 'Wentao Wu', 'Gözdenur Demir', 'Sandra Mendoza', 'Ismail Alarab', 'Joshua Cole', 'Danyelle Ferreira', 'Bryan Johnson', 'Hsiaoyun Milliron', 'Mohammad Safdari', 'Liangti Dai', 'Siriphan Arthornthurasuk', 'Alexey Pronin', 'Jing Fan', 'Angel Ramirez-Trinidad', 'Ashley Cartwright', 'Daphiny Pottmaier', 'Omid Taheri', 'David Outevsky', 'Stanley Stepanic', 'Samuel Perry', 'Luke Askew', 'Raúl Adrián Huerta Rodríguez', 'Abdelkader Dendane', 'Sam Ali', 'Ricardo Lorena', 'Krishnamurthy Iyer', 'Sk Md Salauddin', 'Murat Islam', 'Juan Gonzalez', 'Josh Ducey', 'Russell Campbell', 'Maja Somrak', 'Vasilios Mavroudis', 'Eric Vergo', 'Juehang Qin', 'Benjámin Borbás', 'Eric Chu', 'Jack Lindsey', 'Anil Radhakrishnan', 'Antoine Jallon', 'I. M. J. McInnis', 'Alex Hoover', 'Sören Möller', 'Song Bian', 'John Lai', 'Tejal Patwardhan', 'Summer Yue', 'Alexandr Wang', 'Dan Hendrycks'], 'affiliations': ['Center for AI Safety', 'Scale AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.14249.jpg', 'data': {'categories': ['#benchmark', '#science', '#multimodal'], 'emoji': '🧠', 'ru': {'title': 'Новый рубеж для искусственного интеллекта: тест на пределе человеческих знаний', 'desc': "Статья представляет новый многомодальный бенчмарк для оценки возможностей больших языковых моделей (LLM) под названием 'Последний экзамен человечества' (HLE). HLE состоит из 3000 вопросов по различным предметам, разработанных экспертами со всего мира. Бенчмарк создан для преодоления ограничений существующих тестов, на которых современные LLM достигают точности более 90%. Результаты показывают, что современные LLM демонстрируют низкую точность на HLE, что указывает на значительный разрыв между их возможностями и экспертными знаниями человека."}, 'en': {'title': "Raising the Bar: Humanity's Last Exam for LLMs", 'desc': "This paper introduces a new benchmark called Humanity's Last Exam (HLE) to evaluate the capabilities of large language models (LLMs). HLE consists of 3,000 questions across various subjects, including mathematics and humanities, designed to be challenging for LLMs. Unlike existing benchmarks, HLE questions cannot be easily answered through internet searches, making them a better measure of true understanding. 
The results show that current state-of-the-art LLMs struggle with HLE, indicating a significant gap between their performance and that of expert humans."}, 'zh': {'title': '人类的最后考试:挑战LLM的极限', 'desc': '基准测试是跟踪大型语言模型(LLM)能力快速发展的重要工具。然而,现有的基准测试难度未能与LLM的进步相匹配,导致LLM在流行基准测试(如MMLU)上达到90%以上的准确率。为此,我们推出了人类的最后考试(HLE),这是一个涵盖广泛学科的多模态基准,旨在成为此类学术基准的最终版本。HLE包含3000个问题,涉及数学、人文学科和自然科学,旨在揭示当前LLM能力与专家人类水平之间的显著差距。'}}}, {'id': 'https://huggingface.co/papers/2501.13953', 'title': 'Redundancy Principles for MLLMs Benchmarks', 'url': 'https://huggingface.co/papers/2501.13953', 'abstract': "With the rapid iteration of Multi-modality Large Language Models (MLLMs) and the evolving demands of the field, the number of benchmarks produced annually has surged into the hundreds. The rapid growth has inevitably led to significant redundancy among benchmarks. Therefore, it is crucial to take a step back and critically assess the current state of redundancy and propose targeted principles for constructing effective MLLM benchmarks. In this paper, we focus on redundancy from three key perspectives: 1) Redundancy of benchmark capability dimensions, 2) Redundancy in the number of test questions, and 3) Cross-benchmark redundancy within specific domains. Through the comprehensive analysis over hundreds of MLLMs' performance across more than 20 benchmarks, we aim to quantitatively measure the level of redundancy lies in existing MLLM evaluations, provide valuable insights to guide the future development of MLLM benchmarks, and offer strategies to refine and address redundancy issues effectively.", 'score': 21, 'issue_id': 1877, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': 'f504e124f29e4140', 'authors': ['Zicheng Zhang', 'Xiangyu Zhao', 'Xinyu Fang', 'Chunyi Li', 'Xiaohong Liu', 'Xiongkuo Min', 'Haodong Duan', 'Kai Chen', 'Guangtao Zhai'], 'affiliations': ['Shanghai AI Lab', 'Shanghai Jiao Tong University', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13953.jpg', 'data': {'categories': ['#benchmark', '#survey'], 'emoji': '🔍', 'ru': {'title': 'Борьба с избыточностью: оптимизация бенчмарков для мультимодальных языковых моделей', 'desc': 'Статья посвящена проблеме избыточности в бенчмарках для мультимодальных больших языковых моделей (MLLM). Авторы анализируют избыточность с трех ключевых аспектов: измерения возможностей, количество тестовых вопросов и пересечение между бенчмарками в конкретных областях. На основе анализа производительности MLLM на более чем 20 бенчмарках, исследователи предлагают количественно оценить уровень избыточности и дать рекомендации по улучшению бенчмарков. Цель работы - предоставить ценные идеи для будущего развития оценки MLLM и стратегии по устранению проблем избыточности.'}, 'en': {'title': 'Streamlining MLLM Benchmarks: Tackling Redundancy for Better Evaluation', 'desc': 'This paper examines the growing issue of redundancy in benchmarks for Multi-modality Large Language Models (MLLMs). It identifies three main types of redundancy: in the capabilities being tested, the number of test questions, and across different benchmarks within the same domain. By analyzing the performance of numerous MLLMs across over 20 benchmarks, the authors quantitatively measure the extent of this redundancy. 
The findings aim to inform the development of more effective benchmarks and provide strategies to reduce redundancy in future evaluations.'}, 'zh': {'title': '优化多模态大型语言模型基准测试,减少冗余', 'desc': '随着多模态大型语言模型(MLLMs)的快速发展,年度基准测试的数量激增,导致基准测试之间的冗余现象显著增加。本文从三个关键角度分析冗余问题:基准能力维度的冗余、测试问题数量的冗余以及特定领域内的跨基准冗余。通过对数百个MLLM在20多个基准测试中的表现进行综合分析,我们定量测量现有MLLM评估中的冗余水平。我们的目标是为未来MLLM基准的开发提供有价值的见解,并提出有效解决冗余问题的策略。'}}}, {'id': 'https://huggingface.co/papers/2501.14342', 'title': 'Chain-of-Retrieval Augmented Generation', 'url': 'https://huggingface.co/papers/2501.14342', 'abstract': "This paper introduces an approach for training o1-like RAG models that retrieve and reason over relevant information step by step before generating the final answer. Conventional RAG methods usually perform a single retrieval step before the generation process, which limits their effectiveness in addressing complex queries due to imperfect retrieval results. In contrast, our proposed method, CoRAG (Chain-of-Retrieval Augmented Generation), allows the model to dynamically reformulate the query based on the evolving state. To train CoRAG effectively, we utilize rejection sampling to automatically generate intermediate retrieval chains, thereby augmenting existing RAG datasets that only provide the correct final answer. At test time, we propose various decoding strategies to scale the model's test-time compute by controlling the length and number of sampled retrieval chains. Experimental results across multiple benchmarks validate the efficacy of CoRAG, particularly in multi-hop question answering tasks, where we observe more than 10 points improvement in EM score compared to strong baselines. On the KILT benchmark, CoRAG establishes a new state-of-the-art performance across a diverse range of knowledge-intensive tasks. Furthermore, we offer comprehensive analyses to understand the scaling behavior of CoRAG, laying the groundwork for future research aimed at developing factual and grounded foundation models.", 'score': 18, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': 'cd489ba1638c5496', 'authors': ['Liang Wang', 'Haonan Chen', 'Nan Yang', 'Xiaolong Huang', 'Zhicheng Dou', 'Furu Wei'], 'affiliations': ['Microsoft Corporation', 'Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.14342.jpg', 'data': {'categories': ['#benchmark', '#optimization', '#rag', '#reasoning'], 'emoji': '🔗', 'ru': {'title': 'CoRAG: Пошаговый поиск для улучшения генерации ответов', 'desc': 'Статья представляет новый подход к обучению моделей извлечения и генерации (RAG), позволяющий выполнять пошаговый поиск и рассуждение перед генерацией окончательного ответа. Метод CoRAG (Chain-of-Retrieval Augmented Generation) позволяет модели динамически переформулировать запрос на основе развивающегося состояния. Для обучения CoRAG используется отбор с отклонением для автоматической генерации промежуточных цепочек поиска. Экспериментальные результаты показывают значительное улучшение производительности на различных бенчмарках, особенно в задачах многоэтапного ответа на вопросы.'}, 'en': {'title': 'CoRAG: Enhancing RAG with Dynamic Retrieval for Complex Queries', 'desc': 'This paper presents CoRAG, a novel approach for training retrieval-augmented generation (RAG) models that enhances their ability to handle complex queries. 
Unlike traditional RAG methods that rely on a single retrieval step, CoRAG employs a dynamic query reformulation process, allowing the model to retrieve information iteratively. The training process utilizes rejection sampling to create intermediate retrieval chains, enriching the dataset beyond just the final answers. Experimental results demonstrate that CoRAG significantly improves performance in multi-hop question answering tasks, achieving state-of-the-art results on the KILT benchmark.'}, 'zh': {'title': '动态检索,提升问答能力!', 'desc': '本文介绍了一种训练类似o1的RAG模型的新方法,该方法在生成最终答案之前逐步检索和推理相关信息。传统的RAG方法通常在生成过程之前只进行一次检索,这限制了它们在处理复杂查询时的有效性。我们提出的方法CoRAG(链式检索增强生成)允许模型根据不断变化的状态动态重构查询。通过使用拒绝采样自动生成中间检索链,我们有效地增强了现有的RAG数据集,从而在多跳问答任务中显著提高了模型的表现。'}}}, {'id': 'https://huggingface.co/papers/2501.14492', 'title': 'RealCritic: Towards Effectiveness-Driven Evaluation of Language Model Critiques', 'url': 'https://huggingface.co/papers/2501.14492', 'abstract': 'Critiques are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. In this work, we introduce a new benchmark designed to assess the critique capabilities of LLMs. Unlike existing benchmarks, which typically function in an open-loop fashion, our approach employs a closed-loop methodology that evaluates the quality of corrections generated from critiques. Moreover, the benchmark incorporates features such as self-critique, cross-critique, and iterative critique, which are crucial for distinguishing the abilities of advanced reasoning models from more classical ones. We implement this benchmark using eight challenging reasoning tasks. We have several interesting findings. First, despite demonstrating comparable performance in direct chain-of-thought generation, classical LLMs significantly lag behind the advanced reasoning-based model o1-mini across all critique scenarios. Second, in self-critique and iterative critique settings, classical LLMs may even underperform relative to their baseline capabilities. We hope that this benchmark will serve as a valuable resource to guide future advancements. The code and data are available at https://github.com/tangzhy/RealCritic.', 'score': 13, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '683923c8fb1958c2', 'authors': ['Zhengyang Tang', 'Ziniu Li', 'Zhenyang Xiao', 'Tian Ding', 'Ruoyu Sun', 'Benyou Wang', 'Dayiheng Liu', 'Fei Huang', 'Tianyu Liu', 'Bowen Yu', 'Junyang Lin'], 'affiliations': ['Qwen Team, Alibaba Inc.', 'Shenzhen Research Institute of Big Data', 'The Chinese University of Hong Kong, Shenzhen'], 'pdf_title_img': 'assets/pdf/title_img/2501.14492.jpg', 'data': {'categories': ['#benchmark', '#interpretability', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Новый бенчмарк раскрывает истинный потенциал LLM в критическом мышлении', 'desc': 'Эта статья представляет новый бенчмарк для оценки способностей больших языковых моделей (LLM) к критике. В отличие от существующих бенчмарков, этот подход использует замкнутую методологию, оценивающую качество исправлений, сгенерированных на основе критики. Бенчмарк включает в себя самокритику, перекрестную критику и итеративную критику, что важно для различения способностей продвинутых моделей рассуждения от классических. 
Исследование показало, что классические LLM значительно отстают от продвинутых моделей рассуждения во всех сценариях критики, несмотря на сопоставимую производительность в прямой генерации цепочки рассуждений.'}, 'en': {'title': 'Enhancing LLMs Through Effective Critique Evaluation', 'desc': 'This paper focuses on improving Large Language Models (LLMs) by evaluating their critique capabilities, which are essential for self-improvement and providing feedback. The authors introduce a new benchmark that uses a closed-loop methodology to assess how well LLMs can generate corrections based on critiques. This benchmark includes features like self-critique, cross-critique, and iterative critique, allowing for a more nuanced evaluation of reasoning abilities. The findings reveal that advanced reasoning models outperform classical LLMs in critique scenarios, highlighting the need for better evaluation methods in machine learning.'}, 'zh': {'title': '提升LLMs性能的新基准评估批评能力', 'desc': '本文探讨了大型语言模型(LLMs)在批评能力方面的评估。我们提出了一种新的基准,采用闭环方法来评估批评生成的修正质量。该基准包括自我批评、交叉批评和迭代批评等特性,以区分高级推理模型与传统模型的能力。研究发现,尽管传统LLMs在直接思维生成方面表现相似,但在所有批评场景中,它们的表现明显落后于基于高级推理的模型o1-mini。'}}}, {'id': 'https://huggingface.co/papers/2501.14726', 'title': 'Relightable Full-Body Gaussian Codec Avatars', 'url': 'https://huggingface.co/papers/2501.14726', 'abstract': 'We propose Relightable Full-Body Gaussian Codec Avatars, a new approach for modeling relightable full-body avatars with fine-grained details including face and hands. The unique challenge for relighting full-body avatars lies in the large deformations caused by body articulation and the resulting impact on appearance caused by light transport. Changes in body pose can dramatically change the orientation of body surfaces with respect to lights, resulting in both local appearance changes due to changes in local light transport functions, as well as non-local changes due to occlusion between body parts. To address this, we decompose the light transport into local and non-local effects. Local appearance changes are modeled using learnable zonal harmonics for diffuse radiance transfer. Unlike spherical harmonics, zonal harmonics are highly efficient to rotate under articulation. This allows us to learn diffuse radiance transfer in a local coordinate frame, which disentangles the local radiance transfer from the articulation of the body. To account for non-local appearance changes, we introduce a shadow network that predicts shadows given precomputed incoming irradiance on a base mesh. This facilitates the learning of non-local shadowing between the body parts. Finally, we use a deferred shading approach to model specular radiance transfer and better capture reflections and highlights such as eye glints. 
We demonstrate that our approach successfully models both the local and non-local light transport required for relightable full-body avatars, with a superior generalization ability under novel illumination conditions and unseen poses.', 'score': 5, 'issue_id': 1873, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '0072ce1869c715b7', 'authors': ['Shaofei Wang', 'Tomas Simon', 'Igor Santesteban', 'Timur Bagautdinov', 'Junxuan Li', 'Vasu Agrawal', 'Fabian Prada', 'Shoou-I Yu', 'Pace Nalbone', 'Matt Gramlich', 'Roman Lubachersky', 'Chenglei Wu', 'Javier Romero', 'Jason Saragih', 'Michael Zollhoefer', 'Andreas Geiger', 'Siyu Tang', 'Shunsuke Saito'], 'affiliations': ['Codec Avatars Lab, Meta, USA', 'ETH Zürich, Switzerland', 'University of Tübingen, Germany'], 'pdf_title_img': 'assets/pdf/title_img/2501.14726.jpg', 'data': {'categories': ['#cv', '#3d'], 'emoji': '🕴️', 'ru': {'title': 'Реалистичное освещение для полноразмерных цифровых аватаров', 'desc': 'Статья представляет новый подход к моделированию полноразмерных аватаров с возможностью изменения освещения, включая детализацию лица и рук. Авторы предлагают декомпозицию световых эффектов на локальные и нелокальные, используя обучаемые зональные гармоники для диффузного переноса освещения и специальную нейронную сеть для предсказания теней. Метод также включает отложенный шейдинг для моделирования зеркального переноса освещения. Результаты демонстрируют успешное моделирование как локального, так и нелокального переноса света для полноразмерных аватаров с улучшенной способностью к обобщению в новых условиях освещения и позах.'}, 'en': {'title': 'Realistic Relightable Avatars Through Advanced Light Transport Modeling', 'desc': 'This paper presents a novel method for creating relightable full-body avatars that capture intricate details like facial features and hands. The authors tackle the challenge of how body movements affect lighting and appearance by separating light transport into local and non-local effects. They utilize learnable zonal harmonics to efficiently model local changes in appearance due to body articulation, while a shadow network predicts non-local shadowing effects between body parts. The proposed approach enhances the realism of avatars under varying lighting conditions and poses, demonstrating improved generalization capabilities.'}, 'zh': {'title': '可重光照的全身头像建模新方法', 'desc': '我们提出了一种新的方法,称为可重光照的全身高斯编码头像,旨在建模具有细致面部和手部特征的全身头像。该方法解决了由于身体关节运动引起的大变形对外观的影响,特别是光传输的变化。我们将光传输分解为局部和非局部效应,使用可学习的区域谐波来建模局部外观变化,并引入阴影网络来预测身体部位之间的阴影。最终,我们采用延迟着色方法来建模镜面反射,以更好地捕捉反射和高光效果。'}}}, {'id': 'https://huggingface.co/papers/2501.14176', 'title': 'RL + Transformer = A General-Purpose Problem Solver', 'url': 'https://huggingface.co/papers/2501.14176', 'abstract': 'What if artificial intelligence could not only solve problems for which it was trained but also learn to teach itself to solve new problems (i.e., meta-learn)? In this study, we demonstrate that a pre-trained transformer fine-tuned with reinforcement learning over multiple episodes develops the ability to solve problems that it has never encountered before - an emergent ability called In-Context Reinforcement Learning (ICRL). This powerful meta-learner not only excels in solving unseen in-distribution environments with remarkable sample efficiency, but also shows strong performance in out-of-distribution environments. 
In addition, we show that it exhibits robustness to the quality of its training data, seamlessly stitches together behaviors from its context, and adapts to non-stationary environments. These behaviors demonstrate that an RL-trained transformer can iteratively improve upon its own solutions, making it an excellent general-purpose problem solver.', 'score': 4, 'issue_id': 1884, 'pub_date': '2025-01-24', 'pub_date_card': {'ru': '24 января', 'en': 'January 24', 'zh': '1月24日'}, 'hash': '708deafdf9ddb570', 'authors': ['Micah Rentschler', 'Jesse Roberts'], 'affiliations': ['Tennessee Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.14176.jpg', 'data': {'categories': ['#training', '#transfer_learning', '#optimization', '#agi', '#rl'], 'emoji': '🧠', 'ru': {'title': 'Трансформер учится учиться: новые горизонты искусственного интеллекта', 'desc': 'Исследование демонстрирует, что предобученный трансформер, дообученный с помощью обучения с подкреплением, развивает способность решать новые задачи - так называемое контекстное обучение с подкреплением (ICRL). Эта мета-обучающаяся модель эффективно решает не только задачи из распределения обучающих данных, но и задачи вне этого распределения. Модель показывает устойчивость к качеству обучающих данных и способность адаптироваться к нестационарным средам. Это свидетельствует о том, что трансформер, обученный с подкреплением, может итеративно улучшать свои решения.'}, 'en': {'title': 'Empowering AI: Learning to Solve New Problems with In-Context Reinforcement Learning', 'desc': 'This paper explores the concept of In-Context Reinforcement Learning (ICRL), where a pre-trained transformer model learns to solve new problems through reinforcement learning. The model shows remarkable sample efficiency, allowing it to tackle unseen problems effectively, both in familiar and unfamiliar environments. It also demonstrates robustness to varying training data quality and adapts well to changing conditions. Overall, the study highlights the potential of RL-trained transformers as versatile problem solvers capable of self-improvement.'}, 'zh': {'title': '元学习:让AI自我解决新问题的能力', 'desc': '本研究展示了一种预训练的变换器模型,通过强化学习进行微调,能够解决之前未遇到过的问题,这种能力被称为上下文强化学习(ICRL)。这种强大的元学习者在处理未见过的环境时表现出色,具有显著的样本效率,并且在分布外环境中也表现良好。此外,它对训练数据的质量具有鲁棒性,能够无缝地结合上下文中的行为,并适应非平稳环境。这些特性表明,经过强化学习训练的变换器能够不断改进自己的解决方案,成为一种优秀的通用问题解决者。'}}}, {'id': 'https://huggingface.co/papers/2501.13687', 'title': 'Question Answering on Patient Medical Records with Private Fine-Tuned LLMs', 'url': 'https://huggingface.co/papers/2501.13687', 'abstract': 'Healthcare systems continuously generate vast amounts of electronic health records (EHRs), commonly stored in the Fast Healthcare Interoperability Resources (FHIR) standard. Despite the wealth of information in these records, their complexity and volume make it difficult for users to retrieve and interpret crucial health insights. Recent advances in Large Language Models (LLMs) offer a solution, enabling semantic question answering (QA) over medical data, allowing users to interact with their health records more effectively. However, ensuring privacy and compliance requires edge and private deployments of LLMs. This paper proposes a novel approach to semantic QA over EHRs by first identifying the most relevant FHIR resources for a user query (Task1) and subsequently answering the query based on these resources (Task2). We explore the performance of privately hosted, fine-tuned LLMs, evaluating them against benchmark models such as GPT-4 and GPT-4o. 
Our results demonstrate that fine-tuned LLMs, while 250x smaller in size, outperform GPT-4 family models by 0.55% in F1 score on Task1 and 42% on Meteor Task in Task2. Additionally, we examine advanced aspects of LLM usage, including sequential fine-tuning, model self-evaluation (narcissistic evaluation), and the impact of training data size on performance. The models and datasets are available here: https://huggingface.co/genloop', 'score': 3, 'issue_id': 1885, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '710359a2b4f5f274', 'authors': ['Sara Kothari', 'Ayush Gupta'], 'affiliations': ['Department of Computer Science Stanford University', 'Genloop Labs, Inc. Delaware, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.13687.jpg', 'data': {'categories': ['#open_source', '#multimodal', '#dataset', '#training', '#science', '#benchmark', '#healthcare'], 'emoji': '🏥', 'ru': {'title': 'Эффективный анализ медицинских данных с помощью LLM', 'desc': 'Статья представляет новый подход к семантическому вопросно-ответному анализу электронных медицинских карт (ЭМК) с использованием больших языковых моделей (LLM). Авторы предлагают двухэтапный метод: сначала идентифицируются релевантные ресурсы FHIR, затем на их основе формируется ответ на запрос пользователя. Исследование показывает, что дообученные LLM меньшего размера превосходят модели семейства GPT-4 по ряду метрик. Также рассматриваются продвинутые аспекты использования LLM, включая последовательную тонкую настройку и самооценку моделей.'}, 'en': {'title': 'Unlocking Health Insights with Fine-Tuned Language Models', 'desc': 'This paper addresses the challenge of extracting meaningful insights from electronic health records (EHRs) using Large Language Models (LLMs). It introduces a two-step approach for semantic question answering (QA) that first identifies relevant FHIR resources and then answers user queries based on those resources. The study evaluates privately hosted, fine-tuned LLMs against benchmark models like GPT-4, showing that these smaller models can outperform larger ones in specific tasks. Additionally, it explores advanced techniques such as sequential fine-tuning and the effects of training data size on model performance.'}, 'zh': {'title': '提升医疗数据问答的智能化与隐私保护', 'desc': '本论文提出了一种新的语义问答方法,旨在提高用户对电子健康记录(EHRs)的访问和理解。首先,通过识别与用户查询最相关的FHIR资源(任务1),然后基于这些资源回答查询(任务2)。研究表明,经过微调的私有托管大型语言模型(LLMs)在任务1的F1分数上比GPT-4模型高出0.55%,在任务2的Meteor任务上高出42%。此外,论文还探讨了模型的自我评估和训练数据规模对性能的影响。'}}}, {'id': 'https://huggingface.co/papers/2501.13925', 'title': 'GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing', 'url': 'https://huggingface.co/papers/2501.13925', 'abstract': 'Recent advances in large multimodal models (LMMs) have recognized fine-grained grounding as an imperative factor of visual understanding and dialogue. However, the benefits of such representation in LMMs are limited to the natural image domain, and these models perform poorly for remote sensing (RS). The distinct overhead viewpoint, scale variation, and presence of small objects in high-resolution RS imagery present a unique challenge in region-level comprehension. Moreover, the development of the grounding conversation capability of LMMs within RS is hindered by the lack of granular, RS domain-specific grounded data. Addressing these limitations, we propose GeoPixel - the first end-to-end high resolution RS-LMM that supports pixel-level grounding. 
This capability allows fine-grained visual perception by generating interleaved masks in conversation. GeoPixel supports up to 4K HD resolution in any aspect ratio, ideal for high-precision RS image analysis. To support the grounded conversation generation (GCG) in RS imagery, we curate a visually grounded dataset GeoPixelD through a semi-automated pipeline that utilizes set-of-marks prompting and spatial priors tailored for RS data to methodically control the data generation process. GeoPixel demonstrates superior performance in pixel-level comprehension, surpassing existing LMMs in both single-target and multi-target segmentation tasks. Our methodological ablation studies validate the effectiveness of each component in the overall architecture. Our code and data will be publicly released.', 'score': 3, 'issue_id': 1883, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '0c6257aa10e28148', 'authors': ['Akashah Shabbir', 'Mohammed Zumri', 'Mohammed Bennamoun', 'Fahad S. Khan', 'Salman Khan'], 'affiliations': ['Australian National University', 'Linkoping University', 'Mohamed bin Zayed University of AI', 'The University of Western Australia'], 'pdf_title_img': 'assets/pdf/title_img/2501.13925.jpg', 'data': {'categories': ['#open_source', '#architecture', '#dataset', '#multimodal', '#data', '#games', '#optimization'], 'emoji': '🛰️', 'ru': {'title': 'GeoPixel: Новый уровень детализации в анализе спутниковых снимков', 'desc': 'Статья представляет GeoPixel - первую модель для дистанционного зондирования с поддержкой пиксельного уровня детализации. Модель способна анализировать изображения высокого разрешения до 4K, что идеально подходит для точного анализа спутниковых снимков. Для обучения модели был создан специализированный датасет GeoPixelD с аннотациями на уровне пикселей. GeoPixel превосходит существующие мультимодальные модели в задачах сегментации как одиночных, так и множественных объектов на спутниковых снимках.'}, 'en': {'title': 'GeoPixel: Revolutionizing Remote Sensing with Pixel-Level Grounding', 'desc': "This paper introduces GeoPixel, a novel large multimodal model designed specifically for remote sensing imagery. It addresses the challenges of fine-grained grounding in high-resolution images, which are often complicated by factors like scale variation and small object presence. GeoPixel enhances visual understanding by enabling pixel-level grounding and generating interleaved masks during conversations. The authors also present a new dataset, GeoPixelD, which is tailored for remote sensing tasks and supports the model's grounded conversation capabilities."}, 'zh': {'title': 'GeoPixel:高分辨率遥感图像的像素级理解', 'desc': '最近,大型多模态模型(LMMs)的进展表明,细粒度的基础是视觉理解和对话的重要因素。然而,这些模型在遥感(RS)领域的表现较差,主要是由于遥感图像的独特挑战,如视角、尺度变化和小物体的存在。为了解决这些问题,我们提出了GeoPixel,这是第一个支持像素级基础的高分辨率RS-LMM,能够生成交错的掩码以实现细粒度的视觉感知。GeoPixel在单目标和多目标分割任务中表现优于现有的LMMs,展示了其在高精度遥感图像分析中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2403.14614', 'title': 'AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation', 'url': 'https://huggingface.co/papers/2403.14614', 'abstract': 'In the image acquisition process, various forms of degradation, including noise, haze, and rain, are frequently introduced. These degradations typically arise from the inherent limitations of cameras or unfavorable ambient conditions. To recover clean images from degraded versions, numerous specialized restoration methods have been developed, each targeting a specific type of degradation. 
Recently, all-in-one algorithms have garnered significant attention by addressing different types of degradations within a single model without requiring prior information of the input degradation type. However, these methods purely operate in the spatial domain and do not delve into the distinct frequency variations inherent to different degradation types. To address this gap, we propose an adaptive all-in-one image restoration network based on frequency mining and modulation. Our approach is motivated by the observation that different degradation types impact the image content on different frequency subbands, thereby requiring different treatments for each restoration task. Specifically, we first mine low- and high-frequency information from the input features, guided by the adaptively decoupled spectra of the degraded image. The extracted features are then modulated by a bidirectional operator to facilitate interactions between different frequency components. Finally, the modulated features are merged into the original input for a progressively guided restoration. With this approach, the model achieves adaptive reconstruction by accentuating the informative frequency subbands according to different input degradations. Extensive experiments demonstrate that the proposed method achieves state-of-the-art performance on different image restoration tasks, including denoising, dehazing, deraining, motion deblurring, and low-light image enhancement. Our code is available at https://github.com/c-yn/AdaIR.', 'score': 2, 'issue_id': 1883, 'pub_date': '2024-03-21', 'pub_date_card': {'ru': '21 марта', 'en': 'March 21', 'zh': '3月21日'}, 'hash': '54f7acd2a97e8313', 'authors': ['Yuning Cui', 'Syed Waqas Zamir', 'Salman Khan', 'Alois Knoll', 'Mubarak Shah', 'Fahad Shahbaz Khan'], 'affiliations': ['Australian National University', 'Inception Institute of Artificial Intelligence', 'Linköping University', 'Mohammed Bin Zayed University of AI', 'Technical University of Munich', 'University of Central Florida'], 'pdf_title_img': 'assets/pdf/title_img/2403.14614.jpg', 'data': {'categories': ['#cv'], 'emoji': '🖼️', 'ru': {'title': 'Адаптивное восстановление изображений на основе частотного анализа', 'desc': 'Статья описывает новый метод адаптивного восстановления изображений, пострадавших от различных видов деградации (шум, туман, дождь и т.д.). Авторы предлагают нейронную сеть, которая анализирует частотные характеристики искажений и адаптивно применяет соответствующие методы обработки. Подход основан на извлечении низко- и высокочастотной информации из входных данных и их модуляции с помощью двунаправленного оператора. Эксперименты показывают, что предложенный метод достигает наилучших результатов в различных задачах восстановления изображений.'}, 'en': {'title': 'Adaptive Image Restoration through Frequency Modulation', 'desc': 'This paper presents an innovative image restoration network that adapts to various types of image degradation, such as noise, haze, and rain. Unlike traditional methods that focus solely on spatial domain processing, this approach utilizes frequency mining to identify and modulate low- and high-frequency information specific to each degradation type. By employing a bidirectional operator, the model enhances interactions between different frequency components, allowing for more effective restoration. 
The results show that this adaptive method outperforms existing techniques across multiple restoration tasks, demonstrating its versatility and effectiveness.'}, 'zh': {'title': '自适应一体化图像修复:频率驱动的创新', 'desc': '在图像获取过程中,常常会出现噪声、雾霾和雨水等各种退化形式。这些退化通常源于相机的固有限制或不利的环境条件。为了从退化图像中恢复清晰图像,已经开发了许多专门的修复方法。我们提出了一种基于频率挖掘和调制的自适应一体化图像修复网络,能够在单一模型中处理不同类型的退化,且无需输入退化类型的先验信息。'}}}, {'id': 'https://huggingface.co/papers/2411.19458', 'title': 'Multiview Equivariance Improves 3D Correspondence Understanding with Minimal Feature Finetuning', 'url': 'https://huggingface.co/papers/2411.19458', 'abstract': 'Vision foundation models, particularly the ViT family, have revolutionized image understanding by providing rich semantic features. However, despite their success in 2D comprehension, their ability to grasp 3D spatial relationships remains unclear. In this work, we evaluate and enhance the 3D awareness of ViT-based models. We begin by systematically assessing their ability to learn 3D equivariant features, specifically examining the consistency of semantic embeddings across different viewpoints. Our findings indicate that improved 3D equivariance leads to better performance on various downstream tasks, including pose estimation, tracking, and semantic transfer. Building on this insight, we propose a simple yet effective finetuning strategy based on 3D correspondences, which significantly enhances the 3D correspondence understanding of existing vision models. Remarkably, even finetuning on a single object for just one iteration results in substantial performance gains. All code and resources will be made publicly available to support further advancements in 3D-aware vision models. Our code is available at https://github.com/qq456cvb/3DCorrEnhance.', 'score': 1, 'issue_id': 1883, 'pub_date': '2024-11-29', 'pub_date_card': {'ru': '29 ноября', 'en': 'November 29', 'zh': '11月29日'}, 'hash': 'df24163a81379619', 'authors': ['Yang You', 'Yixin Li', 'Congyue Deng', 'Yue Wang', 'Leonidas Guibas'], 'affiliations': ['Department of Computer Science, Stanford University, U.S.A.', 'Department of Computer Science, University of Southern California, U.S.A.'], 'pdf_title_img': 'assets/pdf/title_img/2411.19458.jpg', 'data': {'categories': ['#cv', '#open_source', '#3d', '#training'], 'emoji': '🧊', 'ru': {'title': 'Повышение 3D-осведомленности моделей компьютерного зрения', 'desc': 'Статья посвящена исследованию и улучшению понимания трехмерных пространственных отношений моделями компьютерного зрения, основанными на архитектуре ViT. Авторы оценивают способность этих моделей изучать 3D-эквивариантные признаки и обнаруживают, что улучшение 3D-эквивариантности приводит к повышению производительности в различных задачах. Они предлагают эффективную стратегию дообучения на основе 3D-соответствий, которая значительно улучшает понимание трехмерных соответствий существующими моделями. Даже минимальное дообучение на одном объекте приводит к существенному повышению производительности.'}, 'en': {'title': 'Enhancing 3D Awareness in Vision Transformers', 'desc': "This paper focuses on improving the 3D understanding capabilities of Vision Transformer (ViT) models, which are known for their strong performance in 2D image analysis. The authors evaluate how well these models can learn 3D equivariant features, which are essential for maintaining consistent semantic meanings across different viewpoints. They discover that enhancing 3D equivariance significantly boosts the models' performance on tasks like pose estimation and tracking. 
To achieve this, they introduce a straightforward finetuning method that leverages 3D correspondences, showing that even minimal finetuning can lead to notable improvements in 3D comprehension."}, 'zh': {'title': '提升视觉模型的3D理解能力', 'desc': '本文探讨了视觉基础模型,特别是ViT系列在图像理解中的应用,尤其是其在3D空间关系理解方面的能力。我们系统评估了这些模型学习3D等变特征的能力,重点分析了不同视角下语义嵌入的一致性。研究表明,提升3D等变性可以显著改善在姿态估计、跟踪和语义转移等下游任务中的表现。基于这一发现,我们提出了一种简单有效的微调策略,通过3D对应关系显著增强现有视觉模型的3D理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.11325', 'title': 'CatV2TON: Taming Diffusion Transformers for Vision-Based Virtual Try-On with Temporal Concatenation', 'url': 'https://huggingface.co/papers/2501.11325', 'abstract': 'Virtual try-on (VTON) technology has gained attention due to its potential to transform online retail by enabling realistic clothing visualization of images and videos. However, most existing methods struggle to achieve high-quality results across image and video try-on tasks, especially in long video scenarios. In this work, we introduce CatV2TON, a simple and effective vision-based virtual try-on (V2TON) method that supports both image and video try-on tasks with a single diffusion transformer model. By temporally concatenating garment and person inputs and training on a mix of image and video datasets, CatV2TON achieves robust try-on performance across static and dynamic settings. For efficient long-video generation, we propose an overlapping clip-based inference strategy that uses sequential frame guidance and Adaptive Clip Normalization (AdaCN) to maintain temporal consistency with reduced resource demands. We also present ViViD-S, a refined video try-on dataset, achieved by filtering back-facing frames and applying 3D mask smoothing for enhanced temporal consistency. Comprehensive experiments demonstrate that CatV2TON outperforms existing methods in both image and video try-on tasks, offering a versatile and reliable solution for realistic virtual try-ons across diverse scenarios.', 'score': 0, 'issue_id': 1887, 'pub_date': '2025-01-20', 'pub_date_card': {'ru': '20 января', 'en': 'January 20', 'zh': '1月20日'}, 'hash': '3b21eab627e1a9f7', 'authors': ['Zheng Chong', 'Wenqing Zhang', 'Shiyue Zhang', 'Jun Zheng', 'Xiao Dong', 'Haoxiang Li', 'Yiling Wu', 'Dongmei Jiang', 'Xiaodan Liang'], 'affiliations': ['National University of Singapore', 'Pengcheng Laboratory', 'Pixocial Technology', 'Sun Yat-Sen University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11325.jpg', 'data': {'categories': ['#cv', '#multimodal', '#dataset', '#video'], 'emoji': '👚', 'ru': {'title': 'Универсальная виртуальная примерка для изображений и видео', 'desc': 'CatV2TON - это новый метод виртуальной примерки одежды, использующий диффузионный трансформер для обработки как изображений, так и видео. Метод применяет технику конкатенации входных данных и обучение на смешанном наборе изображений и видео для достижения высокого качества результатов. Для эффективной обработки длинных видео предложена стратегия инференса с перекрывающимися клипами и адаптивной нормализацией. Авторы также представили улучшенный датасет ViViD-S для задачи виртуальной примерки на видео.'}, 'en': {'title': 'Transforming Virtual Try-Ons with CatV2TON: One Model, Many Scenarios!', 'desc': 'This paper presents CatV2TON, a novel virtual try-on method that utilizes a single diffusion transformer model for both image and video applications. 
It addresses the challenges of achieving high-quality results in long video scenarios by employing an overlapping clip-based inference strategy, which enhances temporal consistency. The method is trained on a diverse dataset that includes both images and videos, allowing it to perform effectively in various settings. Experimental results show that CatV2TON outperforms existing techniques, making it a promising solution for realistic virtual clothing visualization.'}, 'zh': {'title': 'CatV2TON:高效的虚拟试穿解决方案', 'desc': '虚拟试穿(VTON)技术在在线零售中引起了广泛关注,因为它能够实现真实的服装可视化。现有的方法在图像和视频试穿任务中,尤其是长视频场景中,往往难以达到高质量的效果。我们提出的CatV2TON是一种简单有效的基于视觉的虚拟试穿方法,能够支持图像和视频试穿任务,并使用单一的扩散变换器模型。通过时间上连接服装和人物输入,并在混合的图像和视频数据集上进行训练,CatV2TON在静态和动态场景中都表现出强大的试穿性能。'}}}, {'id': 'https://huggingface.co/papers/2406.18516', 'title': 'Denoising as Adaptation: Noise-Space Domain Adaptation for Image Restoration', 'url': 'https://huggingface.co/papers/2406.18516', 'abstract': 'Although learning-based image restoration methods have made significant progress, they still struggle with limited generalization to real-world scenarios due to the substantial domain gap caused by training on synthetic data. Existing methods address this issue by improving data synthesis pipelines, estimating degradation kernels, employing deep internal learning, and performing domain adaptation and regularization. Previous domain adaptation methods have sought to bridge the domain gap by learning domain-invariant knowledge in either feature or pixel space. However, these techniques often struggle to extend to low-level vision tasks within a stable and compact framework. In this paper, we show that it is possible to perform domain adaptation via the noise space using diffusion models. In particular, by leveraging the unique property of how auxiliary conditional inputs influence the multi-step denoising process, we derive a meaningful diffusion loss that guides the restoration model in progressively aligning both restored synthetic and real-world outputs with a target clean distribution. We refer to this method as denoising as adaptation. To prevent shortcuts during joint training, we present crucial strategies such as channel-shuffling layer and residual-swapping contrastive learning in the diffusion model. They implicitly blur the boundaries between conditioned synthetic and real data and prevent the reliance of the model on easily distinguishable features. Experimental results on three classical image restoration tasks, namely denoising, deblurring, and deraining, demonstrate the effectiveness of the proposed method.', 'score': 0, 'issue_id': 1883, 'pub_date': '2024-06-26', 'pub_date_card': {'ru': '26 июня', 'en': 'June 26', 'zh': '6月26日'}, 'hash': 'ef06fd4cf15b3995', 'authors': ['Kang Liao', 'Zongsheng Yue', 'Zhouxia Wang', 'Chen Change Loy'], 'affiliations': ['S-Lab, Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2406.18516.jpg', 'data': {'categories': ['#training', '#diffusion', '#data', '#optimization', '#cv', '#transfer_learning'], 'emoji': '🖼️', 'ru': {'title': 'Адаптация домена через шум: новый подход к восстановлению изображений', 'desc': "Статья представляет новый метод адаптации домена для задач восстановления изображений с использованием диффузионных моделей. Авторы предлагают выполнять адаптацию через пространство шума, используя уникальные свойства многоступенчатого процесса удаления шума. 
Метод, названный 'denoising as adaptation', направляет модель восстановления на постепенное выравнивание как синтетических, так и реальных выходных данных с целевым чистым распределением. Экспериментальные результаты на задачах шумоподавления, устранения размытия и удаления дождя демонстрируют эффективность предложенного подхода."}, 'en': {'title': 'Bridging the Gap: Denoising as Adaptation for Image Restoration', 'desc': "This paper addresses the challenge of image restoration methods that struggle to generalize to real-world scenarios due to the gap between synthetic training data and real data. The authors propose a novel approach called 'denoising as adaptation' that utilizes diffusion models to perform domain adaptation in the noise space. By introducing a diffusion loss that aligns synthetic and real-world outputs, the method effectively guides the restoration process. Additionally, strategies like channel-shuffling and residual-swapping contrastive learning are implemented to enhance the model's robustness against overfitting to distinguishable features."}, 'zh': {'title': '去噪作为适应:提升图像恢复的领域适应能力', 'desc': '本文探讨了基于学习的图像恢复方法在真实场景中的泛化能力不足的问题,主要是由于训练数据与真实数据之间存在显著的领域差距。我们提出了一种新的领域适应方法,通过噪声空间利用扩散模型来实现,特别是利用辅助条件输入对多步去噪过程的影响,导出了一种有意义的扩散损失。该方法称为去噪作为适应,能够逐步对齐恢复的合成图像和真实图像。实验结果表明,该方法在去噪、去模糊和去雨等经典图像恢复任务中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.12948', 'title': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning', 'url': 'https://huggingface.co/papers/2501.12948', 'abstract': 'We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.', 'score': 94, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'cae642107ec57790', 'authors': ['DeepSeek-AI', 'Daya Guo', 'Dejian Yang', 'Haowei Zhang', 'Junxiao Song', 'Ruoyu Zhang', 'Runxin Xu', 'Qihao Zhu', 'Shirong Ma', 'Peiyi Wang', 'Xiao Bi', 'Xiaokang Zhang', 'Xingkai Yu', 'Yu Wu', 'Z. F. Wu', 'Zhibin Gou', 'Zhihong Shao', 'Zhuoshu Li', 'Ziyi Gao', 'Aixin Liu', 'Bing Xue', 'Bingxuan Wang', 'Bochao Wu', 'Bei Feng', 'Chengda Lu', 'Chenggang Zhao', 'Chengqi Deng', 'Chenyu Zhang', 'Chong Ruan', 'Damai Dai', 'Deli Chen', 'Dongjie Ji', 'Erhang Li', 'Fangyun Lin', 'Fucong Dai', 'Fuli Luo', 'Guangbo Hao', 'Guanting Chen', 'Guowei Li', 'H. Zhang', 'Han Bao', 'Hanwei Xu', 'Haocheng Wang', 'Honghui Ding', 'Huajian Xin', 'Huazuo Gao', 'Hui Qu', 'Hui Li', 'Jianzhong Guo', 'Jiashi Li', 'Jiawei Wang', 'Jingchang Chen', 'Jingyang Yuan', 'Junjie Qiu', 'Junlong Li', 'J. L. 
Cai', 'Jiaqi Ni', 'Jian Liang', 'Jin Chen', 'Kai Dong', 'Kai Hu', 'Kaige Gao', 'Kang Guan', 'Kexin Huang', 'Kuai Yu', 'Lean Wang', 'Lecong Zhang', 'Liang Zhao', 'Litong Wang', 'Liyue Zhang', 'Lei Xu', 'Leyi Xia', 'Mingchuan Zhang', 'Minghua Zhang', 'Minghui Tang', 'Meng Li', 'Miaojun Wang', 'Mingming Li', 'Ning Tian', 'Panpan Huang', 'Peng Zhang', 'Qiancheng Wang', 'Qinyu Chen', 'Qiushi Du', 'Ruiqi Ge', 'Ruisong Zhang', 'Ruizhe Pan', 'Runji Wang', 'R. J. Chen', 'R. L. Jin', 'Ruyi Chen', 'Shanghao Lu', 'Shangyan Zhou', 'Shanhuang Chen', 'Shengfeng Ye', 'Shiyu Wang', 'Shuiping Yu', 'Shunfeng Zhou', 'Shuting Pan', 'S. S. Li', 'Shuang Zhou', 'Shaoqing Wu', 'Shengfeng Ye', 'Tao Yun', 'Tian Pei', 'Tianyu Sun', 'T. Wang', 'Wangding Zeng', 'Wanjia Zhao', 'Wen Liu', 'Wenfeng Liang', 'Wenjun Gao', 'Wenqin Yu', 'Wentao Zhang', 'W. L. Xiao', 'Wei An', 'Xiaodong Liu', 'Xiaohan Wang', 'Xiaokang Chen', 'Xiaotao Nie', 'Xin Cheng', 'Xin Liu', 'Xin Xie', 'Xingchao Liu', 'Xinyu Yang', 'Xinyuan Li', 'Xuecheng Su', 'Xuheng Lin', 'X. Q. Li', 'Xiangyue Jin', 'Xiaojin Shen', 'Xiaosha Chen', 'Xiaowen Sun', 'Xiaoxiang Wang', 'Xinnan Song', 'Xinyi Zhou', 'Xianzu Wang', 'Xinxia Shan', 'Y. K. Li', 'Y. Q. Wang', 'Y. X. Wei', 'Yang Zhang', 'Yanhong Xu', 'Yao Li', 'Yao Zhao', 'Yaofeng Sun', 'Yaohui Wang', 'Yi Yu', 'Yichao Zhang', 'Yifan Shi', 'Yiliang Xiong', 'Ying He', 'Yishi Piao', 'Yisong Wang', 'Yixuan Tan', 'Yiyang Ma', 'Yiyuan Liu', 'Yongqiang Guo', 'Yuan Ou', 'Yuduan Wang', 'Yue Gong', 'Yuheng Zou', 'Yujia He', 'Yunfan Xiong', 'Yuxiang Luo', 'Yuxiang You', 'Yuxuan Liu', 'Yuyang Zhou', 'Y. X. Zhu', 'Yanhong Xu', 'Yanping Huang', 'Yaohui Li', 'Yi Zheng', 'Yuchen Zhu', 'Yunxian Ma', 'Ying Tang', 'Yukun Zha', 'Yuting Yan', 'Z. Z. Ren', 'Zehui Ren', 'Zhangli Sha', 'Zhe Fu', 'Zhean Xu', 'Zhenda Xie', 'Zhengyan Zhang', 'Zhewen Hao', 'Zhicheng Ma', 'Zhigang Yan', 'Zhiyu Wu', 'Zihui Gu', 'Zijia Zhu', 'Zijun Liu', 'Zilin Li', 'Ziwei Xie', 'Ziyang Song', 'Zizheng Pan', 'Zhen Huang', 'Zhipeng Xu', 'Zhongyu Zhang', 'Zhen Zhang'], 'affiliations': ['DeepSeek-AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.12948.jpg', 'data': {'categories': ['#training', '#rl', '#reasoning', '#open_source', '#dataset'], 'emoji': '🧠', 'ru': {'title': 'Новое поколение моделей рассуждения: обучение с подкреплением открывает путь к улучшенному ИИ', 'desc': 'Исследователи представили модели рассуждений DeepSeek-R1-Zero и DeepSeek-R1. DeepSeek-R1-Zero обучена с помощью масштабного обучения с подкреплением без предварительной тонкой настройки и демонстрирует впечатляющие способности к рассуждению. DeepSeek-R1 использует многоэтапное обучение для улучшения производительности и решения проблем читаемости. Модели показывают результаты, сравнимые с OpenAI-o1-1217 на задачах рассуждения, и исследователи открыли исходный код моделей для научного сообщества.'}, 'en': {'title': 'Revolutionizing Reasoning with DeepSeek Models', 'desc': 'This paper presents two reasoning models, DeepSeek-R1-Zero and DeepSeek-R1, developed for enhanced reasoning capabilities. DeepSeek-R1-Zero is trained using large-scale reinforcement learning without any supervised fine-tuning, showcasing impressive reasoning behaviors but facing issues like readability and language mixing. To improve these aspects, DeepSeek-R1 employs a multi-stage training approach and utilizes cold-start data prior to reinforcement learning. 
The performance of DeepSeek-R1 is on par with existing models like OpenAI-o1-1217, and both models, along with several distilled versions, are made available to the research community.'}, 'zh': {'title': '深度推理模型的创新与挑战', 'desc': '我们介绍了第一代推理模型DeepSeek-R1-Zero和DeepSeek-R1。DeepSeek-R1-Zero是通过大规模强化学习(RL)训练的模型,没有经过监督微调(SFT),展现出卓越的推理能力。尽管如此,它在可读性和语言混合方面存在一些挑战。为了解决这些问题并进一步提升推理性能,我们引入了DeepSeek-R1,该模型在进行RL之前采用了多阶段训练和冷启动数据。'}}}, {'id': 'https://huggingface.co/papers/2501.12909', 'title': 'FilmAgent: A Multi-Agent Framework for End-to-End Film Automation in Virtual 3D Spaces', 'url': 'https://huggingface.co/papers/2501.12909', 'abstract': "Virtual film production requires intricate decision-making processes, including scriptwriting, virtual cinematography, and precise actor positioning and actions. Motivated by recent advances in automated decision-making with language agent-based societies, this paper introduces FilmAgent, a novel LLM-based multi-agent collaborative framework for end-to-end film automation in our constructed 3D virtual spaces. FilmAgent simulates various crew roles, including directors, screenwriters, actors, and cinematographers, and covers key stages of a film production workflow: (1) idea development transforms brainstormed ideas into structured story outlines; (2) scriptwriting elaborates on dialogue and character actions for each scene; (3) cinematography determines the camera setups for each shot. A team of agents collaborates through iterative feedback and revisions, thereby verifying intermediate scripts and reducing hallucinations. We evaluate the generated videos on 15 ideas and 4 key aspects. Human evaluation shows that FilmAgent outperforms all baselines across all aspects and scores 3.98 out of 5 on average, showing the feasibility of multi-agent collaboration in filmmaking. Further analysis reveals that FilmAgent, despite using the less advanced GPT-4o model, surpasses the single-agent o1, showing the advantage of a well-coordinated multi-agent system. Lastly, we discuss the complementary strengths and weaknesses of OpenAI's text-to-video model Sora and our FilmAgent in filmmaking.", 'score': 43, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '0b73908eee2c2db3', 'authors': ['Zhenran Xu', 'Longyue Wang', 'Jifang Wang', 'Zhouyi Li', 'Senbao Shi', 'Xue Yang', 'Yiyu Wang', 'Baotian Hu', 'Jun Yu', 'Min Zhang'], 'affiliations': ['Harbin Institute of Technology (Shenzhen)', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.12909.jpg', 'data': {'categories': ['#multimodal', '#story_generation', '#3d', '#open_source', '#agents', '#hallucinations'], 'emoji': '🎬', 'ru': {'title': 'Виртуальная киностудия: ИИ-агенты создают фильмы от идеи до готового продукта', 'desc': 'FilmAgent - это новая система на основе языковых моделей для автоматизации создания фильмов в виртуальном 3D-пространстве. Она симулирует работу съемочной группы, включая режиссеров, сценаристов, актеров и операторов. Система охватывает ключевые этапы производства фильма: разработку идеи, написание сценария и выбор планов съемки. 
FilmAgent использует многоагентное взаимодействие для итеративной доработки результатов, что позволяет достичь лучшего качества по сравнению с одноагентными подходами.'}, 'en': {'title': 'Revolutionizing Film Production with Multi-Agent Collaboration', 'desc': 'This paper presents FilmAgent, a collaborative framework that utilizes large language models (LLMs) to automate the film production process in 3D virtual environments. FilmAgent employs multiple agents that simulate various roles in filmmaking, such as directors and screenwriters, to collaboratively develop ideas, write scripts, and plan cinematography. The framework enhances decision-making through iterative feedback, which helps to refine scripts and minimize errors. Evaluation results indicate that FilmAgent significantly outperforms traditional methods, demonstrating the effectiveness of multi-agent systems in creative tasks like filmmaking.'}, 'zh': {'title': '多智能体协作,革新虚拟电影制作', 'desc': '这篇论文介绍了一种名为FilmAgent的新型多智能体协作框架,旨在实现虚拟电影制作的自动化。FilmAgent利用大型语言模型(LLM)模拟导演、编剧、演员和摄影师等不同角色,涵盖电影制作的关键阶段,包括创意开发、剧本写作和摄影。通过智能体之间的迭代反馈和修订,FilmAgent能够验证中间剧本并减少错误。评估结果显示,FilmAgent在多个方面的表现优于所有基线模型,证明了多智能体协作在电影制作中的可行性。'}}}, {'id': 'https://huggingface.co/papers/2501.12895', 'title': 'Test-Time Preference Optimization: On-the-Fly Alignment via Iterative Textual Feedback', 'url': 'https://huggingface.co/papers/2501.12895', 'abstract': 'Large language models (LLMs) demonstrate impressive performance but lack the flexibility to adapt to human preferences quickly without retraining. In this work, we introduce Test-time Preference Optimization (TPO), a framework that aligns LLM outputs with human preferences during inference, removing the need to update model parameters. Rather than relying on purely numerical rewards, TPO translates reward signals into textual critiques and uses them as textual rewards to iteratively refine its response. Evaluations on benchmarks covering instruction following, preference alignment, safety, and mathematics reveal that TPO progressively improves alignment with human preferences. Notably, after only a few TPO steps, the initially unaligned Llama-3.1-70B-SFT model can surpass the aligned counterpart, Llama-3.1-70B-Instruct. Furthermore, TPO scales efficiently with both the search width and depth during inference. Through case studies, we illustrate how TPO exploits the innate capacity of LLM to interpret and act upon reward signals. Our findings establish TPO as a practical, lightweight alternative for test-time preference optimization, achieving alignment on the fly. Our code is publicly available at https://github.com/yafuly/TPO.', 'score': 40, 'issue_id': 1820, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'ebde6f173ad4f6f9', 'authors': ['Yafu Li', 'Xuyang Hu', 'Xiaoye Qu', 'Linjie Li', 'Yu Cheng'], 'affiliations': ['Shanghai AI Laboratory', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2501.12895.jpg', 'data': {'categories': ['#rlhf', '#training', '#alignment', '#inference'], 'emoji': '🎯', 'ru': {'title': 'Адаптация языковых моделей на лету: оптимизация без переобучения', 'desc': 'Авторы представляют новый метод под названием Test-time Preference Optimization (TPO), который позволяет адаптировать выходные данные больших языковых моделей (LLM) к предпочтениям человека во время вывода, без необходимости обновления параметров модели. 
TPO преобразует сигналы вознаграждения в текстовые критические замечания и использует их в качестве текстовых наград для итеративного улучшения ответа. Эксперименты показывают, что TPO постепенно улучшает соответствие предпочтениям человека, причем даже изначально не настроенная модель Llama-3.1-70B-SFT может превзойти настроенный аналог после нескольких шагов TPO. Метод демонстрирует эффективность и масштабируемость, представляя собой практичную альтернативу для оптимизации предпочтений во время вывода.'}, 'en': {'title': 'Aligning Language Models with Human Preferences on the Fly', 'desc': 'This paper presents Test-time Preference Optimization (TPO), a novel framework designed to enhance the alignment of large language model (LLM) outputs with human preferences during inference without the need for retraining. TPO utilizes textual critiques as a form of reward signals, allowing the model to iteratively refine its responses based on human feedback. The results show that TPO can significantly improve the performance of the Llama-3.1-70B-SFT model, enabling it to exceed the performance of the pre-aligned Llama-3.1-70B-Instruct model after just a few optimization steps. Additionally, TPO demonstrates efficient scaling with search width and depth, making it a practical solution for real-time preference alignment in LLMs.'}, 'zh': {'title': '测试时偏好优化:让模型更懂你', 'desc': '大型语言模型(LLMs)在性能上表现出色,但在不重新训练的情况下,难以快速适应人类偏好。我们提出了一种名为测试时偏好优化(TPO)的框架,它在推理过程中将LLM的输出与人类偏好对齐,避免了更新模型参数的需求。TPO通过将奖励信号转化为文本批评,并将其作为文本奖励,逐步优化模型的响应。评估结果显示,经过少量TPO步骤后,最初未对齐的Llama-3.1-70B-SFT模型能够超越已对齐的Llama-3.1-70B-Instruct模型。'}}}, {'id': 'https://huggingface.co/papers/2501.13106', 'title': 'VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding', 'url': 'https://huggingface.co/papers/2501.13106', 'abstract': 'In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation model for image and video understanding. The core design philosophy of VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the vision-centric training paradigm and vision-centric framework design. The key insight of our vision-centric training paradigm is that high-quality image-text data is crucial for both image and video understanding. Instead of preparing massive video-text datasets, we focus on constructing large-scale and high-quality image-text datasets. VideoLLaMA3 has four training stages: 1) vision-centric alignment stage, which warms up the vision encoder and projector; 2) vision-language pretraining stage, which jointly tunes the vision encoder, projector, and LLM with large-scale image-text data covering multiple types (including scene images, documents, charts) as well as text-only data. 3) multi-task fine-tuning stage, which incorporates image-text SFT data for downstream tasks and video-text data to establish a foundation for video understanding. 4) video-centric fine-tuning, which further improves the model\'s capability in video understanding. As for the framework design, to better capture fine-grained details in images, the pretrained vision encoder is adapted to encode images of varying sizes into vision tokens with corresponding numbers, rather than a fixed number of tokens. For video inputs, we reduce the number of vision tokens according to their similarity so that the representation of videos will be more precise and compact. 
Benefiting from vision-centric designs, VideoLLaMA3 achieves compelling performance on both image and video understanding benchmarks.', 'score': 39, 'issue_id': 1820, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'd22ea6b804e73c9a', 'authors': ['Boqiang Zhang', 'Kehan Li', 'Zesen Cheng', 'Zhiqiang Hu', 'Yuqian Yuan', 'Guanzheng Chen', 'Sicong Leng', 'Yuming Jiang', 'Hang Zhang', 'Xin Li', 'Peng Jin', 'Wenqi Zhang', 'Fan Wang', 'Lidong Bing', 'Deli Zhao'], 'affiliations': ['DAMO Academy, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.13106.jpg', 'data': {'categories': ['#multimodal', '#cv', '#agi', '#games', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'VideoLLaMA3: Зрение как ключ к пониманию изображений и видео', 'desc': 'VideoLLaMA3 - это усовершенствованная мультимодальная модель для понимания изображений и видео. Ключевая особенность модели - ориентированность на зрение, что проявляется как в парадигме обучения, так и в архитектуре. Модель обучается в четыре этапа, уделяя особое внимание высококачественным данным изображение-текст. VideoLLaMA3 использует адаптивное кодирование изображений разного размера и сжатие представления видео для более точного анализа.'}, 'en': {'title': 'Empowering Image and Video Understanding with Vision-Centric Design', 'desc': 'VideoLLaMA3 is a cutting-edge multimodal foundation model designed for understanding images and videos. It emphasizes a vision-centric approach, which involves training with high-quality image-text datasets instead of large video-text datasets. The model undergoes four training stages, including alignment, pretraining, fine-tuning, and video-centric fine-tuning, to enhance its capabilities in both image and video comprehension. By adapting the vision encoder to handle varying image sizes and optimizing video token representation, VideoLLaMA3 demonstrates impressive performance across various benchmarks.'}, 'zh': {'title': '以视觉为中心的多模态理解模型', 'desc': '本文提出了VideoLLaMA3,这是一个更先进的多模态基础模型,用于图像和视频理解。其核心设计理念是以视觉为中心,强调高质量的图像-文本数据对图像和视频理解的重要性。VideoLLaMA3的训练分为四个阶段,包括视觉对齐、视觉-语言预训练、多任务微调和视频微调,以提升模型在视频理解方面的能力。通过适应性地编码不同大小的图像和优化视频输入的表示,VideoLLaMA3在图像和视频理解基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.12599', 'title': 'Kimi k1.5: Scaling Reinforcement Learning with LLMs', 'url': 'https://huggingface.co/papers/2501.12599', 'abstract': "Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simplistic, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. 
Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI's o1. Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%).", 'score': 33, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '427fb9e286a6e3a8', 'authors': ['Kimi Team', 'Angang Du', 'Bofei Gao', 'Bowei Xing', 'Changjiu Jiang', 'Cheng Chen', 'Cheng Li', 'Chenjun Xiao', 'Chenzhuang Du', 'Chonghua Liao', 'Chuning Tang', 'Congcong Wang', 'Dehao Zhang', 'Enming Yuan', 'Enzhe Lu', 'Fengxiang Tang', 'Flood Sung', 'Guangda Wei', 'Guokun Lai', 'Haiqing Guo', 'Han Zhu', 'Hao Ding', 'Hao Hu', 'Hao Yang', 'Hao Zhang', 'Haotian Yao', 'Haotian Zhao', 'Haoyu Lu', 'Haoze Li', 'Haozhen Yu', 'Hongcheng Gao', 'Huabin Zheng', 'Huan Yuan', 'Jia Chen', 'Jianhang Guo', 'Jianlin Su', 'Jianzhou Wang', 'Jie Zhao', 'Jin Zhang', 'Jingyuan Liu', 'Junjie Yan', 'Junyan Wu', 'Lidong Shi', 'Ling Ye', 'Longhui Yu', 'Mengnan Dong', 'Neo Zhang', 'Ningchen Ma', 'Qiwei Pan', 'Qucheng Gong', 'Shaowei Liu', 'Shengling Ma', 'Shupeng Wei', 'Sihan Cao', 'Siying Huang', 'Tao Jiang', 'Weihao Gao', 'Weimin Xiong', 'Weiran He', 'Weixiao Huang', 'Wenhao Wu', 'Wenyang He', 'Xianghui Wei', 'Xianqing Jia', 'Xingzhe Wu', 'Xinran Xu', 'Xinxing Zu', 'Xinyu Zhou', 'Xuehai Pan', 'Y. Charles', 'Yang Li', 'Yangyang Hu', 'Yangyang Liu', 'Yanru Chen', 'Yejie Wang', 'Yibo Liu', 'Yidao Qin', 'Yifeng Liu', 'Ying Yang', 'Yiping Bao', 'Yulun Du', 'Yuxin Wu', 'Yuzhi Wang', 'Zaida Zhou', 'Zhaoji Wang', 'Zhaowei Li', 'Zhen Zhu', 'Zheng Zhang', 'Zhexu Wang', 'Zhilin Yang', 'Zhiqi Huang', 'Zihao Huang', 'Ziyao Xu', 'Zonghan Yang'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.12599.jpg', 'data': {'categories': ['#multimodal', '#optimization', '#training', '#benchmark', '#rl', '#reasoning', '#long_context', '#math'], 'emoji': '🤖', 'ru': {'title': 'Эффективное обучение с подкреплением для многомодальных языковых моделей', 'desc': "Статья описывает обучение многомодальной языковой модели Kimi k1.5 с использованием обучения с подкреплением (RL). Авторы представляют упрощенный эффективный подход к RL без использования сложных техник, таких как поиск по дереву Монте-Карло. Ключевыми элементами являются масштабирование на длинный контекст и улучшенные методы оптимизации политики. Модель достигает передовых результатов по рассуждению на различных бенчмарках и модальностях, сравнимых с OpenAI's o1."}, 'en': {'title': 'Unlocking AI Potential with Reinforcement Learning in LLMs', 'desc': 'This paper discusses the development of Kimi k1.5, a multi-modal large language model (LLM) that utilizes reinforcement learning (RL) to enhance its training data exploration through reward mechanisms. The authors highlight their innovative RL training techniques and infrastructure optimizations that allow for effective long context scaling and policy optimization without complex methods like Monte Carlo tree search. Kimi k1.5 achieves state-of-the-art performance on various reasoning benchmarks, demonstrating its competitive edge over existing models. 
Additionally, the paper introduces long2short methods that leverage long-context techniques to significantly improve short-context reasoning results, outperforming other models by a substantial margin.'}, 'zh': {'title': '强化学习助力大语言模型的突破', 'desc': '本文介绍了Kimi k1.5的训练实践,这是一种最新的多模态大语言模型,采用强化学习(RL)进行训练。我们的方法通过长上下文扩展和改进的策略优化,建立了一个简单有效的RL框架,而不依赖于复杂的技术,如蒙特卡洛树搜索和价值函数。Kimi k1.5在多个基准测试中表现出色,达到了最先进的推理性能,超越了现有的短链推理模型。我们的研究表明,利用长链技术可以显著提升短链模型的表现,取得了显著的进步。'}}}, {'id': 'https://huggingface.co/papers/2501.13074', 'title': 'Autonomy-of-Experts Models', 'url': 'https://huggingface.co/papers/2501.13074', 'abstract': "Mixture-of-Experts (MoE) models mostly use a router to assign tokens to specific expert modules, activating only partial parameters and often outperforming dense models. We argue that the separation between the router's decision-making and the experts' execution is a critical yet overlooked issue, leading to suboptimal expert selection and ineffective learning. To address this, we propose Autonomy-of-Experts (AoE), a novel MoE paradigm in which experts autonomously select themselves to process inputs. AoE is based on the insight that an expert is aware of its own capacity to effectively process a token, an awareness reflected in the scale of its internal activations. In AoE, routers are removed; instead, experts pre-compute internal activations for inputs and are ranked based on their activation norms. Only the top-ranking experts proceed with the forward pass, while the others abort. The overhead of pre-computing activations is reduced through a low-rank weight factorization. This self-evaluating-then-partner-comparing approach ensures improved expert selection and effective learning. We pre-train language models having 700M up to 4B parameters, demonstrating that AoE outperforms traditional MoE models with comparable efficiency.", 'score': 29, 'issue_id': 1819, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '5cf511144ad54091', 'authors': ['Ang Lv', 'Ruobing Xie', 'Yining Qian', 'Songhao Wu', 'Xingwu Sun', 'Zhanhui Kang', 'Di Wang', 'Rui Yan'], 'affiliations': ['Machine Learning Platform Department, Tencent', 'Renmin University of China', 'Southeast University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.13074.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'Самоотбор экспертов: новый подход к эффективным нейросетям', 'desc': 'Статья представляет новый подход к моделям Mixture-of-Experts (MoE) под названием Autonomy-of-Experts (AoE). В AoE эксперты самостоятельно выбирают себя для обработки входных данных, основываясь на внутренних активациях, что устраняет необходимость в отдельном маршрутизаторе. Этот метод обеспечивает более эффективный выбор экспертов и улучшенное обучение. Эксперименты с языковыми моделями от 700 млн до 4 млрд параметров показывают, что AoE превосходит традиционные модели MoE при сопоставимой эффективности.'}, 'en': {'title': 'Empowering Experts: Self-Selection for Enhanced Learning in MoE Models', 'desc': 'This paper introduces a new approach called Autonomy-of-Experts (AoE) for Mixture-of-Experts (MoE) models, which traditionally rely on a router to assign tasks to expert modules. The authors argue that the separation of decision-making and execution in MoE leads to poor expert selection and learning inefficiencies. 
In AoE, experts autonomously evaluate their ability to process inputs based on their internal activations, eliminating the need for a router. By allowing only the most capable experts to participate in processing, AoE enhances expert selection and improves overall model performance while maintaining efficiency.'}, 'zh': {'title': '自主选择,提升专家学习效率', 'desc': '混合专家模型(MoE)通常使用路由器将输入分配给特定的专家模块,仅激活部分参数,通常比密集模型表现更好。我们认为,路由器的决策与专家的执行之间的分离是一个关键但被忽视的问题,导致专家选择不佳和学习效果不理想。为了解决这个问题,我们提出了自主专家(AoE),一种新颖的MoE范式,其中专家自主选择自己处理输入。AoE基于专家能够意识到自身处理能力的洞察,通过内部激活的规模反映出来,从而确保了更好的专家选择和有效学习。'}}}, {'id': 'https://huggingface.co/papers/2501.13007', 'title': 'Pairwise RM: Perform Best-of-N Sampling with Knockout Tournament', 'url': 'https://huggingface.co/papers/2501.13007', 'abstract': "Best-of-N (BoN) sampling, a common strategy for test-time scaling of Large Language Models (LLMs), relies on reward models to select the best candidate solution from multiple generations. However, traditional reward models often assign arbitrary and inconsistent scores, limiting their effectiveness. To address this, we propose a Pairwise Reward Model (Pairwise RM) combined with a knockout tournament for BoN sampling. Instead of assigning absolute scores, given one math problem, Pairwise RM evaluates two candidate solutions' correctness simultaneously. This approach eliminates the need for arbitrary scoring and enables cross-validation of solutions through parallel comparison. In the knockout tournament, Pairwise RM conducts pairwise comparisons between candidate solutions and eliminates the incorrect ones iteratively. We construct \\ourdataset, a large-scale dataset of 443K pairwise comparisons derived from NumiaMath and annotated using gemini-1.5-flash, and train the Pairwise RM via supervised fine-tuning. Experiments on MATH-500 and the Olympiad Bench demonstrate significant improvements over traditional discriminative reward models. And a 40\\% to 60\\% relative improvement is achieved on the top 50\\% challenging problems.", 'score': 13, 'issue_id': 1821, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': 'a34210b73ec25875', 'authors': ['Yantao Liu', 'Zijun Yao', 'Rui Min', 'Yixin Cao', 'Lei Hou', 'Juanzi Li'], 'affiliations': ['Fudan University', 'Hong Kong University of Science and Technology', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13007.jpg', 'data': {'categories': ['#reasoning', '#training', '#optimization', '#dataset', '#math', '#rlhf'], 'emoji': '🏆', 'ru': {'title': 'Попарное сравнение вместо абсолютных оценок: новый подход к выбору лучшего решения в LLM', 'desc': 'Эта статья представляет новый подход к выбору лучшего решения из нескольких вариантов, генерируемых большими языковыми моделями (LLM). Авторы предлагают использовать попарную модель вознаграждения (Pairwise Reward Model) в сочетании с турниром на выбывание для Best-of-N сэмплирования. Этот метод позволяет избежать произвольного назначения баллов и обеспечивает перекрестную проверку решений через параллельное сравнение. Эксперименты показали значительное улучшение результатов по сравнению с традиционными дискриминативными моделями вознаграждения, особенно на сложных задачах.'}, 'en': {'title': 'Enhancing Solution Selection with Pairwise Comparisons', 'desc': 'This paper introduces a new method called Pairwise Reward Model (Pairwise RM) to improve the selection process in Best-of-N (BoN) sampling for Large Language Models (LLMs). 
Instead of giving arbitrary scores to candidate solutions, Pairwise RM compares two solutions at a time to determine which one is more correct. This method allows for better validation of solutions through direct comparison and eliminates inconsistencies in scoring. The authors also created a large dataset of 443,000 pairwise comparisons to train the model, resulting in significant performance improvements on challenging math problems compared to traditional reward models.'}, 'zh': {'title': '成对奖励模型:提升大型语言模型的选择能力', 'desc': '本文提出了一种新的奖励模型,称为成对奖励模型(Pairwise RM),用于大型语言模型的最佳N(BoN)采样。传统的奖励模型常常给出任意且不一致的分数,限制了其有效性。成对奖励模型通过同时评估两个候选解的正确性,消除了对任意评分的需求,并通过并行比较实现了解决方案的交叉验证。我们构建了一个包含443K成对比较的大规模数据集,并通过监督微调训练了成对奖励模型,实验结果显示其在解决数学问题时显著优于传统的判别奖励模型。'}}}, {'id': 'https://huggingface.co/papers/2501.12570', 'title': 'O1-Pruner: Length-Harmonizing Fine-Tuning for O1-Like Reasoning Pruning', 'url': 'https://huggingface.co/papers/2501.12570', 'abstract': "Recently, long-thought reasoning LLMs, such as OpenAI's O1, adopt extended reasoning processes similar to how humans ponder over complex problems. This reasoning paradigm significantly enhances the model's problem-solving abilities and has achieved promising results. However, long-thought reasoning process leads to a substantial increase in inference time. A pressing challenge is reducing the inference overhead of long-thought LLMs while ensuring accuracy. In this paper, we experimentally demonstrate that long-thought reasoning models struggle to effectively allocate token budgets based on problem difficulty and reasoning redundancies. To address this, we propose Length-Harmonizing Fine-Tuning (O1-Pruner), aiming at minimizing reasoning overhead while maintaining accuracy. This effective fine-tuning method first estimates the LLM's baseline performance through pre-sampling and then uses RL-style fine-tuning to encourage the model to generate shorter reasoning processes under accuracy constraints. This allows the model to achieve efficient reasoning with lower redundancy while maintaining accuracy. Experiments on various mathematical reasoning benchmarks show that O1-Pruner not only significantly reduces inference overhead but also achieves higher accuracy, providing a novel and promising solution to this challenge. Our code is coming soon at https://github.com/StarDewXXX/O1-Pruner", 'score': 11, 'issue_id': 1818, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '2cb7e92315bbf3e4', 'authors': ['Haotian Luo', 'Li Shen', 'Haiying He', 'Yibo Wang', 'Shiwei Liu', 'Wei Li', 'Naiqiang Tan', 'Xiaochun Cao', 'Dacheng Tao'], 'affiliations': ['China Agriculture University', 'Didichuxing Co. Ltd', 'Nanyang Technological University', 'Shenzhen Campus of Sun Yat-sen University', 'Tsinghua University', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2501.12570.jpg', 'data': {'categories': ['#reasoning', '#math', '#optimization', '#training', '#benchmark', '#inference'], 'emoji': '⚡', 'ru': {'title': 'Ускорение мышления ИИ без потери качества', 'desc': "Статья описывает метод оптимизации работы языковых моделей с длительным рассуждением, таких как OpenAI's O1. Авторы предлагают технику под названием Length-Harmonizing Fine-Tuning (O1-Pruner), которая сокращает время вывода, сохраняя точность модели. Метод использует предварительную выборку для оценки базовой производительности модели, а затем применяет обучение с подкреплением для генерации более коротких процессов рассуждения. 
Эксперименты на различных бенчмарках математического рассуждения показали, что O1-Pruner значительно снижает вычислительные затраты при сохранении или даже повышении точности."}, 'en': {'title': 'Optimizing Long-Thought Reasoning for Efficient Problem Solving', 'desc': "This paper discusses a new approach to improve long-thought reasoning in large language models (LLMs) like OpenAI's O1. The authors identify that while these models enhance problem-solving, they also increase inference time due to inefficient token usage. To tackle this, they introduce Length-Harmonizing Fine-Tuning (O1-Pruner), which optimizes the reasoning process by balancing accuracy and efficiency. Their experiments show that O1-Pruner reduces inference overhead and improves accuracy on mathematical reasoning tasks, making it a valuable advancement in LLM performance."}, 'zh': {'title': '优化推理效率,提升准确性!', 'desc': '最近,长思考推理的语言模型(LLM)如OpenAI的O1,采用了类似人类思考复杂问题的扩展推理过程。这种推理范式显著增强了模型的解决问题能力,并取得了良好的效果。然而,长思考推理过程导致推理时间大幅增加。为了解决这个问题,我们提出了长度协调微调(O1-Pruner),旨在在保持准确性的同时,减少长思考LLM的推理开销。'}}}, {'id': 'https://huggingface.co/papers/2501.11067', 'title': 'IntellAgent: A Multi-Agent Framework for Evaluating Conversational AI Systems', 'url': 'https://huggingface.co/papers/2501.11067', 'abstract': 'Large Language Models (LLMs) are transforming artificial intelligence, evolving into task-oriented systems capable of autonomous planning and execution. One of the primary applications of LLMs is conversational AI systems, which must navigate multi-turn dialogues, integrate domain-specific APIs, and adhere to strict policy constraints. However, evaluating these agents remains a significant challenge, as traditional methods fail to capture the complexity and variability of real-world interactions. We introduce IntellAgent, a scalable, open-source multi-agent framework designed to evaluate conversational AI systems comprehensively. IntellAgent automates the creation of diverse, synthetic benchmarks by combining policy-driven graph modeling, realistic event generation, and interactive user-agent simulations. This innovative approach provides fine-grained diagnostics, addressing the limitations of static and manually curated benchmarks with coarse-grained metrics. IntellAgent represents a paradigm shift in evaluating conversational AI. By simulating realistic, multi-policy scenarios across varying levels of complexity, IntellAgent captures the nuanced interplay of agent capabilities and policy constraints. Unlike traditional methods, it employs a graph-based policy model to represent relationships, likelihoods, and complexities of policy interactions, enabling highly detailed diagnostics. IntellAgent also identifies critical performance gaps, offering actionable insights for targeted optimization. Its modular, open-source design supports seamless integration of new domains, policies, and APIs, fostering reproducibility and community collaboration. Our findings demonstrate that IntellAgent serves as an effective framework for advancing conversational AI by addressing challenges in bridging research and deployment. 
The framework is available at https://github.com/plurai-ai/intellagent', 'score': 6, 'issue_id': 1820, 'pub_date': '2025-01-19', 'pub_date_card': {'ru': '19 января', 'en': 'January 19', 'zh': '1月19日'}, 'hash': '019b0714b4212a7f', 'authors': ['Elad Levi', 'Ilan Kadar'], 'affiliations': ['Plurai'], 'pdf_title_img': 'assets/pdf/title_img/2501.11067.jpg', 'data': {'categories': ['#multimodal', '#agents', '#open_source', '#games', '#optimization', '#graphs', '#benchmark'], 'emoji': '🤖', 'ru': {'title': 'IntellAgent: революция в оценке разговорного ИИ', 'desc': 'IntellAgent - это масштабируемая система с открытым исходным кодом для комплексной оценки разговорных ИИ-систем. Она автоматизирует создание разнообразных синтетических тестов, объединяя моделирование графов на основе политик, генерацию реалистичных событий и интерактивное моделирование взаимодействия пользователя и агента. IntellAgent использует графовую модель политик для представления отношений, вероятностей и сложностей взаимодействия политик, что позволяет проводить детальную диагностику. Система выявляет критические пробелы в производительности и предлагает полезные идеи для целенаправленной оптимизации.'}, 'en': {'title': 'Revolutionizing Evaluation of Conversational AI with IntellAgent', 'desc': 'This paper presents IntellAgent, a new framework for evaluating conversational AI systems, particularly those powered by Large Language Models (LLMs). It addresses the challenges of traditional evaluation methods by automating the creation of diverse benchmarks that simulate real-world interactions. IntellAgent uses a graph-based policy model to analyze the complex relationships and interactions between different policies, providing detailed diagnostics and identifying performance gaps. The open-source nature of IntellAgent encourages collaboration and integration of new features, making it a valuable tool for improving conversational AI systems.'}, 'zh': {'title': 'IntellAgent:对话式AI评估的新范式', 'desc': '大型语言模型(LLMs)正在改变人工智能,成为能够自主规划和执行任务的系统。它们在对话式人工智能系统中的应用尤为重要,这些系统需要处理多轮对话、整合特定领域的API,并遵循严格的政策约束。然而,评估这些智能体仍然是一个重大挑战,因为传统方法无法捕捉现实世界交互的复杂性和多样性。我们提出了IntellAgent,这是一个可扩展的开源多智能体框架,旨在全面评估对话式人工智能系统。'}}}, {'id': 'https://huggingface.co/papers/2412.19723', 'title': 'OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis', 'url': 'https://huggingface.co/papers/2412.19723', 'abstract': "Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-defined tasks, which are either resource-intensive or unable to guarantee data quality. Moreover, these methods suffer from limited data diversity and significant gaps between synthetic data and real-world environments. To address these challenges, we propose OS-Genesis, a novel GUI data synthesis pipeline that reverses the conventional trajectory collection process. Instead of relying on pre-defined tasks, OS-Genesis enables agents first to perceive environments and perform step-wise interactions, then retrospectively derive high-quality tasks to enable trajectory-level exploration. A trajectory reward model is then employed to ensure the quality of the generated trajectories. 
We demonstrate that training GUI agents with OS-Genesis significantly improves their performance on highly challenging online benchmarks. In-depth analysis further validates OS-Genesis's efficiency and its superior data quality and diversity compared to existing synthesis methods. Our codes, data, and checkpoints are available at https://qiushisun.github.io/OS-Genesis-Home/ (OS-Genesis Homepage).", 'score': 50, 'issue_id': 1455, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'b331198d09aa8650', 'authors': ['Qiushi Sun', 'Kanzhi Cheng', 'Zichen Ding', 'Chuanyang Jin', 'Yian Wang', 'Fangzhi Xu', 'Zhenyu Wu', 'Chengyou Jia', 'Liheng Chen', 'Zhoumianze Liu', 'Ben Kao', 'Guohao Li', 'Junxian He', 'Yu Qiao', 'Zhiyong Wu'], 'affiliations': ['Hong Kong University of Science and Technology', 'Johns Hopkins University', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The University of Hong Kong', 'University of Oxford'], 'pdf_title_img': 'assets/pdf/title_img/2412.19723.jpg', 'data': {'categories': ['#benchmark', '#synthetic', '#dataset', '#optimization', '#training', '#data', '#agents'], 'emoji': '🖥️', 'ru': {'title': 'Революция в обучении ИИ-агентов: от заданий к исследованию', 'desc': 'Статья представляет OS-Genesis - новый метод синтеза данных для обучения ИИ-агентов взаимодействию с графическим интерфейсом. Вместо предопределенных задач, агенты сначала исследуют среду и выполняют пошаговые действия, а затем ретроспективно формируют качественные траектории. Используется модель вознаграждения для обеспечения качества сгенерированных траекторий. Результаты показывают значительное улучшение производительности агентов на сложных онлайн-бенчмарках по сравнению с существующими методами.'}, 'en': {'title': 'Revolutionizing GUI Agent Training with OS-Genesis', 'desc': 'This paper introduces OS-Genesis, a new method for generating high-quality trajectory data for training GUI agents using Vision-Language Models (VLMs). Unlike traditional methods that rely on human supervision or predefined tasks, OS-Genesis allows agents to first interact with their environment and then derive tasks retrospectively. This approach enhances data diversity and quality by enabling agents to explore and learn from real-world interactions. The results show that GUI agents trained with OS-Genesis perform significantly better on challenging benchmarks, demonstrating the effectiveness of this novel data synthesis pipeline.'}, 'zh': {'title': 'OS-Genesis:提升GUI代理性能的新方法', 'desc': '本论文提出了一种名为OS-Genesis的新型图形用户界面(GUI)数据合成管道,旨在解决高质量轨迹数据收集的瓶颈。传统方法依赖于人类监督或合成数据生成,往往资源消耗大且数据质量难以保证。OS-Genesis通过让代理先感知环境并进行逐步交互,随后回溯生成高质量任务,从而实现轨迹级探索。实验结果表明,使用OS-Genesis训练的GUI代理在复杂的在线基准测试中表现显著提升,且其数据质量和多样性优于现有合成方法。'}}}, {'id': 'https://huggingface.co/papers/2412.19638', 'title': 'Xmodel-2 Technical Report', 'url': 'https://huggingface.co/papers/2412.19638', 'abstract': 'Xmodel-2 is a 1.2-billion-parameter large language model designed specifically for reasoning tasks. Its architecture enables different model scales to share a unified set of hyperparameters, allowing for extensive experimentation on smaller models and seamless transfer of optimal configurations to larger models. To maximize training efficiency and stability, Xmodel-2 employs the WSD learning rate scheduler from MiniCPM. Pretrained on 1.5 trillion tokens from diverse sources, Xmodel-2 achieves state-of-the-art performance in complex reasoning and agent-based tasks, while maintaining low training costs. 
These results highlight the potential of efficient model design and training strategies in advancing reasoning capabilities. Model checkpoints and code are publicly available on GitHub at https://github.com/XiaoduoAILab/Xmodel-2', 'score': 11, 'issue_id': 1453, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': '4707dc8ac5a87e66', 'authors': ['Wang Qun', 'Liu Yang', 'Lin Qingquan', 'Qu Zhijiu', 'Jiang Ling'], 'affiliations': ['AI Lab, Xiaodu Technology'], 'pdf_title_img': 'assets/pdf/title_img/2412.19638.jpg', 'data': {'categories': ['#optimization', '#training', '#small_models', '#reasoning', '#open_source', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное рассуждение с Xmodel-2: мощь в компактности', 'desc': 'Xmodel-2 - это языковая модель с 1,2 миллиардами параметров, специализирующаяся на задачах рассуждения. Её архитектура позволяет разным масштабам модели использовать единый набор гиперпараметров, что облегчает эксперименты и перенос оптимальных конфигураций. Модель использует планировщик скорости обучения WSD из MiniCPM для повышения эффективности и стабильности. Предобученная на 1,5 триллионах токенов, Xmodel-2 достигает передовых результатов в сложных задачах рассуждения, сохраняя низкие затраты на обучение.'}, 'en': {'title': 'Unlocking Reasoning Power with Efficient Model Design', 'desc': 'Xmodel-2 is a large language model with 1.2 billion parameters, specifically built for reasoning tasks. It features a flexible architecture that allows different model sizes to use the same hyperparameters, facilitating experimentation and optimization across scales. The model utilizes the WSD learning rate scheduler to enhance training efficiency and stability. With pretraining on 1.5 trillion tokens, Xmodel-2 demonstrates superior performance in complex reasoning tasks while keeping training costs low, showcasing the benefits of efficient model design.'}, 'zh': {'title': '高效推理能力的模型设计与训练策略', 'desc': 'Xmodel-2 是一个拥有 12 亿参数的大型语言模型,专门设计用于推理任务。它的架构允许不同规模的模型共享统一的超参数,从而可以在较小的模型上进行广泛实验,并将最佳配置无缝转移到更大的模型上。为了最大化训练效率和稳定性,Xmodel-2 采用了 MiniCPM 的 WSD 学习率调度器。经过在 1.5 万亿个来自多样化来源的标记上进行预训练,Xmodel-2 在复杂推理和基于代理的任务中达到了最先进的性能,同时保持了较低的训练成本。'}}}, {'id': 'https://huggingface.co/papers/2412.20735', 'title': 'HUNYUANPROVER: A Scalable Data Synthesis Framework and Guided Tree Search for Automated Theorem Proving', 'url': 'https://huggingface.co/papers/2412.20735', 'abstract': 'We introduce HunyuanProver, a language model finetuned from the Hunyuan 7B for interactive automatic theorem proving with LEAN4. To alleviate the data sparsity issue, we design a scalable framework to iteratively synthesize data with low cost. Besides, guided tree search algorithms are designed to enable effective ``system 2 thinking`` of the prover. HunyuanProver achieves state-of-the-art (SOTA) performances on major benchmarks. Specifically, it achieves a pass of 68.4% on the miniF2F-test compared to 65.9%, the current SOTA results. It proves 4 IMO statements (imo_1960_p2, imo_1962_p2, imo_1964_p2 and imo_1983_p6) in miniF2F-test. 
To benefit the community, we will open-source a dataset of 30k synthesized instances, where each instance contains the original question in natural language, the converted statement by autoformalization, and the proof by HunyuanProver.', 'score': 3, 'issue_id': 1464, 'pub_date': '2025-12-30', 'pub_date_card': {'ru': '30 декабря', 'en': 'December 30', 'zh': '12月30日'}, 'hash': '18d70581e862bf86', 'authors': ['Yang Li', 'Dong Du', 'Linfeng Song', 'Chen Li', 'Weikang Wang', 'Tao Yang', 'Haitao Mi'], 'affiliations': ['Tencent', 'Tencent Hunyuan Teams'], 'pdf_title_img': 'assets/pdf/title_img/2412.20735.jpg', 'data': {'categories': ['#dataset', '#synthetic', '#data', '#benchmark', '#reasoning', '#open_source', '#training', '#math'], 'emoji': '🧠', 'ru': {'title': 'Прорыв в автоматическом доказательстве теорем с помощью ИИ', 'desc': "HunyuanProver - это языковая модель, настроенная для автоматического доказательства теорем с использованием LEAN4. Модель использует масштабируемую структуру для итеративного синтеза данных и алгоритмы направленного поиска по дереву для эффективного 'системного мышления'. HunyuanProver достигает лучших результатов на основных бенчмарках, включая 68.4% прохождения на miniF2F-test. Авторы планируют открыть доступ к набору данных из 30 тысяч синтезированных примеров для пользы сообщества."}, 'en': {'title': 'HunyuanProver: Advancing Theorem Proving with AI', 'desc': 'HunyuanProver is a language model specifically fine-tuned for interactive automatic theorem proving using LEAN4. To address the challenge of data sparsity, the authors developed a scalable framework that allows for the iterative synthesis of data at a low cost. They also implemented guided tree search algorithms to enhance the reasoning capabilities of the prover, enabling it to perform complex logical deductions. HunyuanProver has achieved state-of-the-art performance on key benchmarks, including a notable pass rate of 68.4% on the miniF2F-test, surpassing previous results and proving several significant mathematical statements.'}, 'zh': {'title': 'HunyuanProver:自动定理证明的新突破', 'desc': '本文介绍了HunyuanProver,这是一个基于Hunyuan 7B微调的语言模型,旨在与LEAN4进行交互式自动定理证明。为了缓解数据稀疏问题,我们设计了一个可扩展的框架,以低成本迭代合成数据。此外,我们还设计了引导树搜索算法,以实现证明者的有效“系统2思维”。HunyuanProver在主要基准测试中达到了最先进的性能,特别是在miniF2F-test中取得了68.4%的通过率,超越了当前的65.9%最先进结果。'}}}, {'id': 'https://huggingface.co/papers/2501.07301', 'title': 'The Lessons of Developing Process Reward Models in Mathematical Reasoning', 'url': 'https://huggingface.co/papers/2501.07301', 'abstract': 'Process Reward Models (PRMs) emerge as a promising approach for process supervision in mathematical reasoning of Large Language Models (LLMs), which aim to identify and mitigate intermediate errors in the reasoning processes. However, the development of effective PRMs faces significant challenges, particularly in data annotation and evaluation methodologies. In this paper, through extensive experiments, we demonstrate that commonly used Monte Carlo (MC) estimation-based data synthesis for PRMs typically yields inferior performance and generalization compared to LLM-as-a-judge and human annotation methods. MC estimation relies on completion models to evaluate current-step correctness, leading to inaccurate step verification. 
Furthermore, we identify potential biases in conventional Best-of-N (BoN) evaluation strategies for PRMs: (1) The unreliable policy models generate responses with correct answers but flawed processes, leading to a misalignment between the evaluation criteria of BoN and the PRM objectives of process verification. (2) The tolerance of PRMs of such responses leads to inflated BoN scores. (3) Existing PRMs have a significant proportion of minimum scores concentrated on the final answer steps, revealing the shift from process to outcome-based assessment in BoN Optimized PRMs. To address these challenges, we develop a consensus filtering mechanism that effectively integrates MC estimation with LLM-as-a-judge and advocates a more comprehensive evaluation framework that combines response-level and step-level metrics. Based on the mechanisms, we significantly improve both model performance and data efficiency in the BoN evaluation and the step-wise error identification task. Finally, we release a new state-of-the-art PRM that outperforms existing open-source alternatives and provides practical guidelines for future research in building process supervision models.', 'score': 46, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '98f46bb1e2772efc', 'authors': ['Zhenru Zhang', 'Chujie Zheng', 'Yangzhen Wu', 'Beichen Zhang', 'Runji Lin', 'Bowen Yu', 'Dayiheng Liu', 'Jingren Zhou', 'Junyang Lin'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07301.jpg', 'data': {'categories': ['#math', '#data', '#reasoning', '#benchmark', '#optimization', '#open_source', '#training'], 'emoji': '🧮', 'ru': {'title': 'Усовершенствование Process Reward Models для более точного контроля математических рассуждений', 'desc': 'Статья посвящена Process Reward Models (PRM) для контроля процесса математических рассуждений в больших языковых моделях. Авторы выявили проблемы в существующих методах синтеза данных и оценки PRMs, таких как Monte Carlo и Best-of-N. Они предложили новый механизм фильтрации на основе консенсуса, объединяющий MC-оценку с подходом LLM-as-a-judge. В результате исследователи создали улучшенную PRM, превосходящую существующие open-source альтернативы.'}, 'en': {'title': 'Enhancing Reasoning in LLMs with Process Reward Models', 'desc': 'This paper introduces Process Reward Models (PRMs) as a method to enhance the reasoning capabilities of Large Language Models (LLMs) by identifying and correcting errors in their reasoning processes. The authors highlight the limitations of traditional Monte Carlo estimation methods for data synthesis, which often lead to poor performance in evaluating reasoning steps. They also point out biases in the Best-of-N evaluation strategies that can misalign with the goals of PRMs, particularly in how they assess the correctness of reasoning processes versus final answers. 
To overcome these issues, the paper proposes a new consensus filtering mechanism that combines different evaluation methods, resulting in improved model performance and more accurate error identification.'}, 'zh': {'title': '提升过程监督模型的有效性', 'desc': '本文探讨了过程奖励模型(PRMs)在大型语言模型(LLMs)数学推理中的应用,旨在识别和减少推理过程中的中间错误。研究表明,传统的基于蒙特卡洛估计的数据合成方法在性能和泛化能力上不如使用LLM作为评判者和人工标注的方法。我们还发现,现有的最佳选择(BoN)评估策略存在偏差,导致评估标准与PRM的过程验证目标不一致。为了解决这些问题,本文提出了一种共识过滤机制,结合了蒙特卡洛估计和LLM评判者,显著提高了模型性能和数据效率。'}}}, {'id': 'https://huggingface.co/papers/2501.06425', 'title': 'Tensor Product Attention Is All You Need', 'url': 'https://huggingface.co/papers/2501.06425', 'abstract': 'Scaling language models to handle longer input sequences typically necessitates large key-value (KV) caches, resulting in substantial memory overhead during inference. In this paper, we propose Tensor Product Attention (TPA), a novel attention mechanism that uses tensor decompositions to represent queries, keys, and values compactly, significantly shrinking KV cache size at inference time. By factorizing these representations into contextual low-rank components (contextual factorization) and seamlessly integrating with RoPE, TPA achieves improved model quality alongside memory efficiency. Based on TPA, we introduce the Tensor ProducT ATTenTion Transformer (T6), a new model architecture for sequence modeling. Through extensive empirical evaluation of language modeling tasks, we demonstrate that T6 exceeds the performance of standard Transformer baselines including MHA, MQA, GQA, and MLA across various metrics, including perplexity and a range of renowned evaluation benchmarks. Notably, TPAs memory efficiency enables the processing of significantly longer sequences under fixed resource constraints, addressing a critical scalability challenge in modern language models. The code is available at https://github.com/tensorgi/T6.', 'score': 35, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'f723487eccf1ccfe', 'authors': ['Yifan Zhang', 'Yifeng Liu', 'Huizhuo Yuan', 'Zhen Qin', 'Yang Yuan', 'Quanquan Gu', 'Andrew Chi-Chih Yao'], 'affiliations': ['IIIS, Tsinghua University', 'Shanghai Qi Zhi Institute', 'TapTap', 'University of California, Los Angeles'], 'pdf_title_img': 'assets/pdf/title_img/2501.06425.jpg', 'data': {'categories': ['#benchmark', '#long_context', '#optimization', '#inference', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Эффективное внимание: компактные трансформеры для длинных последовательностей', 'desc': 'В статье представлен новый механизм внимания - Tensor Product Attention (TPA), использующий тензорные разложения для компактного представления запросов, ключей и значений. TPA значительно уменьшает размер кэша ключ-значение при выводе, что повышает эффективность использования памяти. На основе TPA авторы разработали новую архитектуру модели - Tensor ProducT ATTenTion Transformer (T6). Эмпирические исследования показали, что T6 превосходит стандартные базовые модели Transformer по различным метрикам. TPA позволяет обрабатывать значительно более длинные последовательности при фиксированных ресурсах, решая важную проблему масштабируемости современных языковых моделей.'}, 'en': {'title': 'Efficient Attention for Longer Sequences with TPA', 'desc': 'This paper introduces Tensor Product Attention (TPA), a new attention mechanism designed to reduce memory usage during inference in language models. 
TPA achieves this by using tensor decompositions to compactly represent queries, keys, and values, which allows for smaller key-value caches. The authors present the Tensor ProducT ATTenTion Transformer (T6), a model that integrates TPA and shows improved performance on language modeling tasks compared to traditional Transformer architectures. T6 not only enhances model quality but also enables the processing of longer input sequences efficiently, addressing a key limitation in current language models.'}, 'zh': {'title': '张量乘积注意力:高效处理长序列的创新方案', 'desc': '本文提出了一种新的注意力机制,称为张量乘积注意力(TPA),旨在解决长输入序列处理中的内存开销问题。TPA通过张量分解技术,紧凑地表示查询、键和值,从而显著减少推理时的KV缓存大小。该机制结合了上下文低秩分解和RoPE,提升了模型质量和内存效率。基于TPA,我们还引入了一种新的模型架构——张量乘积注意力变换器(T6),在语言建模任务中表现优于传统的Transformer基线。'}}}, {'id': 'https://huggingface.co/papers/2501.06252', 'title': '$\\text{Transformer}^2$: Self-adaptive LLMs', 'url': 'https://huggingface.co/papers/2501.06252', 'abstract': 'Self-adaptive large language models (LLMs) aim to solve the challenges posed by traditional fine-tuning methods, which are often computationally intensive and static in their ability to handle diverse tasks. We introduce \\implname, a novel self-adaptation framework that adapts LLMs for unseen tasks in real-time by selectively adjusting only the singular components of their weight matrices. During inference, \\implname employs a two-pass mechanism: first, a dispatch system identifies the task properties, and then task-specific "expert" vectors, trained using reinforcement learning, are dynamically mixed to obtain targeted behavior for the incoming prompt. Our method outperforms ubiquitous approaches such as LoRA, with fewer parameters and greater efficiency. \\implname demonstrates versatility across different LLM architectures and modalities, including vision-language tasks. \\implname represents a significant leap forward, offering a scalable, efficient solution for enhancing the adaptability and task-specific performance of LLMs, paving the way for truly dynamic, self-organizing AI systems.', 'score': 19, 'issue_id': 1651, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': '935c31e095aeeec8', 'authors': ['Qi Sun', 'Edoardo Cetin', 'Yujin Tang'], 'affiliations': ['Institute of Science Tokyo, Japan', 'Sakana AI, Japan'], 'pdf_title_img': 'assets/pdf/title_img/2501.06252.jpg', 'data': {'categories': ['#multimodal', '#agi', '#rl', '#optimization', '#training', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Самоадаптация языковых моделей в реальном времени', 'desc': 'Статья представляет новый фреймворк самоадаптации для больших языковых моделей (LLM), который позволяет адаптироваться к новым задачам в реальном времени. Метод использует двухэтапный механизм: сначала определяются свойства задачи, затем применяются специальные векторы экспертов для настройки поведения модели. Подход превосходит традиционные методы вроде LoRA, используя меньше параметров и работая эффективнее. Фреймворк демонстрирует универсальность для разных архитектур LLM и модальностей, включая задачи компьютерного зрения.'}, 'en': {'title': 'Dynamic Adaptation for Language Models', 'desc': "This paper presents a new framework called \textit{implname} that enhances large language models (LLMs) by allowing them to adapt to new tasks in real-time without the heavy computational costs of traditional fine-tuning. Instead of adjusting the entire model, \textit{implname} selectively modifies specific components of the model's weight matrices, making it more efficient. 
The framework uses a two-step process during inference: first, it identifies the task requirements, and then it combines specialized 'expert' vectors, which are optimized through reinforcement learning, to tailor the model's response. This approach not only improves performance compared to existing methods like LoRA but also works across various LLM architectures and tasks, including those involving both text and images."}, 'zh': {'title': '自适应LLMs:高效应对多样化任务的未来', 'desc': '自适应大型语言模型(LLMs)旨在解决传统微调方法的挑战,这些方法通常计算密集且在处理多样化任务时能力有限。我们介绍了一种新颖的自适应框架\textit{implname},它通过选择性调整权重矩阵的单个组件,实时适应LLMs以应对未见过的任务。在推理过程中,\textit{implname}采用双重机制:首先,调度系统识别任务属性,然后动态混合经过强化学习训练的任务特定“专家”向量,以获得针对输入提示的目标行为。我们的研究方法在参数更少且效率更高的情况下,超越了广泛使用的方法,如LoRA,展示了在不同LLM架构和模态(包括视觉-语言任务)中的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.06173', 'title': 'VideoAuteur: Towards Long Narrative Video Generation', 'url': 'https://huggingface.co/papers/2501.06173', 'abstract': 'Recent video generation models have shown promising results in producing high-quality video clips lasting several seconds. However, these models face challenges in generating long sequences that convey clear and informative events, limiting their ability to support coherent narrations. In this paper, we present a large-scale cooking video dataset designed to advance long-form narrative generation in the cooking domain. We validate the quality of our proposed dataset in terms of visual fidelity and textual caption accuracy using state-of-the-art Vision-Language Models (VLMs) and video generation models, respectively. We further introduce a Long Narrative Video Director to enhance both visual and semantic coherence in generated videos and emphasize the role of aligning visual embeddings to achieve improved overall video quality. Our method demonstrates substantial improvements in generating visually detailed and semantically aligned keyframes, supported by finetuning techniques that integrate text and image embeddings within the video generation process. Project page: https://videoauteur.github.io/', 'score': 18, 'issue_id': 1653, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': 'e110fbe840c50afa', 'authors': ['Junfei Xiao', 'Feng Cheng', 'Lu Qi', 'Liangke Gui', 'Jiepeng Cen', 'Zhibei Ma', 'Alan Yuille', 'Lu Jiang'], 'affiliations': ['ByteDance', 'ByteDance Seed', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06173.jpg', 'data': {'categories': ['#video', '#story_generation', '#dataset', '#long_context', '#training', '#multimodal', '#alignment'], 'emoji': '🍳', 'ru': {'title': 'Готовим длинные видео: новый подход к генерации нарративного контента', 'desc': 'Статья представляет новый датасет видеороликов о приготовлении пищи для улучшения генерации длинных нарративных видео. Авторы проверяют качество датасета с помощью современных моделей компьютерного зрения и генерации видео. Они также предлагают метод Long Narrative Video Director для повышения визуальной и семантической согласованности генерируемых видео. Результаты показывают значительное улучшение в генерации детализированных и семантически согласованных ключевых кадров.'}, 'en': {'title': 'Enhancing Long-Form Video Generation with Coherent Narratives', 'desc': 'This paper addresses the limitations of current video generation models in creating long, coherent videos, particularly in the cooking domain. 
It introduces a large-scale dataset specifically designed for generating long-form cooking videos, ensuring high visual quality and accurate textual descriptions. The authors propose a Long Narrative Video Director that improves both the visual and semantic coherence of the generated content by aligning visual embeddings. Their approach shows significant advancements in producing detailed keyframes and enhancing overall video quality through the integration of text and image embeddings.'}, 'zh': {'title': '推动烹饪视频的长篇叙事生成', 'desc': '最近的视频生成模型在生成持续几秒的高质量视频片段方面取得了良好效果。然而,这些模型在生成长序列时面临挑战,难以传达清晰且信息丰富的事件,限制了它们支持连贯叙述的能力。本文提出了一个大规模的烹饪视频数据集,旨在推动烹饪领域的长篇叙事生成。我们引入了一种长叙事视频导演,增强生成视频的视觉和语义一致性,并强调对齐视觉嵌入在提高整体视频质量中的重要性。'}}}, {'id': 'https://huggingface.co/papers/2501.07572', 'title': 'WebWalker: Benchmarking LLMs in Web Traversal', 'url': 'https://huggingface.co/papers/2501.07572', 'abstract': "Retrieval-augmented generation (RAG) demonstrates remarkable performance across tasks in open-domain question-answering. However, traditional search engines may retrieve shallow content, limiting the ability of LLMs to handle complex, multi-layered information. To address it, we introduce WebWalkerQA, a benchmark designed to assess the ability of LLMs to perform web traversal. It evaluates the capacity of LLMs to traverse a website's subpages to extract high-quality data systematically. We propose WebWalker, which is a multi-agent framework that mimics human-like web navigation through an explore-critic paradigm. Extensive experimental results show that WebWalkerQA is challenging and demonstrates the effectiveness of RAG combined with WebWalker, through the horizontal and vertical integration in real-world scenarios.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '1dd4e60432c1ca54', 'authors': ['Jialong Wu', 'Wenbiao Yin', 'Yong Jiang', 'Zhenglin Wang', 'Zekun Xi', 'Runnan Fang', 'Deyu Zhou', 'Pengjun Xie', 'Fei Huang'], 'affiliations': ['Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.07572.jpg', 'data': {'categories': ['#rag', '#reasoning', '#benchmark', '#agi', '#optimization', '#games', '#interpretability', '#agents', '#survey'], 'emoji': '🕸️', 'ru': {'title': 'WebWalker: умная навигация по веб-страницам для улучшения вопросно-ответных систем', 'desc': 'В статье представлен новый подход к решению задач открытого вопросно-ответного поиска - WebWalkerQA. Эта система оценивает способность языковых моделей систематически исследовать подстраницы веб-сайтов для извлечения качественной информации. Авторы предлагают фреймворк WebWalker, использующий мультиагентный подход для имитации человеческой навигации по веб-страницам. Экспериментальные результаты демонстрируют эффективность комбинации RAG и WebWalker в реальных сценариях.'}, 'en': {'title': 'Enhancing LLMs with Human-like Web Navigation for Better Information Retrieval', 'desc': "This paper introduces WebWalkerQA, a benchmark for evaluating large language models (LLMs) in open-domain question-answering tasks. It addresses the limitations of traditional search engines that often retrieve superficial content, which hinders LLMs from accessing complex information. The proposed WebWalker framework uses a multi-agent system that simulates human-like web navigation, allowing LLMs to systematically traverse subpages of a website to gather high-quality data. 
Experimental results indicate that combining retrieval-augmented generation (RAG) with WebWalker enhances the models' performance in real-world scenarios by enabling deeper information extraction."}, 'zh': {'title': 'WebWalkerQA:提升问答系统的网页导航能力', 'desc': '检索增强生成(RAG)在开放领域问答任务中表现出色,但传统搜索引擎可能只检索到表面内容,限制了大型语言模型(LLMs)处理复杂信息的能力。为了解决这个问题,我们引入了WebWalkerQA,这是一个评估LLMs进行网页遍历能力的基准。它评估LLMs系统性地遍历网站子页面以提取高质量数据的能力。我们提出了WebWalker,这是一个多代理框架,通过探索-评估范式模拟人类的网页导航。'}}}, {'id': 'https://huggingface.co/papers/2501.06458', 'title': 'O1 Replication Journey -- Part 3: Inference-time Scaling for Medical Reasoning', 'url': 'https://huggingface.co/papers/2501.06458', 'abstract': "Building upon our previous investigations of O1 replication (Part 1: Journey Learning [Qin et al., 2024] and Part 2: Distillation [Huang et al., 2024]), this work explores the potential of inference-time scaling in large language models (LLMs) for medical reasoning tasks, ranging from diagnostic decision-making to treatment planning. Through extensive experiments on medical benchmarks of varying complexity (MedQA, Medbullets, and JAMA Clinical Challenges), our investigation reveals several key insights: (1) Increasing inference time does lead to improved performance. With a modest training set of 500 samples, our model yields substantial performance improvements of 6%-11%. (2) Task complexity directly correlates with the required length of reasoning chains, confirming the necessity of extended thought processes for challenging problems. (3) The differential diagnoses generated by our model adhere to the principles of the hypothetico-deductive method, producing a list of potential conditions that may explain a patient's symptoms and systematically narrowing these possibilities by evaluating the evidence. These findings demonstrate the promising synergy between inference-time scaling and journey learning in advancing LLMs' real-world clinical reasoning capabilities.", 'score': 14, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c95817afd181bd85', 'authors': ['Zhongzhen Huang', 'Gui Geng', 'Shengyi Hua', 'Zhen Huang', 'Haoyang Zou', 'Shaoting Zhang', 'Pengfei Liu', 'Xiaofan Zhang'], 'affiliations': ['Generative AI Research Lab (GAIR)', 'SII', 'SPIRAL Lab', 'Shanghai Jiao Tong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06458.jpg', 'data': {'categories': ['#science', '#inference', '#healthcare', '#reasoning'], 'emoji': '🩺', 'ru': {'title': 'Масштабирование времени вывода LLM улучшает медицинские рассуждения', 'desc': 'Данная работа исследует потенциал масштабирования времени вывода в больших языковых моделях (LLM) для задач медицинского рассуждения. Эксперименты на медицинских бенчмарках показали, что увеличение времени вывода приводит к улучшению производительности модели. Сложность задачи напрямую коррелирует с необходимой длиной цепочек рассуждений. Дифференциальные диагнозы, генерируемые моделью, соответствуют принципам гипотетико-дедуктивного метода.'}, 'en': {'title': 'Enhancing Medical Reasoning in LLMs through Inference-Time Scaling', 'desc': "This paper investigates how increasing inference time can enhance the performance of large language models (LLMs) in medical reasoning tasks. The authors conducted experiments on various medical benchmarks and found that longer inference times lead to significant performance improvements, even with a small training dataset. 
They also discovered that more complex tasks require longer reasoning chains, highlighting the importance of extended thought processes. Additionally, the model's differential diagnoses align with the hypothetico-deductive method, showcasing its ability to systematically evaluate potential conditions based on patient symptoms."}, 'zh': {'title': '推理时间扩展助力医学推理能力提升', 'desc': '本研究基于我们之前对O1复制的研究,探讨了在大型语言模型(LLMs)中推理时间扩展对医学推理任务的潜力。通过在不同复杂度的医学基准(如MedQA、Medbullets和JAMA临床挑战)上进行广泛实验,我们发现增加推理时间确实能提高模型性能,尤其是在仅有500个样本的训练集上,性能提升可达6%-11%。此外,任务的复杂性与所需推理链的长度直接相关,表明对于复杂问题需要更长的思考过程。最后,我们的模型生成的差异性诊断遵循假设演绎法的原则,系统地评估证据以缩小可能的病症范围。'}}}, {'id': 'https://huggingface.co/papers/2501.06282', 'title': 'MinMo: A Multimodal Large Language Model for Seamless Voice Interaction', 'url': 'https://huggingface.co/papers/2501.06282', 'abstract': 'Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo supports controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, full-duplex latency is approximately 600ms in theory and 800ms in practice. 
The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.', 'score': 13, 'issue_id': 1651, 'pub_date': '2025-01-10', 'pub_date_card': {'ru': '10 января', 'en': 'January 10', 'zh': '1月10日'}, 'hash': '2bd352453760208e', 'authors': ['Qian Chen', 'Yafeng Chen', 'Yanni Chen', 'Mengzhe Chen', 'Yingda Chen', 'Chong Deng', 'Zhihao Du', 'Ruize Gao', 'Changfeng Gao', 'Zhifu Gao', 'Yabin Li', 'Xiang Lv', 'Jiaqing Liu', 'Haoneng Luo', 'Bin Ma', 'Chongjia Ni', 'Xian Shi', 'Jialong Tang', 'Hui Wang', 'Hao Wang', 'Wen Wang', 'Yuxuan Wang', 'Yunlan Xu', 'Fan Yu', 'Zhijie Yan', 'Yexin Yang', 'Baosong Yang', 'Xian Yang', 'Guanrou Yang', 'Tianyu Zhao', 'Qinglin Zhang', 'Shiliang Zhang', 'Nan Zhao', 'Pei Zhang', 'Chong Zhang', 'Jinren Zhou'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.06282.jpg', 'data': {'categories': ['#audio', '#multimodal', '#training'], 'emoji': '🗣️', 'ru': {'title': 'MinMo: революция в голосовом ИИ-взаимодействии', 'desc': 'Статья представляет MinMo - мультимодальную большую языковую модель для беспрепятственного голосового взаимодействия. Модель обучена на 1,4 миллионах часов разнообразных речевых данных и широком спектре речевых задач через несколько этапов выравнивания речи и текста. MinMo достигает передовых результатов в понимании и генерации речи, сохраняя при этом возможности текстовых ЯБМ. Модель также поддерживает полнодуплексное общение и управляемую генерацию речи с различными нюансами, включая эмоции, диалекты и темп речи.'}, 'en': {'title': 'MinMo: Revolutionizing Voice Interactions with Multimodal Learning', 'desc': 'This paper presents MinMo, a Multimodal Large Language Model designed for seamless voice interactions, featuring around 8 billion parameters. It overcomes limitations of previous aligned models by employing a multi-stage training approach that includes speech-to-text, text-to-speech, and duplex interaction alignments, utilizing a vast dataset of 1.4 million hours of diverse speech. MinMo achieves state-of-the-art performance in voice comprehension and generation, enabling full-duplex conversations and enhanced instruction-following capabilities for nuanced speech generation. Additionally, it introduces a novel voice decoder that significantly improves voice generation quality compared to earlier models.'}, 'zh': {'title': 'MinMo:无缝语音交互的新突破', 'desc': '本文介绍了一种名为MinMo的多模态大型语言模型,旨在实现无缝的语音交互。MinMo具有约80亿个参数,通过多阶段的对齐训练,克服了以往模型在语音理解和生成方面的局限性。该模型能够支持全双工对话,允许用户与系统进行实时的双向交流。MinMo还具备根据用户指令生成语音的能力,能够调整情感、方言和语速等细节。'}}}, {'id': 'https://huggingface.co/papers/2501.06842', 'title': 'SPAM: Spike-Aware Adam with Momentum Reset for Stable LLM Training', 'url': 'https://huggingface.co/papers/2501.06842', 'abstract': 'Large Language Models (LLMs) have demonstrated exceptional performance across diverse tasks, yet their training remains highly resource-intensive and susceptible to critical challenges such as training instability. A predominant source of this instability stems from gradient and loss spikes, which disrupt the learning process, often leading to costly interventions like checkpoint recovery and experiment restarts, further amplifying inefficiencies. This paper presents a comprehensive investigation into gradient spikes observed during LLM training, revealing their prevalence across multiple architectures and datasets. Our analysis shows that these spikes can be up to 1000times larger than typical gradients, substantially deteriorating model performance. 
To address this issue, we propose Spike-Aware Adam with Momentum Reset (SPAM), a novel optimizer designed to counteract gradient spikes through momentum reset and spike-aware gradient clipping. Extensive experiments, including both pre-training and fine-tuning, demonstrate that SPAM consistently surpasses Adam and its variants across various tasks, including (1) LLM pre-training from 60M to 1B, (2) 4-bit LLM pre-training, (3) reinforcement learning, and (4) Time Series Forecasting. Additionally, SPAM facilitates memory-efficient training by enabling sparse momentum, where only a subset of momentum terms are maintained and updated. When operating under memory constraints, SPAM outperforms state-of-the-art memory-efficient optimizers such as GaLore and Adam-Mini. Our work underscores the importance of mitigating gradient spikes in LLM training and introduces an effective optimization strategy that enhances both training stability and resource efficiency at scale. Code is available at https://github.com/TianjinYellow/SPAM-Optimizer.git', 'score': 10, 'issue_id': 1658, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': 'd5fec659e34cf867', 'authors': ['Tianjin Huang', 'Ziquan Zhu', 'Gaojie Jin', 'Lu Liu', 'Zhangyang Wang', 'Shiwei Liu'], 'affiliations': ['Eindhoven University of Technology', 'University of Exeter', 'University of Leicester', 'University of Oxford', 'University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.06842.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization'], 'emoji': '📈', 'ru': {'title': 'SPAM: Стабильное и эффективное обучение языковых моделей', 'desc': 'Исследователи представили новый оптимизатор SPAM (Spike-Aware Adam with Momentum Reset) для обучения больших языковых моделей (LLM). SPAM предназначен для решения проблемы резких скачков градиентов, которые могут быть в 1000 раз больше обычных и нарушают процесс обучения. Оптимизатор использует сброс импульса и адаптивное ограничение градиента для противодействия этим скачкам. Эксперименты показали, что SPAM превосходит Adam и его варианты в различных задачах, включая предобучение LLM, обучение с подкреплением и прогнозирование временных рядов.'}, 'en': {'title': 'Taming Gradient Spikes for Stable LLM Training with SPAM', 'desc': 'This paper investigates the issue of gradient spikes during the training of Large Language Models (LLMs), which can lead to instability and inefficiencies. These spikes can be significantly larger than normal gradients, negatively impacting model performance and requiring costly interventions. To combat this problem, the authors propose a new optimizer called Spike-Aware Adam with Momentum Reset (SPAM), which incorporates momentum reset and spike-aware gradient clipping. Experimental results show that SPAM outperforms traditional optimizers like Adam in various tasks while also being more memory-efficient.'}, 'zh': {'title': '应对梯度波动,提升训练稳定性!', 'desc': '大型语言模型(LLMs)在多种任务中表现出色,但其训练过程资源消耗大且容易出现不稳定性。研究发现,梯度和损失的剧烈波动是导致训练不稳定的主要原因,这会影响学习过程并增加干预成本。本文提出了一种新型优化器——Spike-Aware Adam with Momentum Reset(SPAM),旨在通过动量重置和梯度剪切来应对梯度波动。实验结果表明,SPAM在多种任务中均优于传统的Adam优化器,显著提高了训练的稳定性和资源效率。'}}}, {'id': 'https://huggingface.co/papers/2501.07574', 'title': 'UnCommon Objects in 3D', 'url': 'https://huggingface.co/papers/2501.07574', 'abstract': 'We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for 3D deep learning and 3D generative AI. 
uCO3D is the largest publicly-available collection of high-resolution videos of objects with 3D annotations that ensures full 360° coverage. uCO3D is significantly more diverse than MVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of higher quality, due to extensive quality checks of both the collected videos and the 3D annotations. Similar to analogous datasets, uCO3D contains annotations for 3D camera poses, depth maps and sparse point clouds. In addition, each object is equipped with a caption and a 3D Gaussian Splat reconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D and obtain superior results using the latter, showing that uCO3D is better for learning applications.', 'score': 7, 'issue_id': 1651, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '79c40f6997052ddd', 'authors': ['Xingchen Liu', 'Piyush Tayal', 'Jianyuan Wang', 'Jesus Zarzar', 'Tom Monnier', 'Konstantinos Tertikas', 'Jiali Duan', 'Antoine Toisoul', 'Jason Y. Zhang', 'Natalia Neverova', 'Andrea Vedaldi', 'Roman Shapovalov', 'David Novotny'], 'affiliations': ['Carnegie Mellon University', 'KAUST', 'Meta AI', 'NKUA, Greece'], 'pdf_title_img': 'assets/pdf/title_img/2501.07574.jpg', 'data': {'categories': ['#dataset', '#open_source', '#synthetic', '#3d'], 'emoji': '🔍', 'ru': {'title': 'uCO3D: Новый стандарт для 3D-данных в машинном обучении', 'desc': 'Авторы представляют новый набор данных uCO3D для глубокого обучения и генеративного ИИ в 3D. Этот датасет содержит высококачественные видео объектов с полным 360-градусным охватом и 3D-аннотациями. uCO3D превосходит аналоги по разнообразию, охватывая более 1000 категорий объектов, и качеству благодаря тщательным проверкам. Помимо стандартных аннотаций, датасет включает подписи к объектам и 3D-реконструкции на основе гауссовых сплатов.'}, 'en': {'title': 'Unlocking 3D Learning with uCO3D: A New Era of Object-Centric Datasets', 'desc': 'The paper presents Uncommon Objects in 3D (uCO3D), a comprehensive dataset designed for advancing 3D deep learning and generative AI. This dataset features high-resolution videos with full 360-degree coverage and includes over 1,000 diverse object categories, making it larger and more varied than existing datasets like MVImgNet and CO3Dv2. uCO3D provides detailed annotations such as 3D camera poses, depth maps, and sparse point clouds, along with captions and 3D Gaussian Splat reconstructions for each object. Experiments demonstrate that training large 3D models on uCO3D yields superior performance compared to other datasets, highlighting its effectiveness for learning applications.'}, 'zh': {'title': 'uCO3D:提升3D学习的全新数据集', 'desc': '我们介绍了一个新的3D深度学习和生成AI数据集,名为Uncommon Objects in 3D(uCO3D)。uCO3D是一个公开可用的高分辨率视频集合,包含360度的3D注释,涵盖超过1000个物体类别,具有更高的多样性和质量。该数据集提供了3D相机姿态、深度图和稀疏点云的注释,并为每个物体配备了描述和3D高斯点云重建。通过在多个数据集上训练大型3D模型,我们发现uCO3D在学习应用中表现更优。'}}}, {'id': 'https://huggingface.co/papers/2501.07171', 'title': 'BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and Vision-Language Models Derived from Scientific Literature', 'url': 'https://huggingface.co/papers/2501.07171', 'abstract': 'The development of vision-language models (VLMs) is driven by large-scale and diverse multimodal datasets. However, progress toward generalist biomedical VLMs is limited by the lack of annotated, publicly accessible datasets across biology and medicine. 
Existing efforts are restricted to narrow domains, missing the full diversity of biomedical knowledge encoded in scientific literature. To address this gap, we introduce BIOMEDICA, a scalable, open-source framework to extract, annotate, and serialize the entirety of the PubMed Central Open Access subset into an easy-to-use, publicly accessible dataset. Our framework produces a comprehensive archive with over 24 million unique image-text pairs from over 6 million articles. Metadata and expert-guided annotations are also provided. We demonstrate the utility and accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style models continuously pre-trained on the BIOMEDICA dataset via streaming, eliminating the need to download 27 TB of data locally. On average, our models achieve state-of-the-art performance across 40 tasks - spanning pathology, radiology, ophthalmology, dermatology, surgery, molecular biology, parasitology, and cell biology - excelling in zero-shot classification with a 6.56% average improvement (as high as 29.8% and 17.5% in dermatology and ophthalmology, respectively), and stronger image-text retrieval, all while using 10x less compute. To foster reproducibility and collaboration, we release our codebase and dataset for the broader research community.', 'score': 3, 'issue_id': 1656, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '07db2230e08b0fde', 'authors': ['Alejandro Lozano', 'Min Woo Sun', 'James Burgess', 'Liangyu Chen', 'Jeffrey J Nirschl', 'Jeffrey Gu', 'Ivan Lopez', 'Josiah Aklilu', 'Austin Wolfgang Katzer', 'Collin Chiu', 'Anita Rau', 'Xiaohan Wang', 'Yuhui Zhang', 'Alfred Seunghoon Song', 'Robert Tibshirani', 'Serena Yeung-Levy'], 'affiliations': ['Department of Biomedical Data Science, Stanford University', 'Department of Computer Science, Stanford University', 'Department of Electrical Engineering, Stanford University', 'Department of Pathology, Stanford University', 'Department of Statistics, Stanford University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07171.jpg', 'data': {'categories': ['#healthcare', '#cv', '#dataset', '#science', '#multimodal', '#open_source'], 'emoji': '🧬', 'ru': {'title': 'BIOMEDICA: Прорыв в обработке биомедицинских данных с помощью ИИ', 'desc': 'Статья представляет BIOMEDICA - масштабируемый фреймворк с открытым исходным кодом для извлечения и аннотирования биомедицинских данных из научной литературы. Фреймворк создал обширный архив из более чем 24 миллионов уникальных пар изображение-текст из более 6 миллионов статей. На основе этого датасета были обучены модели BMCA-CLIP, достигшие state-of-the-art результатов в 40 биомедицинских задачах. Модели показали значительное улучшение в zero-shot классификации и поиске изображений по тексту при использовании в 10 раз меньших вычислительных ресурсов.'}, 'en': {'title': 'Unlocking Biomedical Knowledge with BIOMEDICA', 'desc': 'This paper presents BIOMEDICA, a new framework designed to create a large, open-source dataset from the PubMed Central Open Access subset, which includes over 24 million image-text pairs from scientific articles. The framework addresses the challenge of limited annotated datasets in the biomedical field, enabling the development of generalist vision-language models (VLMs) that can understand diverse biomedical knowledge.
The authors also introduce BMCA-CLIP, a set of models that are continuously pre-trained on this dataset, achieving state-of-the-art performance across various medical tasks with significant improvements in zero-shot classification and image-text retrieval. By making their codebase and dataset publicly available, they aim to enhance reproducibility and collaboration in biomedical research.'}, 'zh': {'title': '推动生物医学领域的视觉语言模型发展', 'desc': '本文介绍了BIOMEDICA,一个可扩展的开源框架,用于提取、注释和序列化PubMed Central开放获取子集的全部内容。该框架生成了一个包含超过2400万个独特图像-文本对的综合档案,来自超过600万篇文章。我们还提供了元数据和专家指导的注释,并展示了BMCA-CLIP模型在40个医学任务中的优越性能,尤其在零样本分类和图像-文本检索方面表现突出。通过发布代码库和数据集,我们促进了研究的可重复性和合作。'}}}, {'id': 'https://huggingface.co/papers/2501.06590', 'title': 'ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning', 'url': 'https://huggingface.co/papers/2501.06590', 'abstract': 'Chemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science. Our code can be found at https://github.com/gersteinlab/chemagent', 'score': 3, 'issue_id': 1651, 'pub_date': '2025-01-11', 'pub_date_card': {'ru': '11 января', 'en': 'January 11', 'zh': '1月11日'}, 'hash': 'c217e826245ef357', 'authors': ['Xiangru Tang', 'Tianyu Hu', 'Muyang Ye', 'Yanjun Shao', 'Xunjian Yin', 'Siru Ouyang', 'Wangchunshu Zhou', 'Pan Lu', 'Zhuosheng Zhang', 'Yilun Zhao', 'Arman Cohan', 'Mark Gerstein'], 'affiliations': ['Shanghai Jiao Tong University', 'Stanford University', 'UIUC', 'Yale University'], 'pdf_title_img': 'assets/pdf/title_img/2501.06590.jpg', 'data': {'categories': ['#science', '#reasoning', '#multimodal', '#agents', '#dataset'], 'emoji': '🧪', 'ru': {'title': 'ChemAgent: Умный помощник для LLM в химических задачах', 'desc': 'ChemAgent - это новая система, улучшающая работу больших языковых моделей (LLM) в задачах химического рассуждения. Она использует динамически обновляемую библиотеку, созданную путем декомпозиции химических задач на подзадачи. При решении новых проблем ChemAgent извлекает и уточняет релевантную информацию из библиотеки, что позволяет эффективно декомпозировать задачи и генерировать решения. 
Система показала значительное превосходство над существующими методами, улучшив производительность LLM до 46% на четырех наборах данных по химическому рассуждению.'}, 'en': {'title': 'Empowering LLMs for Chemical Reasoning with ChemAgent', 'desc': 'This paper introduces ChemAgent, a new framework that enhances large language models (LLMs) for chemical reasoning tasks. It addresses the challenges LLMs face with complex chemical calculations and domain-specific formulas by creating a dynamic library of decomposed sub-tasks. ChemAgent retrieves and refines relevant information from this library, allowing for better task decomposition and solution generation. Experimental results show that ChemAgent significantly improves performance on chemical reasoning datasets, indicating its potential for applications in drug discovery and materials science.'}, 'zh': {'title': 'ChemAgent:提升化学推理的智能助手', 'desc': '化学推理通常涉及复杂的多步骤过程,需要精确的计算,哪怕是微小的错误也可能导致严重的后果。大型语言模型(LLMs)在处理特定领域的公式、准确执行推理步骤和有效整合代码时面临困难。为了解决这些问题,我们提出了ChemAgent,一个通过动态自更新库来提升LLMs性能的新框架。该框架通过将化学任务分解为子任务,并将这些子任务编译成结构化的集合,以便在未来查询时参考,从而实现有效的任务分解和解决方案生成。'}}}, {'id': 'https://huggingface.co/papers/2501.06708', 'title': 'Evaluating Sample Utility for Data Selection by Mimicking Model Weights', 'url': 'https://huggingface.co/papers/2501.06708', 'abstract': "Foundation models rely on large-scale web-crawled datasets, which frequently contain noisy data, biases, and irrelevant content. Existing data selection techniques typically use human heuristics, downstream evaluation datasets, or specialized scoring models, and can overlook samples' utility in the training process. Instead, we propose a new approach, Mimic Score, a data quality metric that uses a pretrained reference model as a guide to assess the usefulness of data samples for training a new model. It relies on the alignment between the gradient of the new model parameters and the vector pointing toward the reference model in weight space. Samples that misalign with this direction are considered low-value and can be filtered out. Motivated by the Mimic score, we develop Grad-Mimic, a data selection framework that identifies and prioritizes useful samples, automating the selection process to create effective filters. Empirically, using Mimic scores to guide model training results in consistent performance gains across six image datasets and enhances the performance of CLIP models. Moreover, Mimic scores and their associated filters improve upon existing filtering methods and offer accurate estimation of dataset quality.", 'score': 2, 'issue_id': 1661, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '7560c17a0e1b7234', 'authors': ['Tzu-Heng Huang', 'Manjot Bilkhu', 'Frederic Sala', 'Javier Movellan'], 'affiliations': ['Apple Inc.', 'University of Wisconsin-Madison'], 'pdf_title_img': 'assets/pdf/title_img/2501.06708.jpg', 'data': {'categories': ['#data', '#optimization', '#dataset', '#ethics', '#training'], 'emoji': '🧠', 'ru': {'title': 'Умный отбор данных для эффективного обучения моделей', 'desc': 'Предложен новый подход к оценке качества данных для обучения моделей машинного обучения - Mimic Score. Этот метод использует предобученную эталонную модель для оценки полезности образцов данных, анализируя выравнивание градиента параметров новой модели с вектором, указывающим на эталонную модель в пространстве весов. На основе Mimic Score разработан фреймворк Grad-Mimic для автоматизированного отбора полезных образцов данных. 
Эксперименты показали, что использование Mimic Score приводит к улучшению производительности моделей на нескольких наборах данных изображений и моделей CLIP.'}, 'en': {'title': 'Enhancing Data Selection with Mimic Score for Better Model Training', 'desc': 'This paper introduces a new method called Mimic Score to improve data selection for training foundation models. It uses a pretrained reference model to evaluate the usefulness of data samples by analyzing the alignment of gradients in weight space. Samples that do not align well with the reference model are deemed low-value and can be removed from the training dataset. The proposed Grad-Mimic framework automates this selection process, leading to better model performance across various image datasets and outperforming existing data filtering techniques.'}, 'zh': {'title': 'Mimic Score:提升数据选择的新方法', 'desc': '基础模型依赖于大规模的网络爬取数据集,这些数据集常常包含噪声数据、偏见和无关内容。现有的数据选择技术通常使用人工启发式方法、下游评估数据集或专门的评分模型,可能会忽视样本在训练过程中的实用性。我们提出了一种新的方法,称为Mimic Score,这是一种数据质量指标,利用预训练的参考模型来评估数据样本对新模型训练的有用性。基于Mimic Score,我们开发了Grad-Mimic数据选择框架,自动识别和优先选择有用样本,从而提高模型训练的效果。'}}}, {'id': 'https://huggingface.co/papers/2501.03262', 'title': 'REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models', 'url': 'https://huggingface.co/papers/2501.03262', 'abstract': 'Reinforcement Learning from Human Feedback (RLHF) has emerged as a critical approach for aligning large language models with human preferences, witnessing rapid algorithmic evolution through methods such as Proximal Policy Optimization (PPO), Direct Preference Optimization (DPO), REINFORCE Leave One-Out (RLOO), ReMax, and Group Relative Policy Optimization (GRPO). We present REINFORCE++, an enhanced variant of the classical REINFORCE algorithm that incorporates key optimization techniques from PPO while eliminating the need for a critic network. REINFORCE++ achieves three primary objectives: (1) simplicity (2) enhanced training stability, and (3) reduced computational overhead. Through extensive empirical evaluation, we demonstrate that REINFORCE++ exhibits superior stability compared to GRPO and achieves greater computational efficiency than PPO while maintaining comparable performance. The implementation is available at https://github.com/OpenRLHF/OpenRLHF.', 'score': 42, 'issue_id': 1553, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a05acf5aab0c07dd', 'authors': ['Jian Hu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.03262.jpg', 'data': {'categories': ['#training', '#rlhf', '#optimization', '#alignment'], 'emoji': '🤖', 'ru': {'title': 'REINFORCE++: Простой и эффективный алгоритм для RLHF', 'desc': 'В статье представлен REINFORCE++, улучшенная версия алгоритма REINFORCE для обучения с подкреплением на основе обратной связи от человека (RLHF). REINFORCE++ сочетает ключевые техники оптимизации из PPO, но не требует использования критической нейронной сети. Алгоритм отличается простотой, повышенной стабильностью обучения и сниженными вычислительными затратами. Эмпирические исследования показывают, что REINFORCE++ демонстрирует лучшую стабильность по сравнению с GRPO и большую вычислительную эффективность, чем PPO, при сохранении сопоставимой производительности.'}, 'en': {'title': 'REINFORCE++: Simplifying Reinforcement Learning with Human Feedback', 'desc': 'This paper introduces REINFORCE++, a new version of the REINFORCE algorithm designed to improve the training of reinforcement learning models using human feedback. 
It combines the strengths of Proximal Policy Optimization (PPO) while removing the need for a critic network, making it simpler and more efficient. The authors highlight that REINFORCE++ offers better training stability and lower computational costs compared to existing methods like GRPO and PPO. Their experiments show that REINFORCE++ performs well while being easier to use and faster to train.'}, 'zh': {'title': 'REINFORCE++:简化与高效的强化学习新选择', 'desc': '强化学习中的人类反馈(RLHF)是一种重要的方法,用于使大型语言模型更符合人类的偏好。本文提出了REINFORCE++,这是经典REINFORCE算法的增强版本,结合了PPO的优化技术,并且不再需要评论网络。REINFORCE++的主要目标是实现简单性、提高训练稳定性和减少计算开销。通过大量实证评估,我们证明了REINFORCE++在稳定性上优于GRPO,并且在计算效率上超过PPO,同时保持了相似的性能。'}}}, {'id': 'https://huggingface.co/papers/2501.02955', 'title': 'MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models', 'url': 'https://huggingface.co/papers/2501.02955', 'abstract': "In recent years, vision language models (VLMs) have made significant advancements in video understanding. However, a crucial capability - fine-grained motion comprehension - remains under-explored in current benchmarks. To address this gap, we propose MotionBench, a comprehensive evaluation benchmark designed to assess the fine-grained motion comprehension of video understanding models. MotionBench evaluates models' motion-level perception through six primary categories of motion-oriented question types and includes data collected from diverse sources, ensuring a broad representation of real-world video content. Experimental results reveal that existing VLMs perform poorly in understanding fine-grained motions. To enhance VLM's ability to perceive fine-grained motion within a limited sequence length of LLM, we conduct extensive experiments reviewing VLM architectures optimized for video feature compression and propose a novel and efficient Through-Encoder (TE) Fusion method. Experiments show that higher frame rate inputs and TE Fusion yield improvements in motion understanding, yet there is still substantial room for enhancement. Our benchmark aims to guide and motivate the development of more capable video understanding models, emphasizing the importance of fine-grained motion comprehension. Project page: https://motion-bench.github.io .", 'score': 30, 'issue_id': 1551, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'a7051c2d239484b4', 'authors': ['Wenyi Hong', 'Yean Cheng', 'Zhuoyi Yang', 'Weihan Wang', 'Lefan Wang', 'Xiaotao Gu', 'Shiyu Huang', 'Yuxiao Dong', 'Jie Tang'], 'affiliations': ['Tsinghua University', 'Zhipu AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.02955.jpg', 'data': {'categories': ['#architecture', '#optimization', '#benchmark', '#video'], 'emoji': '🎥', 'ru': {'title': 'MotionBench: новый рубеж в понимании движения для моделей компьютерного зрения', 'desc': 'Статья представляет новый бенчмарк MotionBench для оценки способности моделей компьютерного зрения понимать детальные движения в видео. Авторы обнаружили, что существующие модели плохо справляются с этой задачей. Для улучшения результатов предложен новый метод Through-Encoder Fusion, а также использование видео с более высокой частотой кадров. Бенчмарк призван стимулировать развитие более совершенных моделей понимания видео.'}, 'en': {'title': 'Enhancing Video Understanding with Fine-Grained Motion Comprehension', 'desc': "This paper introduces MotionBench, a new benchmark for evaluating how well vision language models (VLMs) understand fine-grained motion in videos. 
It identifies a gap in current models' abilities to comprehend detailed motion, which is crucial for accurate video analysis. The benchmark includes various motion-oriented question types and diverse video data to ensure comprehensive testing. The authors also propose a Through-Encoder Fusion method to improve VLM performance, highlighting the need for further advancements in fine-grained motion comprehension."}, 'zh': {'title': '提升视频理解的细粒度运动能力', 'desc': '近年来,视觉语言模型(VLMs)在视频理解方面取得了显著进展。然而,细粒度运动理解这一关键能力在当前基准测试中仍未得到充分探索。为了解决这一问题,我们提出了MotionBench,这是一个全面的评估基准,旨在评估视频理解模型的细粒度运动理解能力。实验结果表明,现有的VLM在理解细粒度运动方面表现不佳,因此我们提出了一种新颖的Through-Encoder(TE)融合方法,以提高模型的运动理解能力。'}}}, {'id': 'https://huggingface.co/papers/2501.03575', 'title': 'Cosmos World Foundation Model Platform for Physical AI', 'url': 'https://huggingface.co/papers/2501.03575', 'abstract': 'Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.', 'score': 25, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'f4b2044cbc1076a8', 'authors': ['NVIDIA', ':', 'Niket Agarwal', 'Arslan Ali', 'Maciej Bala', 'Yogesh Balaji', 'Erik Barker', 'Tiffany Cai', 'Prithvijit Chattopadhyay', 'Yongxin Chen', 'Yin Cui', 'Yifan Ding', 'Daniel Dworakowski', 'Jiaojiao Fan', 'Michele Fenzi', 'Francesco Ferroni', 'Sanja Fidler', 'Dieter Fox', 'Songwei Ge', 'Yunhao Ge', 'Jinwei Gu', 'Siddharth Gururani', 'Ethan He', 'Jiahui Huang', 'Jacob Huffman', 'Pooya Jannaty', 'Jingyi Jin', 'Seung Wook Kim', 'Gergely Klár', 'Grace Lam', 'Shiyi Lan', 'Laura Leal-Taixe', 'Anqi Li', 'Zhaoshuo Li', 'Chen-Hsuan Lin', 'Tsung-Yi Lin', 'Huan Ling', 'Ming-Yu Liu', 'Xian Liu', 'Alice Luo', 'Qianli Ma', 'Hanzi Mao', 'Kaichun Mo', 'Arsalan Mousavian', 'Seungjun Nah', 'Sriharsha Niverty', 'David Page', 'Despoina Paschalidou', 'Zeeshan Patel', 'Lindsey Pavao', 'Morteza Ramezanali', 'Fitsum Reda', 'Xiaowei Ren', 'Vasanth Rao Naik Sabavat', 'Ed Schmerling', 'Stella Shi', 'Bartosz Stefaniak', 'Shitao Tang', 'Lyne Tchapmi', 'Przemek Tredak', 'Wei-Cheng Tseng', 'Jibin Varghese', 'Hao Wang', 'Haoxiang Wang', 'Heng Wang', 'Ting-Chun Wang', 'Fangyin Wei', 'Xinyue Wei', 'Jay Zhangjie Wu', 'Jiashu Xu', 'Wei Yang', 'Lin Yen-Chen', 'Xiaohui Zeng', 'Yu Zeng', 'Jing Zhang', 'Qinsheng Zhang', 'Yuxuan Zhang', 'Qingqing Zhao', 'Artur Zolkowski'], 'affiliations': ['NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2501.03575.jpg', 'data': {'categories': ['#open_source', '#data', '#benchmark', '#architecture', '#video', '#multimodal', '#dataset', '#training'], 'emoji': '🌍', 'ru': {'title': 'Цифровой двойник мира для обучения физического ИИ', 'desc': 'Статья представляет платформу Cosmos World Foundation Model для разработки моделей мира в физическом ИИ. 
Авторы предлагают концепцию базовой модели мира, которую можно дообучать для конкретных приложений. Платформа включает конвейер курации видео, предобученные базовые модели мира, примеры дообучения и токенизаторы видео. Проект открытый и доступен на GitHub для помощи разработчикам физического ИИ в решении важных проблем общества.'}, 'en': {'title': 'Empowering Physical AI with Customizable World Models', 'desc': 'This paper introduces the Cosmos World Foundation Model Platform, designed to assist developers in creating tailored world models for Physical AI systems. It emphasizes the necessity of having a digital twin of both the AI and its environment to enable effective training. The platform includes a comprehensive video curation pipeline, pre-trained models, and tools for fine-tuning these models for specific applications. By making the platform and models open-source, the authors aim to empower developers to address significant societal challenges using Physical AI.'}, 'zh': {'title': '构建物理AI的数字双胞胎与世界模型', 'desc': '这篇论文介绍了物理人工智能(Physical AI)在数字训练中的重要性。为了实现这一目标,需要构建一个数字双胞胎(digital twin)和一个世界模型(world model)。我们提出了Cosmos世界基础模型平台,帮助开发者为物理人工智能定制世界模型。该平台提供了视频策划管道、预训练的世界基础模型以及后训练示例,旨在解决社会中的关键问题,并且是开源的。'}}}, {'id': 'https://huggingface.co/papers/2501.03895', 'title': 'LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One Vision Token', 'url': 'https://huggingface.co/papers/2501.03895', 'abstract': 'The advent of real-time large multimodal models (LMMs) like GPT-4o has sparked considerable interest in efficient LMMs. LMM frameworks typically encode visual inputs into vision tokens (continuous representations) and integrate them and textual instructions into the context of large language models (LLMs), where large-scale parameters and numerous context tokens (predominantly vision tokens) result in substantial computational overhead. Previous efforts towards efficient LMMs always focus on replacing the LLM backbone with smaller models, while neglecting the crucial issue of token quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal vision tokens. To achieve a high compression ratio of vision tokens while preserving visual information, we first analyze how LMMs understand vision tokens and find that most vision tokens only play a crucial role in the early layers of LLM backbone, where they mainly fuse visual information into text tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to fuse visual information into text tokens in advance, thereby facilitating the extreme compression of vision tokens fed to LLM backbone into one token. LLaVA-Mini is a unified large multimodal model that can support the understanding of images, high-resolution images, and videos in an efficient manner. Experiments across 11 image-based and 7 video-based benchmarks demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token instead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by 77%, deliver low-latency responses within 40 milliseconds, and process over 10,000 frames of video on the GPU hardware with 24GB of memory.', 'score': 19, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '925d2f81d6fcbb0b', 'authors': ['Shaolei Zhang', 'Qingkai Fang', 'Zhe Yang', 'Yang Feng'], 'affiliations': ['Key Laboratory of AI Safety, Chinese Academy of Sciences', 'Key Laboratory of Intelligent Information Processing, Institute of Computing Technology, Chinese Academy of Sciences (ICT/CAS)', 'University of Chinese Academy of Sciences, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03895.jpg', 'data': {'categories': ['#agi', '#video', '#multimodal', '#architecture', '#optimization', '#cv', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Эффективность через минимизацию: революция в мультимодальных моделях', 'desc': 'Статья представляет LLaVA-Mini - эффективную мультимодальную модель с минимальным количеством визуальных токенов. Авторы обнаружили, что большинство визуальных токенов играют ключевую роль только в ранних слоях языковой модели. LLaVA-Mini вводит предварительное слияние модальностей, чтобы объединить визуальную информацию с текстовыми токенами заранее. Эксперименты показывают, что LLaVA-Mini превосходит LLaVA-v1.5, используя всего 1 визуальный токен вместо 576, что значительно повышает эффективность обработки.'}, 'en': {'title': 'Maximizing Efficiency with Minimal Vision Tokens in LMMs', 'desc': 'This paper presents LLaVA-Mini, an efficient large multimodal model (LMM) designed to reduce the number of vision tokens while maintaining visual information integrity. The authors identify that most vision tokens are primarily important in the early layers of the language model, where they integrate visual data with text. By implementing a technique called modality pre-fusion, LLaVA-Mini compresses the input from 576 vision tokens to just one, significantly enhancing efficiency. Experimental results show that LLaVA-Mini not only outperforms its predecessor but also achieves a 77% reduction in computational load and rapid processing times for high-resolution images and videos.'}, 'zh': {'title': '高效多模态模型LLaVA-Mini的创新之路', 'desc': '本文介绍了一种高效的多模态模型LLaVA-Mini,该模型通过减少视觉标记的数量来提高效率。研究发现,大多数视觉标记在大型语言模型的早期层中起着关键作用,因此可以在此之前将视觉信息与文本标记融合。LLaVA-Mini采用了模态预融合的方法,将视觉信息提前融合,从而将输入到语言模型的视觉标记压缩为一个标记。实验结果表明,LLaVA-Mini在多个基准测试中表现优于之前的模型,且显著降低了计算复杂度和延迟。'}}}, {'id': 'https://huggingface.co/papers/2501.04001', 'title': 'Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos', 'url': 'https://huggingface.co/papers/2501.04001', 'abstract': 'This work presents Sa2VA, the first unified model for dense grounded understanding of both images and videos. Unlike existing multi-modal large language models, which are often limited to specific modalities and tasks, Sa2VA supports a wide range of image and video tasks, including referring segmentation and conversation, with minimal one-shot instruction tuning. Sa2VA combines SAM-2, a foundation video segmentation model, with LLaVA, an advanced vision-language model, and unifies text, image, and video into a shared LLM token space. Using the LLM, Sa2VA generates instruction tokens that guide SAM-2 in producing precise masks, enabling a grounded, multi-modal understanding of both static and dynamic visual content. 
Additionally, we introduce Ref-SAV, an auto-labeled dataset containing over 72k object expressions in complex video scenes, designed to boost model performance. We also manually validate 2k video objects in the Ref-SAV datasets to benchmark referring video object segmentation in complex environments. Experiments show that Sa2VA achieves state-of-the-art across multiple tasks, particularly in referring video object segmentation, highlighting its potential for complex real-world applications.', 'score': 16, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'd079946bf74858cd', 'authors': ['Haobo Yuan', 'Xiangtai Li', 'Tao Zhang', 'Zilong Huang', 'Shilin Xu', 'Shunping Ji', 'Yunhai Tong', 'Lu Qi', 'Jiashi Feng', 'Ming-Hsuan Yang'], 'affiliations': ['Bytedance Seed', 'Peking University', 'UC Merced', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2501.04001.jpg', 'data': {'categories': ['#dataset', '#multimodal', '#benchmark', '#cv'], 'emoji': '🎥', 'ru': {'title': 'Sa2VA: Единая модель для понимания изображений и видео', 'desc': 'Sa2VA - это первая унифицированная модель для плотного заземленного понимания изображений и видео. Она объединяет SAM-2 (модель сегментации видео) с LLaVA (продвинутой моделью компьютерного зрения и языка) в едином пространстве токенов большой языковой модели. Sa2VA генерирует токены инструкций, направляющие SAM-2 в создании точных масок, что позволяет осуществлять заземленное мультимодальное понимание как статического, так и динамического визуального контента. Модель достигает передовых результатов в различных задачах, особенно в сегментации объектов по ссылкам в видео.'}, 'en': {'title': 'Sa2VA: Unifying Image and Video Understanding for Enhanced Multi-Modal Tasks', 'desc': 'Sa2VA is a groundbreaking model that integrates image and video understanding into a single framework. It combines the strengths of SAM-2 for video segmentation and LLaVA for vision-language tasks, allowing it to handle various multi-modal tasks with minimal tuning. By creating a shared token space for text, images, and videos, Sa2VA can generate specific instruction tokens that help in accurately segmenting objects in both images and videos. The introduction of the Ref-SAV dataset further enhances its capabilities, enabling it to achieve top performance in complex visual environments.'}, 'zh': {'title': 'Sa2VA:图像与视频的统一理解模型', 'desc': '本研究提出了Sa2VA,这是第一个统一的模型,能够对图像和视频进行密集的基础理解。与现有的多模态大型语言模型不同,Sa2VA支持多种图像和视频任务,包括引用分割和对话,且只需最少的一次性指令调优。Sa2VA结合了基础视频分割模型SAM-2和先进的视觉语言模型LLaVA,将文本、图像和视频统一到共享的LLM令牌空间中。实验表明,Sa2VA在多个任务上达到了最先进的水平,特别是在引用视频对象分割方面,展示了其在复杂现实应用中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.03847', 'title': 'Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control', 'url': 'https://huggingface.co/papers/2501.03847', 'abstract': 'Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. 
Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation.', 'score': 11, 'issue_id': 1552, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '975d5fa9d59bde28', 'authors': ['Zekai Gu', 'Rui Yan', 'Jiahao Lu', 'Peng Li', 'Zhiyang Dou', 'Chenyang Si', 'Zhen Dong', 'Qifeng Liu', 'Cheng Lin', 'Ziwei Liu', 'Wenping Wang', 'Yuan Liu'], 'affiliations': ['Hong Kong University of Science and Technology, China', 'Nanyang Technological University, Singapore', 'Texas A&M University, U.S.A', 'The University of Hong Kong, China', 'Wuhan University, China', 'Zhejiang University, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.03847.jpg', 'data': {'categories': ['#video', '#diffusion', '#3d'], 'emoji': '🎬', 'ru': {'title': 'DaS: Универсальный контроль над генерацией видео через 3D-сигналы', 'desc': 'Авторы представляют новый подход под названием Diffusion as Shader (DaS) для контролируемой генерации видео с помощью диффузионных моделей. В отличие от существующих методов, ограниченных одним типом контроля, DaS поддерживает множество задач управления видео в единой архитектуре. Ключевая идея заключается в использовании 3D-сигналов управления, что делает процесс диффузии видео изначально 3D-ориентированным. DaS демонстрирует сильные возможности управления в различных задачах, включая генерацию видео из 3D-моделей, контроль камеры, перенос движения и манипуляции с объектами.'}, 'en': {'title': 'Empowering Video Generation with 3D Control Signals', 'desc': 'This paper presents Diffusion as Shader (DaS), a new method for generating videos that allows for precise control over various aspects of video creation. Unlike previous models that only used 2D control signals, DaS utilizes 3D tracking videos, which helps in managing the dynamic nature of video content. This approach enables users to manipulate video elements like camera angles and object movements more effectively. The results show that DaS can maintain high-quality video generation while ensuring temporal consistency across frames, even with limited training data.'}, 'zh': {'title': '多样化视频控制的新方法:扩散作为着色器', 'desc': '扩散模型在从文本提示或图像生成高质量视频方面表现出色。然而,精确控制视频生成过程,如相机操作或内容编辑,仍然是一个重大挑战。现有的受控视频生成方法通常仅限于单一控制类型,缺乏处理多样化控制需求的灵活性。本文提出了一种新方法——扩散作为着色器(DaS),它在统一架构中支持多种视频控制任务,利用3D控制信号来实现更灵活的视频控制。'}}}, {'id': 'https://huggingface.co/papers/2501.03936', 'title': 'PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides', 'url': 'https://huggingface.co/papers/2501.03936', 'abstract': 'Automatically generating presentations from documents is a challenging task that requires balancing content quality, visual design, and structural coherence. 
Existing methods primarily focus on improving and evaluating the content quality in isolation, often overlooking visual design and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to understand their structural patterns and content schemas, then drafts outlines and generates slides through code actions to ensure consistency and alignment. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Experiments show that PPTAgent significantly outperforms traditional automatic presentation generation methods across all three dimensions. The code and data are available at https://github.com/icip-cas/PPTAgent.', 'score': 7, 'issue_id': 1557, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '57bb4703056c9e20', 'authors': ['Hao Zheng', 'Xinyan Guan', 'Hao Kong', 'Jia Zheng', 'Hongyu Lin', 'Yaojie Lu', 'Ben He', 'Xianpei Han', 'Le Sun'], 'affiliations': ['Chinese Information Processing Laboratory, Institute of Software, Chinese Academy of Sciences', 'Shanghai Jiexin Technology', 'University of Chinese Academy of Sciences'], 'pdf_title_img': 'assets/pdf/title_img/2501.03936.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🎭', 'ru': {'title': 'PPTAgent: ИИ-помощник для создания презентаций нового уровня', 'desc': 'Исследователи представили PPTAgent - систему для автоматического создания презентаций из документов. В отличие от существующих методов, PPTAgent улучшает не только качество контента, но и визуальный дизайн и структурную согласованность. Система использует двухэтапный подход, вдохновленный рабочим процессом человека: сначала анализирует образцы презентаций, затем создает слайды с помощью программных действий. Авторы также разработали фреймворк PPTEval для комплексной оценки генерируемых презентаций.'}, 'en': {'title': 'PPTAgent: Elevating Presentation Generation with Content, Design, and Coherence', 'desc': 'This paper presents PPTAgent, a novel approach for automatically generating presentations from documents. Unlike existing methods that focus solely on content quality, PPTAgent enhances the overall presentation by considering visual design and structural coherence as well. It employs a two-stage, edit-based process that first analyzes reference presentations to extract patterns and then generates slides through code actions. 
Additionally, the authors introduce PPTEval, a framework for evaluating presentations based on content, design, and coherence, demonstrating that PPTAgent outperforms traditional methods in all areas.'}, 'zh': {'title': '智能生成高质量演示文稿的解决方案', 'desc': '本文提出了一种名为PPTAgent的自动生成演示文稿的方法。该方法通过两阶段的编辑式流程,综合考虑内容质量、视觉设计和结构一致性。PPTAgent首先分析参考演示文稿,以理解其结构模式和内容框架,然后通过代码操作草拟大纲并生成幻灯片。为了全面评估生成演示文稿的质量,本文还引入了PPTEval评估框架,从内容、设计和一致性三个维度进行评估。'}}}, {'id': 'https://huggingface.co/papers/2501.03714', 'title': 'MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval Adjustment for Compact Dynamic 3D Gaussian Splatting', 'url': 'https://huggingface.co/papers/2501.03714', 'abstract': '3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting framework designed for reconstructing novel views in challenging scenarios with complex motions. We introduce Global-to-Local Motion Decomposition (GLMD) to effectively capture dynamic motions in a coarse-to-fine manner. This approach leverages Global Canonical Scaffolds (Global CS) and Local Canonical Scaffolds (Local CS), extending static Scaffold representation to dynamic video reconstruction. For Global CS, we propose Global Anchor Deformation (GAD) to efficiently represent global dynamics along complex motions, by directly deforming the implicit Scaffold attributes which are anchor position, offset, and local context features. Next, we finely adjust local motions via the Local Gaussian Deformation (LGD) of Local CS explicitly. Additionally, we introduce Temporal Interval Adjustment (TIA) to automatically control the temporal coverage of each Local CS during training, allowing MoDecGS to find optimal interval assignments based on the specified number of temporal segments. Extensive evaluations demonstrate that MoDecGS achieves an average 70% reduction in model size over state-of-the-art methods for dynamic 3D Gaussians from real-world dynamic videos while maintaining or even improving rendering quality.', 'score': 5, 'issue_id': 1556, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': 'c6cfa761edc047da', 'authors': ['Sangwoon Kwak', 'Joonsoo Kim', 'Jun Young Jeong', 'Won-Sik Cheong', 'Jihyong Oh', 'Munchurl Kim'], 'affiliations': ['Chung-Ang University', 'Electronics and Telecommunications Research Institute', 'Korea Advanced Institute of Science and Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.03714.jpg', 'data': {'categories': ['#3d'], 'emoji': '🎭', 'ru': {'title': 'Эффективное представление сложных движений в динамических сценах', 'desc': 'MoDecGS - это новый фреймворк для эффективной реконструкции динамических сцен с использованием 3D Gaussian Splatting. Он вводит метод Global-to-Local Motion Decomposition (GLMD) для захвата сложных движений, используя Global Canonical Scaffolds и Local Canonical Scaffolds. Фреймворк также включает Global Anchor Deformation (GAD) для представления глобальной динамики и Local Gaussian Deformation (LGD) для точной настройки локальных движений.
MoDecGS демонстрирует значительное сокращение размера модели при сохранении или улучшении качества рендеринга по сравнению с существующими методами.'}, 'en': {'title': 'Efficient Dynamic Scene Rendering with MoDecGS', 'desc': 'The paper presents MoDecGS, a new framework for 3D Gaussian Splatting that efficiently handles dynamic scenes in neural rendering. It introduces Global-to-Local Motion Decomposition (GLMD) to capture complex motions using both Global and Local Canonical Scaffolds. The method employs Global Anchor Deformation (GAD) for global dynamics and Local Gaussian Deformation (LGD) for fine-tuning local motions. MoDecGS significantly reduces model size by 70% compared to existing methods while enhancing rendering quality, making it suitable for real-world dynamic video reconstruction.'}, 'zh': {'title': '高效动态场景重建的新方法', 'desc': '3D高斯点云(3DGS)在场景表示和神经渲染方面取得了显著进展,但在处理动态场景时仍面临存储需求和复杂运动表示的挑战。为了解决这些问题,我们提出了MoDecGS,一个内存高效的高斯点云框架,旨在重建具有复杂运动的新视角。我们引入了全局到局部运动分解(GLMD),以粗到细的方式有效捕捉动态运动,并扩展了静态支架表示以适应动态视频重建。通过全局锚点变形(GAD)和局部高斯变形(LGD),MoDecGS在保持或提高渲染质量的同时,平均减少了70%的模型大小。'}}}, {'id': 'https://huggingface.co/papers/2501.03931', 'title': 'Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers', 'url': 'https://huggingface.co/papers/2501.03931', 'abstract': 'We present Magic Mirror, a framework for generating identity-preserved videos with cinematic-level quality and dynamic motion. While recent advances in video diffusion models have shown impressive capabilities in text-to-video generation, maintaining consistent identity while producing natural motion remains challenging. Previous methods either require person-specific fine-tuning or struggle to balance identity preservation with motion diversity. Built upon Video Diffusion Transformers, our method introduces three key components: (1) a dual-branch facial feature extractor that captures both identity and structural features, (2) a lightweight cross-modal adapter with Conditioned Adaptive Normalization for efficient identity integration, and (3) a two-stage training strategy combining synthetic identity pairs with video data. Extensive experiments demonstrate that Magic Mirror effectively balances identity consistency with natural motion, outperforming existing methods across multiple metrics while requiring minimal parameters added. The code and model will be made publicly available at: https://github.com/dvlab-research/MagicMirror/', 'score': 4, 'issue_id': 1550, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '1c9696a99b57f781', 'authors': ['Yuechen Zhang', 'Yaoyang Liu', 'Bin Xia', 'Bohao Peng', 'Zexin Yan', 'Eric Lo', 'Jiaya Jia'], 'affiliations': ['CMU', 'CUHK', 'HKUST', 'SmartMore'], 'pdf_title_img': 'assets/pdf/title_img/2501.03931.jpg', 'data': {'categories': ['#training', '#video', '#multimodal', '#open_source', '#synthetic', '#architecture', '#diffusion'], 'emoji': '🪞', 'ru': {'title': 'Магическое зеркало: видео с сохранением личности и естественным движением', 'desc': 'Magic Mirror - это новая система для создания видео с сохранением идентичности и кинематографическим качеством. Она использует модель видеодиффузии и вводит три ключевых компонента: двойной экстрактор лицевых признаков, легкий кросс-модальный адаптер и двухэтапную стратегию обучения. Система эффективно сочетает сохранение идентичности с естественным движением, превосходя существующие методы по нескольким метрикам.
Magic Mirror требует минимального добавления параметров и будет доступна в открытом доступе.'}, 'en': {'title': 'Magic Mirror: Identity-Preserved Video Generation with Cinematic Quality', 'desc': 'Magic Mirror is a new framework designed to create high-quality videos that maintain the identity of individuals while showcasing dynamic motion. It addresses the challenges faced by previous video generation methods, which often struggled to keep a consistent identity or required extensive fine-tuning for specific individuals. The framework utilizes Video Diffusion Transformers and introduces innovative components like a dual-branch facial feature extractor and a cross-modal adapter to enhance identity integration. Through a two-stage training approach, Magic Mirror achieves a remarkable balance between identity preservation and natural motion, outperforming existing techniques with fewer additional parameters.'}, 'zh': {'title': 'Magic Mirror:保持身份一致的动态视频生成', 'desc': '本文介绍了Magic Mirror,一个用于生成保持身份一致的视频框架,具有电影级质量和动态运动。尽管最近的视频扩散模型在文本到视频生成方面取得了显著进展,但在生成自然运动的同时保持一致的身份仍然具有挑战性。我们的方法基于视频扩散变换器,提出了三个关键组件,以有效整合身份信息并保持运动多样性。实验结果表明,Magic Mirror在多个指标上超越了现有方法,同时增加的参数极少。'}}}, {'id': 'https://huggingface.co/papers/2501.03916', 'title': 'Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback', 'url': 'https://huggingface.co/papers/2501.03916', 'abstract': 'The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers which are ranked by the topic and task attributes. Then, the codes are automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on the benchmark datasets of different topics and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. 
We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification.', 'score': 3, 'issue_id': 1555, 'pub_date': '2025-01-07', 'pub_date_card': {'ru': '7 января', 'en': 'January 7', 'zh': '1月7日'}, 'hash': '9a18a60e788b7840', 'authors': ['Jiakang Yuan', 'Xiangchao Yan', 'Botian Shi', 'Tao Chen', 'Wanli Ouyang', 'Bo Zhang', 'Lei Bai', 'Yu Qiao', 'Bowen Zhou'], 'affiliations': ['Fudan University', 'Shanghai Artificial Intelligence Laboratory'], 'pdf_title_img': 'assets/pdf/title_img/2501.03916.jpg', 'data': {'categories': ['#open_source', '#agents', '#science', '#3d', '#cv', '#benchmark', '#dataset'], 'emoji': '🐬', 'ru': {'title': 'Dolphin: ИИ-ассистент для полного цикла научных исследований', 'desc': 'Статья представляет Dolphin - первую замкнутую систему для автоматического проведения научных исследований. Dolphin генерирует идеи на основе релевантных статей, автоматически создает и отлаживает код для экспериментов, а затем анализирует результаты. Система способна непрерывно генерировать новые идеи и проводить эксперименты в цикле. Эксперименты показали, что Dolphin может предлагать методы, сопоставимые с современными подходами в некоторых задачах машинного обучения.'}, 'en': {'title': 'Dolphin: Automating Scientific Research with AI', 'desc': 'This paper introduces Dolphin, an innovative framework designed to automate the scientific research process. Dolphin operates in a closed-loop system, generating research ideas, conducting experiments, and analyzing results to refine future ideas. It utilizes AI to rank relevant literature and automatically generate and debug code, enhancing research efficiency. The framework has been tested on various benchmark datasets, demonstrating its ability to produce novel ideas and achieve results comparable to leading methods in tasks like image classification.'}, 'zh': {'title': 'Dolphin:自动化科学研究的新纪元', 'desc': '这篇论文介绍了一个名为Dolphin的闭环开放式自动研究框架,旨在提升科学研究的效率。Dolphin能够生成研究想法、进行实验,并根据实验结果反馈生成更高质量的想法。具体来说,Dolphin首先根据相关论文生成新想法,然后自动生成和调试代码,最后分析每个想法的结果并反馈到下一轮生成中。实验结果表明,Dolphin能够持续生成新想法,并在循环中完成实验,且在某些任务上与最先进的方法相当。'}}}, {'id': 'https://huggingface.co/papers/2501.02260', 'title': 'MagicFace: High-Fidelity Facial Expression Editing with Action-Unit Control', 'url': 'https://huggingface.co/papers/2501.02260', 'abstract': "We address the problem of facial expression editing by controlling the relative variation of facial action-unit (AU) from the same person. This enables us to edit this specific person's expression in a fine-grained, continuous and interpretable manner, while preserving their identity, pose, background and detailed facial attributes. Key to our model, which we dub MagicFace, is a diffusion model conditioned on AU variations and an ID encoder to preserve facial details of high consistency. Specifically, to preserve the facial details with the input identity, we leverage the power of pretrained Stable-Diffusion models and design an ID encoder to merge appearance features through self-attention. To keep background and pose consistency, we introduce an efficient Attribute Controller by explicitly informing the model of current background and pose of the target. By injecting AU variations into a denoising UNet, our model can animate arbitrary identities with various AU combinations, yielding superior results in high-fidelity expression editing compared to other facial expression editing works.
Code is publicly available at https://github.com/weimengting/MagicFace.", 'score': 3, 'issue_id': 1550, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': '9eeeb5b132839793', 'authors': ['Mengting Wei', 'Tuomas Varanka', 'Xingxun Jiang', 'Huai-Qian Khor', 'Guoying Zhao'], 'affiliations': ['Center for Machine Vision and Signal Analysis, Faculty of Information Technology and Electrical Engineering, University of Oulu, Oulu, FI-90014, Finland', 'Key Laboratory of Child Development and Learning Science of Ministry of Education, School of Biological Sciences and Medical Engineering, Southeast University, Nanjing 210096, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.02260.jpg', 'data': {'categories': ['#multimodal', '#diffusion', '#open_source', '#cv'], 'emoji': '🎭', 'ru': {'title': 'Точное редактирование мимики с сохранением личности', 'desc': 'Статья представляет новый подход к редактированию мимики лица с использованием диффузионной модели, названной MagicFace. Модель позволяет точно и интерпретируемо изменять выражение лица конкретного человека, сохраняя его идентичность, позу и фоновые детали. Ключевым элементом является условная генерация на основе вариаций лицевых единиц действия (AU) и использование ID-энкодера для сохранения деталей лица. MagicFace демонстрирует превосходные результаты в высококачественном редактировании выражений лица по сравнению с другими методами.'}, 'en': {'title': 'MagicFace: Fine-Grained Facial Expression Editing with Consistent Identity', 'desc': 'This paper presents a method for editing facial expressions while maintaining the identity and other attributes of the person. The proposed model, named MagicFace, utilizes a diffusion model that is conditioned on facial action unit (AU) variations, allowing for fine-grained control over expressions. It incorporates a pretrained Stable-Diffusion model and an ID encoder to ensure high consistency in facial details. Additionally, an Attribute Controller is introduced to maintain background and pose consistency during the editing process, resulting in high-fidelity expression animations.'}, 'zh': {'title': '魔法面孔:高保真面部表情编辑的创新之路', 'desc': '我们提出了一种面部表情编辑的方法,通过控制同一人的面部动作单元(AU)的相对变化来实现。这种方法可以细致、连续且可解释地编辑特定人的表情,同时保持他们的身份、姿势、背景和面部细节。我们的模型称为MagicFace,核心是一个基于AU变化的扩散模型和一个ID编码器,以保持面部细节的一致性。通过将AU变化注入去噪UNet,我们的模型能够以高保真度编辑面部表情,效果优于其他相关工作。'}}}, {'id': 'https://huggingface.co/papers/2501.02790', 'title': 'Segmenting Text and Learning Their Rewards for Improved RLHF in Language Model', 'url': 'https://huggingface.co/papers/2501.02790', 'abstract': 'Reinforcement learning from human feedback (RLHF) has been widely adopted to align language models (LMs) with human preference. Prior RLHF works typically take a bandit formulation, which, though intuitive, ignores the sequential nature of LM generation and can suffer from the sparse reward issue. While recent works propose dense token-level RLHF, treating each token as an action may be oversubtle to proper reward assignment. In this paper, we seek to get the best of both by training and utilizing a segment-level reward model, which assigns a reward to each semantically complete text segment that spans over a short sequence of tokens. For reward learning, our method allows dynamic text segmentation and compatibility with standard sequence-preference datasets. 
For effective RL-based LM training against segment reward, we generalize the classical scalar bandit reward normalizers into location-aware normalizer functions and interpolate the segment reward for further densification. With these designs, our method performs competitively on three popular RLHF benchmarks for LM policy: AlpacaEval 2.0, Arena-Hard, and MT-Bench. Ablation studies are conducted to further demonstrate our method.', 'score': 2, 'issue_id': 1562, 'pub_date': '2025-01-06', 'pub_date_card': {'ru': '6 января', 'en': 'January 6', 'zh': '1月6日'}, 'hash': 'bd19e4a3e48539d4', 'authors': ['Yueqin Yin', 'Shentao Yang', 'Yujia Xie', 'Ziyi Yang', 'Yuting Sun', 'Hany Awadalla', 'Weizhu Chen', 'Mingyuan Zhou'], 'affiliations': ['Microsoft', 'The University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2501.02790.jpg', 'data': {'categories': ['#training', '#reasoning', '#alignment', '#rlhf', '#benchmark'], 'emoji': '🧠', 'ru': {'title': 'Сегментарный RLHF: золотая середина между токенами и бандитами', 'desc': 'Данная статья представляет новый подход к обучению языковых моделей с подкреплением на основе обратной связи от человека (RLHF). Авторы предлагают использовать сегментарную модель вознаграждения, которая присваивает награду семантически завершенным текстовым сегментам. Метод позволяет динамическую сегментацию текста и совместим со стандартными наборами данных последовательных предпочтений. Для эффективного RL-обучения языковой модели авторы обобщают классические нормализаторы скалярного бандитного вознаграждения в локально-зависимые функции нормализации.'}, 'en': {'title': 'Enhancing Language Models with Segment-Level Rewards in RLHF', 'desc': 'This paper discusses a new approach to Reinforcement Learning from Human Feedback (RLHF) for language models (LMs). It critiques previous methods that treat the task as a bandit problem, which can overlook the sequential nature of text generation and lead to sparse rewards. The authors propose a segment-level reward model that assigns rewards to complete text segments, improving reward assignment. Their method incorporates dynamic text segmentation and enhances training efficiency by using location-aware normalizer functions, showing competitive results on established RLHF benchmarks.'}, 'zh': {'title': '段落级奖励模型:强化学习的新突破', 'desc': '本论文探讨了如何通过人类反馈进行强化学习(RLHF),以使语言模型(LM)更符合人类偏好。以往的RLHF研究通常采用赌博机模型,但这种方法忽视了语言模型生成的序列特性,并可能面临稀疏奖励的问题。我们提出了一种基于段落级奖励模型的方法,为每个语义完整的文本段落分配奖励,从而克服了以往方法的不足。通过动态文本分割和与标准序列偏好数据集的兼容性,我们的方法在多个RLHF基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2501.02393', 'title': 'Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers', 'url': 'https://huggingface.co/papers/2501.02393', 'abstract': "We present an approach to modifying Transformer architectures by integrating graph-aware relational reasoning into the attention mechanism, merging concepts from graph neural networks and language modeling. Building on the inherent connection between attention and graph theory, we reformulate the Transformer's attention mechanism as a graph operation and propose Graph-Aware Isomorphic Attention. This method leverages advanced graph modeling strategies, including Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), to enrich the representation of relational structures. Our approach captures complex dependencies and generalizes across tasks, as evidenced by a reduced generalization gap and improved learning performance. 
Additionally, we expand the concept of graph-aware attention to introduce Sparse GIN-Attention, a fine-tuning approach that employs sparse GINs. By interpreting attention matrices as sparse adjacency graphs, this technique enhances the adaptability of pre-trained foundational models with minimal computational overhead, endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning achieves improved training dynamics and better generalization compared to alternative methods like low-rank adaptation (LoRA). We discuss latent graph-like structures within traditional attention mechanisms, offering a new lens through which Transformers can be understood. By evolving Transformers as hierarchical GIN models for relational reasoning, this perspective suggests profound implications for foundational model development, enabling the design of architectures that dynamically adapt to both local and global dependencies. Applications in bioinformatics, materials science, language modeling, and beyond could benefit from this synthesis of relational and sequential data modeling, setting the stage for interpretable and generalizable modeling strategies.", 'score': 1, 'issue_id': 1563, 'pub_date': '2025-01-04', 'pub_date_card': {'ru': '4 января', 'en': 'January 4', 'zh': '1月4日'}, 'hash': 'a200448c9795e159', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics (LAMM) MIT Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.02393.jpg', 'data': {'categories': ['#graphs', '#architecture', '#interpretability', '#training'], 'emoji': '🕸️', 'ru': {'title': 'Трансформеры эволюционируют в графовые модели для реляционного рассуждения', 'desc': 'Статья представляет новый подход к модификации архитектуры Трансформеров путем интеграции графового реляционного рассуждения в механизм внимания. Авторы переформулируют механизм внимания Трансформера как графовую операцию и предлагают Graph-Aware Isomorphic Attention, используя стратегии моделирования графов, такие как Graph Isomorphism Networks (GIN) и Principal Neighborhood Aggregation (PNA). Метод позволяет улучшить представление реляционных структур, уменьшить разрыв в обобщении и повысить производительность обучения. Также предложен метод тонкой настройки Sparse GIN-Attention, который интерпретирует матрицы внимания как разреженные графы смежности, улучшая адаптивность предобученных моделей.'}, 'en': {'title': 'Transforming Attention: Merging Graphs and Transformers for Enhanced Learning', 'desc': 'This paper introduces a new way to enhance Transformer models by incorporating graph-based reasoning into their attention mechanisms. By treating attention as a graph operation, the authors propose a method called Graph-Aware Isomorphic Attention, which utilizes advanced graph techniques to better capture relationships in data. They also present Sparse GIN-Attention, a fine-tuning method that interprets attention matrices as sparse graphs, improving the adaptability of pre-trained models with less computational cost.
Overall, this approach not only improves learning performance but also opens up new possibilities for applying Transformers in various fields like bioinformatics and language modeling.'}, 'zh': {'title': '图感知注意力:Transformer的新视角', 'desc': '本文提出了一种通过将图感知关系推理整合到注意力机制中来修改Transformer架构的方法。这种方法将Transformer的注意力机制重新表述为图操作,并提出了图感知同构注意力(Graph-Aware Isomorphic Attention)。该方法利用图同构网络(GIN)和主邻域聚合(PNA)等先进的图建模策略,增强了关系结构的表示能力。通过引入稀疏GIN注意力(Sparse GIN-Attention),我们展示了如何在保持计算效率的同时,提升预训练模型的适应性和泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08313', 'title': 'MiniMax-01: Scaling Foundation Models with Lightning Attention', 'url': 'https://huggingface.co/papers/2501.08313', 'abstract': 'We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window. 
We publicly release MiniMax-01 at https://github.com/MiniMax-AI.', 'score': 192, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'a57d7b1914e7383a', 'authors': ['MiniMax', 'Aonian Li', 'Bangwei Gong', 'Bo Yang', 'Boji Shan', 'Chang Liu', 'Cheng Zhu', 'Chunhao Zhang', 'Congchao Guo', 'Da Chen', 'Dong Li', 'Enwei Jiao', 'Gengxin Li', 'Guojun Zhang', 'Haohai Sun', 'Houze Dong', 'Jiadai Zhu', 'Jiaqi Zhuang', 'Jiayuan Song', 'Jin Zhu', 'Jingtao Han', 'Jingyang Li', 'Junbin Xie', 'Junhao Xu', 'Junjie Yan', 'Kaishun Zhang', 'Kecheng Xiao', 'Kexi Kang', 'Le Han', 'Leyang Wang', 'Lianfei Yu', 'Liheng Feng', 'Lin Zheng', 'Linbo Chai', 'Long Xing', 'Meizhi Ju', 'Mingyuan Chi', 'Mozhi Zhang', 'Peikai Huang', 'Pengcheng Niu', 'Pengfei Li', 'Pengyu Zhao', 'Qi Yang', 'Qidi Xu', 'Qiexiang Wang', 'Qin Wang', 'Qiuhui Li', 'Ruitao Leng', 'Shengmin Shi', 'Shuqi Yu', 'Sichen Li', 'Songquan Zhu', 'Tao Huang', 'Tianrun Liang', 'Weigao Sun', 'Weixuan Sun', 'Weiyu Cheng', 'Wenkai Li', 'Xiangjun Song', 'Xiao Su', 'Xiaodong Han', 'Xinjie Zhang', 'Xinzhu Hou', 'Xu Min', 'Xun Zou', 'Xuyang Shen', 'Yan Gong', 'Yingjie Zhu', 'Yipeng Zhou', 'Yiran Zhong', 'Yongyi Hu', 'Yuanxiang Fan', 'Yue Yu', 'Yufeng Yang', 'Yuhao Li', 'Yunan Huang', 'Yunji Li', 'Yunpeng Huang', 'Yunzhi Xu', 'Yuxin Mao', 'Zehan Li', 'Zekang Li', 'Zewei Tao', 'Zewen Ying', 'Zhaoyang Cong', 'Zhen Qin', 'Zhenhua Fan', 'Zhihang Yu', 'Zhuo Jiang', 'Zijia Wu'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08313.jpg', 'data': {'categories': ['#open_source', '#architecture', '#optimization', '#benchmark', '#long_context', '#training'], 'emoji': '🚀', 'ru': {'title': 'MiniMax-01: Революция в обработке длинных контекстов', 'desc': 'Исследователи представили серию моделей MiniMax-01, включая MiniMax-Text-01 и MiniMax-VL-01, которые сравнимы с лучшими моделями, но обладают улучшенными возможностями обработки длинных контекстов. В основе лежит технология lightning attention и ее эффективное масштабирование, интегрированные с Mixture of Experts (MoE). Модель имеет 32 эксперта и 456 миллиардов параметров, из которых 45,9 миллиардов активируются для каждого токена. Контекстное окно MiniMax-Text-01 может достигать 1 миллиона токенов при обучении и экстраполироваться до 4 миллионов токенов при инференсе.'}, 'en': {'title': 'Unleashing Long Contexts with MiniMax-01 Models', 'desc': 'The MiniMax-01 series introduces advanced models, MiniMax-Text-01 and MiniMax-VL-01, designed to handle longer contexts effectively. These models utilize lightning attention and a Mixture of Experts (MoE) architecture, featuring 32 experts and a staggering 456 billion parameters, optimizing the activation of 45.9 billion parameters per token. By implementing efficient parallel strategies and computation-communication overlap techniques, the models can train and infer on extensive datasets, reaching context windows of up to 1 million tokens during training and 4 million during inference. 
Performance evaluations indicate that MiniMax-01 models rival leading models like GPT-4o and Claude-3.5-Sonnet while significantly extending context capabilities.'}, 'zh': {'title': 'MiniMax-01:超长上下文处理的新纪元', 'desc': '我们介绍了MiniMax-01系列,包括MiniMax-Text-01和MiniMax-VL-01,这些模型在处理更长的上下文时具有优越的能力。核心技术是闪电注意力和高效的扩展能力。为了最大化计算能力,我们将其与专家混合模型(MoE)结合,创建了一个拥有32个专家和4560亿参数的模型。我们的实验表明,这些模型在标准和内部基准测试中表现出色,能够与最先进的模型相媲美,同时提供20到32倍更长的上下文窗口。'}}}, {'id': 'https://huggingface.co/papers/2501.08332', 'title': 'MangaNinja: Line Art Colorization with Precise Reference Following', 'url': 'https://huggingface.co/papers/2501.08332', 'abstract': 'Derived from diffusion models, MangaNinjia specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, cross-character colorization, multi-reference harmonization, beyond the reach of existing algorithms.', 'score': 31, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '20ea6b75639e2ced', 'authors': ['Zhiheng Liu', 'Ka Leong Cheng', 'Xi Chen', 'Jie Xiao', 'Hao Ouyang', 'Kai Zhu', 'Yu Liu', 'Yujun Shen', 'Qifeng Chen', 'Ping Luo'], 'affiliations': ['Ant Group', 'HKU', 'HKUST', 'Tongyi Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08332.jpg', 'data': {'categories': ['#cv', '#diffusion', '#benchmark'], 'emoji': '🎨', 'ru': {'title': 'Прецизионное раскрашивание манги с помощью ИИ', 'desc': 'MangaNinjia - это модель для раскрашивания линейных рисунков манги, основанная на диффузионных моделях. Она использует модуль перемешивания патчей для обучения соответствиям между цветным изображением-образцом и целевым линейным рисунком. Модель также включает схему точечного контроля для точного подбора цветов. Эксперименты показывают превосходство MangaNinjia над существующими решениями в точности раскрашивания.'}, 'en': {'title': 'MangaNinjia: Mastering Line Art Colorization with Precision', 'desc': 'MangaNinjia is a model designed for coloring line art by using reference images. It employs a patch shuffling module to help the model learn how to match colors from the reference image to the target line art accurately. Additionally, it features a point-driven control scheme that allows for detailed color adjustments, ensuring that colors are applied precisely. Our experiments show that MangaNinjia outperforms existing methods in colorization tasks, especially in complex scenarios involving multiple references and different characters.'}, 'zh': {'title': 'MangaNinjia:精准上色的新方法', 'desc': 'MangaNinjia 是一种基于扩散模型的参考引导线条艺术上色技术。我们设计了两个模块来确保角色细节的准确转录,包括补丁洗牌模块和点驱动控制方案,以实现精细的颜色匹配。实验结果表明,我们的模型在精确上色方面优于现有解决方案。我们还展示了所提议的交互式点控制在处理复杂案例和多参考协调方面的潜力,超越了现有算法的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.06751', 'title': 'Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models', 'url': 'https://huggingface.co/papers/2501.06751', 'abstract': "Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. 
Typically, these prompts are extended to a fixed length by adding padding tokens before text encoding. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model's output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model's architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems.", 'score': 27, 'issue_id': 1677, 'pub_date': '2025-01-12', 'pub_date_card': {'ru': '12 января', 'en': 'January 12', 'zh': '1月12日'}, 'hash': '05733e8e82e23568', 'authors': ['Michael Toker', 'Ido Galil', 'Hadas Orgad', 'Rinon Gal', 'Yoad Tewel', 'Gal Chechik', 'Yonatan Belinkov'], 'affiliations': ['Bar-Ilan University', 'NVIDIA', 'Technion Israel Institute of Technology'], 'pdf_title_img': 'assets/pdf/title_img/2501.06751.jpg', 'data': {'categories': ['#cv', '#architecture', '#interpretability', '#diffusion', '#training'], 'emoji': '🧩', 'ru': {'title': 'Раскрытие тайн токенов заполнения в генерации изображений', 'desc': 'Исследователи провели первый глубокий анализ роли токенов заполнения в моделях преобразования текста в изображение (T2I). Они разработали две причинно-следственные техники для изучения того, как информация кодируется в представлении токенов в различных компонентах конвейера T2I. Результаты показали три различных сценария влияния токенов заполнения на процесс генерации изображений. Исследование выявило ключевые взаимосвязи между этими сценариями и архитектурой модели, а также процессом ее обучения.'}, 'en': {'title': 'Unpacking Padding: The Hidden Role in Text-to-Image Models', 'desc': "This paper explores the impact of padding tokens in text-to-image (T2I) diffusion models, which are used to generate images from text prompts. The authors analyze how these padding tokens influence the image generation process at different stages, including text encoding and the diffusion process. They identify three scenarios where padding tokens can either affect the output or be ignored, depending on the model's architecture and training methods. The findings provide valuable insights that could guide future improvements in T2I model design and training practices."}, 'zh': {'title': '填充标记在图像生成中的关键作用', 'desc': '本文研究了文本到图像(T2I)扩散模型中填充标记的作用。填充标记通常用于将提示扩展到固定长度,但其对图像生成过程的影响尚未被深入探讨。我们开发了两种因果分析技术,探讨填充标记在T2I模型不同组件中的信息编码方式。研究结果表明,填充标记在文本编码、扩散过程中的影响各不相同,并与模型架构和训练过程存在重要关系。'}}}, {'id': 'https://huggingface.co/papers/2501.08316', 'title': 'Diffusion Adversarial Post-Training for One-Step Video Generation', 'url': 'https://huggingface.co/papers/2501.08316', 'abstract': 'Diffusion models are widely used for image and video generation, but their iterative generation process is slow and expensive. 
While existing distillation approaches have demonstrated the potential for one-step generation in the image domain, they still suffer from significant quality degradation. In this work, we propose Adversarial Post-Training (APT) against real data following diffusion pre-training for one-step video generation. To improve the training stability and quality, we introduce several improvements to the model architecture and training procedures, along with an approximated R1 regularization objective. Empirically, our experiments show that our adversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720, 24fps videos in real time using a single forward evaluation step. Additionally, our model is capable of generating 1024px images in a single step, achieving quality comparable to state-of-the-art methods.', 'score': 19, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '4122a780e8356ce7', 'authors': ['Shanchuan Lin', 'Xin Xia', 'Yuxi Ren', 'Ceyuan Yang', 'Xuefeng Xiao', 'Lu Jiang'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.08316.jpg', 'data': {'categories': ['#architecture', '#optimization', '#video', '#diffusion', '#training'], 'emoji': '🎬', 'ru': {'title': 'Революция в генерации видео: от итераций к мгновенному результату', 'desc': 'Эта статья представляет новый метод под названием Adversarial Post-Training (APT) для одношаговой генерации видео. Авторы предлагают улучшения архитектуры модели и процедур обучения, включая аппроксимированную регуляризацию R1. Их модель Seaweed-APT способна генерировать 2-секундные видео высокого разрешения в реальном времени за один проход. Кроме того, модель может создавать изображения размером 1024px за один шаг, достигая качества, сравнимого с современными методами.'}, 'en': {'title': 'Fast and High-Quality Video Generation with Seaweed-APT', 'desc': 'This paper addresses the slow and costly iterative process of generating images and videos using diffusion models. The authors introduce Adversarial Post-Training (APT) to enhance one-step video generation while maintaining high quality. They implement architectural and procedural improvements, including an approximated R1 regularization, to stabilize training. Their model, Seaweed-APT, successfully generates high-quality 2-second videos and 1024px images in real time with a single forward evaluation step.'}, 'zh': {'title': '对抗后训练:快速高质量视频生成的新方法', 'desc': '扩散模型广泛应用于图像和视频生成,但其迭代生成过程较慢且成本高昂。现有的蒸馏方法在图像领域展示了单步生成的潜力,但仍存在显著的质量下降。本文提出了一种针对真实数据的对抗后训练(APT)方法,以实现单步视频生成。我们的实验表明,经过对抗后训练的模型Seaweed-APT能够实时生成1280x720、24fps的2秒视频,并且在单步生成1024px图像时,其质量可与最先进的方法相媲美。'}}}, {'id': 'https://huggingface.co/papers/2501.08187', 'title': 'A Multi-Modal AI Copilot for Single-Cell Analysis with Instruction Following', 'url': 'https://huggingface.co/papers/2501.08187', 'abstract': 'Large language models excel at interpreting complex natural language instructions, enabling them to perform a wide range of tasks. In the life sciences, single-cell RNA sequencing (scRNA-seq) data serves as the "language of cellular biology", capturing intricate gene expression patterns at the single-cell level. However, interacting with this "language" through conventional tools is often inefficient and unintuitive, posing challenges for researchers. To address these limitations, we present InstructCell, a multi-modal AI copilot that leverages natural language as a medium for more direct and flexible single-cell analysis. 
We construct a comprehensive multi-modal instruction dataset that pairs text-based instructions with scRNA-seq profiles from diverse tissues and species. Building on this, we develop a multi-modal cell language architecture capable of simultaneously interpreting and processing both modalities. InstructCell empowers researchers to accomplish critical tasks-such as cell type annotation, conditional pseudo-cell generation, and drug sensitivity prediction-using straightforward natural language commands. Extensive evaluations demonstrate that InstructCell consistently meets or exceeds the performance of existing single-cell foundation models, while adapting to diverse experimental conditions. More importantly, InstructCell provides an accessible and intuitive tool for exploring complex single-cell data, lowering technical barriers and enabling deeper biological insights.', 'score': 18, 'issue_id': 1672, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'de984ce7cc62fa5e', 'authors': ['Yin Fang', 'Xinle Deng', 'Kangwei Liu', 'Ningyu Zhang', 'Jingyang Qian', 'Penghui Yang', 'Xiaohui Fan', 'Huajun Chen'], 'affiliations': ['College of Computer Science and Technology, Zhejiang University, Hangzhou 310027, China', 'College of Pharmaceutical Sciences, Zhejiang University, Hangzhou 310058, China', 'Future Health Laboratory, Innovation Center of Yangtze River Delta, Zhejiang University, Jiaxing 314100, China', 'Innovation Center in Zhejiang University, State Key Laboratory of Component-Based Chinese Medicine, Hangzhou 310058, China', 'School of Software Technology, Zhejiang University, Ningbo 315048, China', 'ZJU-Hangzhou Global Scientific and Technological Innovation Center, Hangzhou 311200, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08187.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#dataset', '#science', '#healthcare'], 'emoji': '🧬', 'ru': {'title': 'Естественный язык как ключ к расшифровке клеточной биологии', 'desc': 'InstructCell - это мультимодальный ИИ-помощник для анализа данных одноклеточного РНК-секвенирования (scRNA-seq). Он использует архитектуру, способную интерпретировать как естественный язык, так и профили экспрессии генов. InstructCell позволяет исследователям выполнять такие задачи, как аннотация типов клеток и предсказание чувствительности к лекарствам, с помощью простых текстовых команд. Модель демонстрирует высокую производительность и адаптивность к различным экспериментальным условиям.'}, 'en': {'title': 'InstructCell: Bridging Language and Biology for Seamless Single-Cell Analysis', 'desc': 'This paper introduces InstructCell, an AI tool designed to simplify the analysis of single-cell RNA sequencing (scRNA-seq) data using natural language instructions. By creating a dataset that links text commands with scRNA-seq profiles, InstructCell allows researchers to perform complex tasks like cell type annotation and drug sensitivity prediction more intuitively. The model employs a multi-modal architecture that processes both text and biological data simultaneously, enhancing its usability. 
Evaluations show that InstructCell outperforms existing models, making single-cell analysis more accessible and efficient for researchers in the life sciences.'}, 'zh': {'title': '用自然语言解锁单细胞数据的潜力', 'desc': '这篇论文介绍了InstructCell,一个多模态的人工智能助手,旨在通过自然语言简化单细胞RNA测序(scRNA-seq)数据的分析。传统工具在处理细胞生物学的复杂数据时效率低下,而InstructCell通过将文本指令与scRNA-seq数据结合,提供了更直接和灵活的分析方式。该系统能够执行细胞类型注释、条件伪细胞生成和药物敏感性预测等关键任务,且使用简单的自然语言命令即可完成。评估结果表明,InstructCell在性能上优于现有的单细胞基础模型,同时适应多种实验条件,降低了技术门槛,促进了生物学的深入理解。'}}}, {'id': 'https://huggingface.co/papers/2501.08225', 'title': 'FramePainter: Endowing Interactive Image Editing with Video Diffusion Priors', 'url': 'https://huggingface.co/papers/2501.08225', 'abstract': 'Interactive image editing allows users to modify images through visual interaction operations such as drawing, clicking, and dragging. Existing methods construct such supervision signals from videos, as they capture how objects change with various physical interactions. However, these models are usually built upon text-to-image diffusion models, and thus necessitate (i) massive training samples and (ii) an additional reference encoder to learn real-world dynamics and visual consistency. In this paper, we reformulate this task as an image-to-video generation problem, so as to inherit powerful video diffusion priors to reduce training costs and ensure temporal consistency. Specifically, we introduce FramePainter as an efficient instantiation of this formulation. Initialized with Stable Video Diffusion, it only uses a lightweight sparse control encoder to inject editing signals. Considering the limitations of temporal attention in handling large motion between two frames, we further propose matching attention to enlarge the receptive field while encouraging dense correspondence between edited and source image tokens. We highlight the effectiveness and efficiency of FramePainter across various editing signals: it dominantly outperforms previous state-of-the-art methods with far less training data, achieving highly seamless and coherent editing of images, e.g., automatically adjusting the reflection of the cup. Moreover, FramePainter also exhibits exceptional generalization in scenarios not present in real-world videos, e.g., transforming the clownfish into a shark-like shape. Our code will be available at https://github.com/YBYBZhang/FramePainter.', 'score': 12, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '811cfd0f18eb1e53', 'authors': ['Yabo Zhang', 'Xinpeng Zhou', 'Yihan Zeng', 'Hang Xu', 'Hui Li', 'Wangmeng Zuo'], 'affiliations': ['Harbin Institute of Technology', 'Huawei Noahs Ark Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.08225.jpg', 'data': {'categories': ['#video', '#cv', '#optimization', '#diffusion'], 'emoji': '🎨', 'ru': {'title': 'FramePainter: эффективное редактирование изображений через генерацию видео', 'desc': 'Статья представляет FramePainter - новый подход к интерактивному редактированию изображений, основанный на генерации видео. В отличие от существующих методов, использующих модели диффузии текст-изображение, FramePainter опирается на мощные видео-диффузионные модели для обеспечения временной согласованности и снижения затрат на обучение. Метод использует легковесный энкодер для внедрения сигналов редактирования и вводит механизм согласованного внимания для улучшения обработки крупных движений между кадрами. 
FramePainter превосходит современные методы, требуя значительно меньше обучающих данных и демонстрируя высокую обобщающую способность.'}, 'en': {'title': 'Revolutionizing Image Editing with Efficient Video Diffusion', 'desc': 'This paper presents FramePainter, a novel approach to interactive image editing that reformulates the task as image-to-video generation. By leveraging video diffusion models, FramePainter reduces the need for extensive training data while ensuring temporal consistency in edited images. It utilizes a lightweight sparse control encoder to effectively incorporate editing signals, and introduces matching attention to improve the handling of large motion between frames. The results demonstrate that FramePainter significantly outperforms existing methods, achieving seamless image edits and showcasing strong generalization capabilities.'}, 'zh': {'title': 'FramePainter:高效的图像编辑新方法', 'desc': '本文提出了一种交互式图像编辑的新方法,称为FramePainter。该方法将图像编辑任务重新定义为图像到视频的生成问题,从而利用强大的视频扩散先验,降低训练成本并确保时间一致性。FramePainter使用轻量级的稀疏控制编码器来注入编辑信号,并通过匹配注意力机制增强了对大运动的处理能力。实验结果表明,FramePainter在各种编辑信号下表现优异,能够实现无缝且连贯的图像编辑,且在未见过的场景中也展现出卓越的泛化能力。'}}}, {'id': 'https://huggingface.co/papers/2501.08326', 'title': 'Omni-RGPT: Unifying Image and Video Region-level Understanding via Token Marks', 'url': 'https://huggingface.co/papers/2501.08326', 'abstract': 'We present Omni-RGPT, a multimodal large language model designed to facilitate region-level comprehension for both images and videos. To achieve consistent region representation across spatio-temporal dimensions, we introduce Token Mark, a set of tokens highlighting the target regions within the visual feature space. These tokens are directly embedded into spatial regions using region prompts (e.g., boxes or masks) and simultaneously incorporated into the text prompt to specify the target, establishing a direct connection between visual and text tokens. To further support robust video understanding without requiring tracklets, we introduce an auxiliary task that guides Token Mark by leveraging the consistency of the tokens, enabling stable region interpretation across the video. Additionally, we introduce a large-scale region-level video instruction dataset (RegVID-300k). Omni-RGPT achieves state-of-the-art results on image and video-based commonsense reasoning benchmarks while showing strong performance in captioning and referring expression comprehension tasks.', 'score': 11, 'issue_id': 1678, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '463580cacfaa6789', 'authors': ['Miran Heo', 'Min-Hung Chen', 'De-An Huang', 'Sifei Liu', 'Subhashree Radhakrishnan', 'Seon Joo Kim', 'Yu-Chiang Frank Wang', 'Ryo Hachiuma'], 'affiliations': ['NVIDIA', 'Yonsei University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08326.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agi', '#cv', '#dataset', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Omni-RGPT: Новый уровень понимания изображений и видео искусственным интеллектом', 'desc': 'Omni-RGPT - это мультимодальная большая языковая модель, разработанная для понимания изображений и видео на уровне регионов. Модель использует технологию Token Mark для выделения целевых регионов в визуальном пространстве признаков. Для улучшения понимания видео без необходимости трекинга объектов введена вспомогательная задача, использующая согласованность токенов. 
Авторы также представили большой набор данных RegVID-300k для обучения на видео с инструкциями на уровне регионов.'}, 'en': {'title': 'Omni-RGPT: Bridging Visual and Textual Understanding with Token Mark', 'desc': 'Omni-RGPT is a multimodal large language model that enhances understanding of specific regions in images and videos. It uses a novel approach called Token Mark, which embeds tokens into visual features to highlight target areas, linking them with text prompts. This model also includes an auxiliary task that ensures consistent token representation across video frames, improving video comprehension. With the introduction of the RegVID-300k dataset, Omni-RGPT sets new benchmarks in commonsense reasoning, captioning, and referring expression tasks.'}, 'zh': {'title': 'Omni-RGPT:图像与视频的区域理解新突破', 'desc': '本文介绍了Omni-RGPT,这是一种多模态的大型语言模型,旨在促进图像和视频的区域级理解。为了在时空维度上实现一致的区域表示,我们引入了Token Mark,这是一组突出视觉特征空间中目标区域的标记。通过使用区域提示(如框或掩码),这些标记被直接嵌入到空间区域中,并同时与文本提示结合,以指定目标,从而建立视觉和文本标记之间的直接联系。此外,我们还引入了一个辅助任务,通过利用标记的一致性来指导Token Mark,从而支持稳健的视频理解。'}}}, {'id': 'https://huggingface.co/papers/2501.07730', 'title': 'Democratizing Text-to-Image Masked Generative Models with Compact Text-Aware One-Dimensional Tokens', 'url': 'https://huggingface.co/papers/2501.07730', 'abstract': 'Image tokenizers form the foundation of modern text-to-image generative models but are notoriously difficult to train. Furthermore, most existing text-to-image models rely on large-scale, high-quality private datasets, making them challenging to replicate. In this work, we introduce Text-Aware Transformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful image tokenizer that can utilize either discrete or continuous 1-dimensional tokens. TA-TiTok uniquely integrates textual information during the tokenizer decoding stage (i.e., de-tokenization), accelerating convergence and enhancing performance. TA-TiTok also benefits from a simplified, yet effective, one-stage training process, eliminating the need for the complex two-stage distillation used in previous 1-dimensional tokenizers. This design allows for seamless scalability to large datasets. Building on this, we introduce a family of text-to-image Masked Generative Models (MaskGen), trained exclusively on open data while achieving comparable performance to models trained on private data. We aim to release both the efficient, strong TA-TiTok tokenizers and the open-data, open-weight MaskGen models to promote broader access and democratize the field of text-to-image masked generative models.', 'score': 10, 'issue_id': 1673, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': '80f40715084c602b', 'authors': ['Dongwon Kim', 'Ju He', 'Qihang Yu', 'Chenglin Yang', 'Xiaohui Shen', 'Suha Kwak', 'Liang-Chieh Chen'], 'affiliations': ['ByteDance Seed', 'POSTECH'], 'pdf_title_img': 'assets/pdf/title_img/2501.07730.jpg', 'data': {'categories': ['#dataset', '#data', '#training', '#cv', '#open_source'], 'emoji': '🖼️', 'ru': {'title': 'Демократизация генерации изображений с помощью эффективной токенизации и открытых данных', 'desc': 'В этой статье представлен новый подход к токенизации изображений для генеративных моделей текст-в-изображение под названием TA-TiTok. Данный токенизатор использует одномерные токены и интегрирует текстовую информацию на этапе детокенизации, что ускоряет сходимость и улучшает производительность. На основе TA-TiTok авторы разработали семейство моделей MaskGen, обученных исключительно на открытых данных. 
Целью работы является демократизация области генеративных моделей текст-в-изображение путем публикации эффективных токенизаторов и моделей с открытыми весами.'}, 'en': {'title': 'Democratizing Text-to-Image Generation with TA-TiTok', 'desc': 'This paper presents TA-TiTok, a novel image tokenizer designed for text-to-image generative models, which simplifies the training process and improves performance. Unlike traditional models that require large private datasets, TA-TiTok can effectively utilize open data, making it more accessible for researchers. The tokenizer incorporates textual information during the decoding stage, which helps it learn faster and perform better. Additionally, the authors introduce MaskGen, a family of generative models that leverage TA-TiTok and are trained on publicly available datasets, aiming to democratize access to advanced text-to-image generation technology.'}, 'zh': {'title': '高效的文本到图像生成模型,推动开放数据的使用', 'desc': '本文介绍了一种新的图像标记器,称为TA-TiTok,它可以有效地处理文本到图像的生成任务。TA-TiTok在解码阶段整合了文本信息,从而加快了模型的收敛速度并提高了性能。与以往的标记器不同,TA-TiTok采用了一种简化的一阶段训练过程,避免了复杂的两阶段蒸馏过程。我们还提出了一系列基于开放数据训练的文本到图像生成模型MaskGen,旨在促进更广泛的访问和民主化。'}}}, {'id': 'https://huggingface.co/papers/2501.05131', 'title': '3DIS-FLUX: simple and efficient multi-instance generation with DiT rendering', 'url': 'https://huggingface.co/papers/2501.05131', 'abstract': "The growing demand for controllable outputs in text-to-image generation has driven significant advancements in multi-instance generation (MIG), enabling users to define both instance layouts and attributes. Currently, the state-of-the-art methods in MIG are primarily adapter-based. However, these methods necessitate retraining a new adapter each time a more advanced model is released, resulting in significant resource consumption. A methodology named Depth-Driven Decoupled Instance Synthesis (3DIS) has been introduced, which decouples MIG into two distinct phases: 1) depth-based scene construction and 2) detail rendering with widely pre-trained depth control models. The 3DIS method requires adapter training solely during the scene construction phase, while enabling various models to perform training-free detail rendering. Initially, 3DIS focused on rendering techniques utilizing U-Net architectures such as SD1.5, SD2, and SDXL, without exploring the potential of recent DiT-based models like FLUX. In this paper, we present 3DIS-FLUX, an extension of the 3DIS framework that integrates the FLUX model for enhanced rendering capabilities. Specifically, we employ the FLUX.1-Depth-dev model for depth map controlled image generation and introduce a detail renderer that manipulates the Attention Mask in FLUX's Joint Attention mechanism based on layout information. This approach allows for the precise rendering of fine-grained attributes of each instance. Our experimental results indicate that 3DIS-FLUX, leveraging the FLUX model, outperforms the original 3DIS method, which utilized SD2 and SDXL, and surpasses current state-of-the-art adapter-based methods in terms of both performance and image quality. 
Project Page: https://limuloo.github.io/3DIS/.", 'score': 9, 'issue_id': 1684, 'pub_date': '2025-01-09', 'pub_date_card': {'ru': '9 января', 'en': 'January 9', 'zh': '1月9日'}, 'hash': 'ca5ad23cb146f3aa', 'authors': ['Dewei Zhou', 'Ji Xie', 'Zongxin Yang', 'Yi Yang'], 'affiliations': ['DBMI, HMS, Harvard University', 'RELER, CCAI, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.05131.jpg', 'data': {'categories': ['#cv', '#games', '#architecture', '#multimodal', '#optimization'], 'emoji': '🎨', 'ru': {'title': '3DIS-FLUX: Новый уровень контролируемой генерации мульти-объектных изображений', 'desc': 'Статья представляет метод 3DIS-FLUX для управляемой генерации изображений с несколькими объектами. Этот подход разделяет процесс на создание сцены на основе глубины и детализированный рендеринг с использованием предобученных моделей контроля глубины. 3DIS-FLUX интегрирует модель FLUX для улучшенного рендеринга, манипулируя маской внимания в механизме совместного внимания FLUX. Эксперименты показывают, что 3DIS-FLUX превосходит предыдущие методы по производительности и качеству изображений.'}, 'en': {'title': 'Enhancing Text-to-Image Generation with 3DIS-FLUX', 'desc': 'This paper introduces a new method called 3DIS-FLUX for improving text-to-image generation by enhancing the multi-instance generation (MIG) process. The 3DIS framework separates the generation into two phases: constructing the scene based on depth and rendering details using pre-trained models. By integrating the FLUX model, the method allows for better control over the rendering of fine details while reducing the need for retraining adapters. Experimental results show that 3DIS-FLUX outperforms previous methods in both performance and image quality, making it a significant advancement in controllable image generation.'}, 'zh': {'title': '深度驱动解耦实例合成:提升图像生成的可控性与质量', 'desc': '随着对可控文本到图像生成输出的需求增加,多实例生成(MIG)技术得到了显著进展。现有的MIG方法主要基于适配器,但每次新模型发布时都需要重新训练适配器,消耗大量资源。本文提出了一种名为深度驱动解耦实例合成(3DIS)的方法,将MIG分为两个阶段:基于深度的场景构建和细节渲染。通过引入FLUX模型,3DIS-FLUX在细节渲染方面实现了更高的性能和图像质量。'}}}, {'id': 'https://huggingface.co/papers/2501.08328', 'title': 'PokerBench: Training Large Language Models to become Professional Poker Players', 'url': 'https://huggingface.co/papers/2501.08328', 'abstract': 'We introduce PokerBench - a benchmark for evaluating the poker-playing abilities of large language models (LLMs). As LLMs excel in traditional NLP tasks, their application to complex, strategic games like poker poses a new challenge. Poker, an incomplete information game, demands a multitude of skills such as mathematics, reasoning, planning, strategy, and a deep understanding of game theory and human psychology. This makes Poker the ideal next frontier for large language models. PokerBench consists of a comprehensive compilation of 11,000 most important scenarios, split between pre-flop and post-flop play, developed in collaboration with trained poker players. We evaluate prominent models including GPT-4, ChatGPT 3.5, and various Llama and Gemma series models, finding that all state-of-the-art LLMs underperform in playing optimal poker. However, after fine-tuning, these models show marked improvements. We validate PokerBench by having models with different scores compete with each other, demonstrating that higher scores on PokerBench lead to higher win rates in actual poker games. 
Through gameplay between our fine-tuned model and GPT-4, we also identify limitations of simple supervised fine-tuning for learning optimal playing strategy, suggesting the need for more advanced methodologies for effectively training language models to excel in games. PokerBench thus presents a unique benchmark for a quick and reliable evaluation of the poker-playing ability of LLMs as well as a comprehensive benchmark to study the progress of LLMs in complex game-playing scenarios. The dataset and code will be made available at: https://github.com/pokerllm/pokerbench.', 'score': 9, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '7b4dacedffdbfa15', 'authors': ['Richard Zhuang', 'Akshat Gupta', 'Richard Yang', 'Aniket Rahane', 'Zhengyu Li', 'Gopala Anumanchipalli'], 'affiliations': ['Georgia Institute of Technology', 'University of California, Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.08328.jpg', 'data': {'categories': ['#training', '#reasoning', '#games', '#optimization', '#benchmark'], 'emoji': '🃏', 'ru': {'title': 'PokerBench: новый рубеж для оценки стратегических способностей языковых моделей', 'desc': 'PokerBench - это новый бенчмарк для оценки способностей больших языковых моделей (LLM) играть в покер. Он включает 11000 важнейших сценариев игры, разработанных совместно с профессиональными игроками. Авторы оценили производительность современных LLM, таких как GPT-4 и ChatGPT 3.5, обнаружив, что все модели показывают результаты ниже оптимальных. После дообучения модели демонстрируют значительное улучшение, но авторы отмечают ограничения простого обучения с учителем для освоения оптимальной стратегии игры.'}, 'en': {'title': 'PokerBench: Elevating LLMs to Master the Game of Poker', 'desc': 'PokerBench is a new benchmark designed to assess the poker-playing skills of large language models (LLMs). It focuses on the unique challenges of poker, which requires a blend of mathematical skills, strategic reasoning, and an understanding of human psychology. The benchmark includes 11,000 scenarios that cover various aspects of the game, and it has been tested on several leading models, revealing that they initially struggle with optimal poker play. However, after fine-tuning, these models show significant improvement, highlighting the need for advanced training techniques to enhance their performance in complex games.'}, 'zh': {'title': 'PokerBench:评估语言模型扑克能力的新基准', 'desc': '我们介绍了PokerBench,这是一个用于评估大型语言模型(LLMs)扑克游戏能力的基准。扑克是一种不完全信息游戏,需要数学、推理、规划、策略以及对博弈论和人类心理的深刻理解。PokerBench包含11,000个重要场景,分为翻牌前和翻牌后游戏,经过训练的扑克玩家共同开发。通过对不同模型的评估,我们发现尽管当前的LLMs在扑克游戏中表现不佳,但经过微调后,它们的表现有显著提升。'}}}, {'id': 'https://huggingface.co/papers/2501.08319', 'title': 'Enhancing Automated Interpretability with Output-Centric Feature Descriptions', 'url': 'https://huggingface.co/papers/2501.08319', 'abstract': 'Automated interpretability pipelines generate natural language descriptions for the concepts represented by features in large language models (LLMs), such as plants or the first word in a sentence. These descriptions are derived using inputs that activate the feature, which may be a dimension or a direction in the model\'s representation space. However, identifying activating inputs is costly, and the mechanistic role of a feature in model behavior is determined both by how inputs cause a feature to activate and by how feature activation affects outputs. 
Using steering evaluations, we reveal that current pipelines provide descriptions that fail to capture the causal effect of the feature on outputs. To fix this, we propose efficient, output-centric methods for automatically generating feature descriptions. These methods use the tokens weighted higher after feature stimulation or the highest weight tokens after applying the vocabulary "unembedding" head directly to the feature. Our output-centric descriptions better capture the causal effect of a feature on model outputs than input-centric descriptions, but combining the two leads to the best performance on both input and output evaluations. Lastly, we show that output-centric descriptions can be used to find inputs that activate features previously thought to be "dead".', 'score': 7, 'issue_id': 1677, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '22615e3bb16f93af', 'authors': ['Yoav Gur-Arieh', 'Roy Mayan', 'Chen Agassy', 'Atticus Geiger', 'Mor Geva'], 'affiliations': ['Blavatnik School of Computer Science and AI, Tel Aviv University', 'Pr(Ai)2R Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.08319.jpg', 'data': {'categories': ['#interpretability', '#inference', '#training', '#data'], 'emoji': '🔍', 'ru': {'title': 'Взгляд изнутри: новый метод интерпретации больших языковых моделей', 'desc': 'Статья описывает новый подход к автоматической интерпретации нейронных сетей, фокусируясь на выходных данных модели вместо входных. Авторы предлагают эффективные методы для генерации описаний признаков, основанные на токенах с наибольшим весом после стимуляции признака. Эксперименты показывают, что ориентированные на выход описания лучше отражают причинно-следственное влияние признака на результаты модели. Комбинация подходов, ориентированных на вход и выход, дает наилучшие результаты в оценке как входных, так и выходных данных.'}, 'en': {'title': 'Unlocking Feature Interpretability in Language Models', 'desc': 'This paper discusses how automated interpretability pipelines can create natural language descriptions for features in large language models (LLMs). It highlights the challenge of identifying inputs that activate these features, which is essential for understanding their role in model behavior. The authors propose new methods that focus on the output effects of features, leading to more accurate descriptions of their causal impact. By combining both input-centric and output-centric approaches, the proposed methods improve the overall interpretability of LLMs and can even identify previously overlooked features.'}, 'zh': {'title': '以输出为中心的特征描述生成方法', 'desc': '这篇论文讨论了自动化可解释性管道如何为大型语言模型中的特征生成自然语言描述。特征的描述是通过激活特征的输入生成的,但识别这些输入的过程成本高昂。研究表明,现有的描述方法未能有效捕捉特征对输出的因果影响。为此,作者提出了一种以输出为中心的方法,能够更好地生成特征描述,并结合输入和输出的评估来提高性能。'}}}, {'id': 'https://huggingface.co/papers/2501.08197', 'title': 'OpenCSG Chinese Corpus: A Series of High-quality Chinese Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08197', 'abstract': 'Large language models (LLMs) have demonstrated remarkable capabilities, but their success heavily relies on the quality of pretraining corpora. For Chinese LLMs, the scarcity of high-quality Chinese datasets presents a significant challenge, often limiting their performance. To address this issue, we propose the OpenCSG Chinese Corpus, a series of high-quality datasets specifically designed for LLM pretraining, post-training, and fine-tuning. 
This corpus includes Fineweb-edu-chinese, Fineweb-edu-chinese-v2, Cosmopedia-chinese, and Smoltalk-chinese, each with distinct characteristics: Fineweb-edu datasets focus on filtered, high-quality content derived from diverse Chinese web sources; Cosmopedia-chinese provides synthetic, textbook-style data for knowledge-intensive training; and Smoltalk-chinese emphasizes stylistic and diverse chat-format data. The OpenCSG Chinese Corpus is characterized by its high-quality text, diverse coverage across domains, and scalable, reproducible data curation processes. Additionally, we conducted extensive experimental analyses, including evaluations on smaller parameter models, which demonstrated significant performance improvements in tasks such as C-Eval, showcasing the effectiveness of the corpus for training Chinese LLMs.', 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '27267ae1a569051c', 'authors': ['Yijiong Yu', 'Ziyun Dai', 'Zekun Wang', 'Wei Wang', 'Ran Chen', 'Ji Pei'], 'affiliations': ['OpenCSG', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08197.jpg', 'data': {'categories': ['#data', '#open_source', '#dataset', '#synthetic', '#training', '#low_resource'], 'emoji': '🐉', 'ru': {'title': 'Прорыв в обучении китайских языковых моделей: OpenCSG Chinese Corpus', 'desc': 'Эта статья представляет OpenCSG Chinese Corpus - набор высококачественных китайских датасетов для предобучения, пост-обучения и тонкой настройки больших языковых моделей (LLM). Корпус включает в себя несколько датасетов, каждый с уникальными характеристиками: от отфильтрованного веб-контента до синтетических учебных данных и разговорных форматов. Авторы подчеркивают высокое качество текста, разнообразие тематик и масштабируемость процесса сбора данных. Эксперименты показали значительное улучшение производительности моделей на различных задачах, включая C-Eval.'}, 'en': {'title': 'Empowering Chinese LLMs with OpenCSG Corpus', 'desc': 'This paper introduces the OpenCSG Chinese Corpus, a collection of high-quality datasets aimed at improving the performance of Chinese large language models (LLMs). The corpus includes several datasets, each tailored for different training needs: Fineweb-edu datasets focus on high-quality web content, Cosmopedia-chinese offers synthetic textbook-style data, and Smoltalk-chinese provides diverse chat-format data. The authors highlight the importance of quality pretraining data for LLMs and demonstrate through experiments that using this corpus leads to significant performance gains in various evaluation tasks. Overall, the OpenCSG Chinese Corpus addresses the challenge of limited high-quality datasets for Chinese LLMs, promoting better training outcomes.'}, 'zh': {'title': '提升中文LLM性能的高质量语料库', 'desc': '大型语言模型(LLMs)在处理自然语言方面表现出色,但其成功依赖于高质量的预训练语料库。针对中文LLMs,优质中文数据集的稀缺性成为了一个重大挑战,限制了它们的性能。为了解决这个问题,我们提出了OpenCSG中文语料库,这是一系列专门为LLM预训练、后训练和微调设计的高质量数据集。该语料库包括Fineweb-edu-chinese、Fineweb-edu-chinese-v2、Cosmopedia-chinese和Smoltalk-chinese,涵盖了多样化的内容和风格,显著提升了中文LLMs的训练效果。'}}}, {'id': 'https://huggingface.co/papers/2501.08167', 'title': 'Potential and Perils of Large Language Models as Judges of Unstructured Textual Data', 'url': 'https://huggingface.co/papers/2501.08167', 'abstract': "Rapid advancements in large language models have unlocked remarkable capabilities when it comes to processing and summarizing unstructured text data. 
This has implications for the analysis of rich, open-ended datasets, such as survey responses, where LLMs hold the promise of efficiently distilling key themes and sentiments. However, as organizations increasingly turn to these powerful AI systems to make sense of textual feedback, a critical question arises: can we trust LLMs to accurately represent the perspectives contained within these text-based datasets? While LLMs excel at generating human-like summaries, there is a risk that their outputs may inadvertently diverge from the true substance of the original responses. Discrepancies between the LLM-generated outputs and the actual themes present in the data could lead to flawed decision-making, with far-reaching consequences for organizations. This research investigates the effectiveness of LLMs as judge models to evaluate the thematic alignment of summaries generated by other LLMs. We utilized an Anthropic Claude model to generate thematic summaries from open-ended survey responses, with Amazon's Titan Express, Nova Pro, and Meta's Llama serving as LLM judges. The LLM-as-judge approach was compared to human evaluations using Cohen's kappa, Spearman's rho, and Krippendorff's alpha, validating a scalable alternative to traditional human-centric evaluation methods. Our findings reveal that while LLMs as judges offer a scalable solution comparable to human raters, humans may still excel at detecting subtle, context-specific nuances. This research contributes to the growing body of knowledge on AI-assisted text analysis. We discuss limitations and provide recommendations for future research, emphasizing the need for careful consideration when generalizing LLM judge models across various contexts and use cases.", 'score': 5, 'issue_id': 1675, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '866161709624c632', 'authors': ['Rewina Bedemariam', 'Natalie Perez', 'Sreyoshi Bhaduri', 'Satya Kapoor', 'Alex Gil', 'Elizabeth Conjar', 'Ikkei Itoku', 'David Theil', 'Aman Chadha', 'Naumaan Nayyar'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08167.jpg', 'data': {'categories': ['#data', '#dataset', '#science', '#ethics', '#multimodal', '#benchmark', '#interpretability'], 'emoji': '🤖', 'ru': {'title': 'LLM как судьи: масштабируемая альтернатива человеческим оценкам в анализе текста', 'desc': 'Исследование посвящено использованию больших языковых моделей (LLM) для анализа неструктурированных текстовых данных, таких как ответы на опросы. Авторы изучают эффективность применения LLM в качестве судей для оценки тематического соответствия сгенерированных другими LLM резюме. Результаты показывают, что LLM-судьи предлагают масштабируемое решение, сопоставимое с оценками людей, хотя люди все еще могут превосходить их в обнаружении тонких, контекстно-зависимых нюансов. Исследование вносит вклад в растущий объем знаний об анализе текста с помощью искусственного интеллекта.'}, 'en': {'title': 'Trusting AI: Evaluating LLMs for Accurate Text Analysis', 'desc': 'This paper explores the use of large language models (LLMs) for summarizing and analyzing unstructured text data, particularly from open-ended survey responses. It raises concerns about the trustworthiness of LLM-generated summaries, as they may not accurately reflect the original sentiments and themes present in the data. 
The research introduces an LLM-as-judge framework, where one LLM generates summaries while others evaluate their thematic alignment, comparing this method to human evaluations. The findings suggest that while LLMs can provide a scalable alternative to human raters, they may struggle with detecting subtle nuances that humans can identify, highlighting the importance of careful application in different contexts.'}, 'zh': {'title': '信任大型语言模型的总结能力吗?', 'desc': '这篇论文探讨了大型语言模型(LLMs)在处理和总结非结构化文本数据方面的能力,尤其是在分析开放式调查反馈时的应用。研究表明,虽然LLMs能够生成类似人类的总结,但它们的输出可能与原始文本的真实主题存在偏差,这可能导致错误的决策。为了评估LLMs生成的总结与实际主题的一致性,研究使用了LLMs作为评判模型,并与人类评估进行了比较。结果显示,LLMs作为评判者提供了一种可扩展的解决方案,但人类在捕捉细微的上下文特征方面仍然表现更佳。'}}}, {'id': 'https://huggingface.co/papers/2501.07888', 'title': 'Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding', 'url': 'https://huggingface.co/papers/2501.07888', 'abstract': 'We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) designed for generating detailed and accurate video descriptions, while also exhibiting superior general video understanding capabilities. Tarsier2 achieves significant advancements through three key upgrades: (1) Scaling pre-training data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) Performing fine-grained temporal alignment during supervised fine-tuning; (3) Using model-based sampling to automatically construct preference data and applying DPO training for optimization. Extensive experiments show that Tarsier2-7B consistently outperforms leading proprietary models, including GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K benchmark, Tarsier2-7B improves F1 by 2.8\\% over GPT-4o and 5.8\\% over Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6\\% performance advantage over GPT-4o and +24.9\\% over Gemini-1.5-Pro. Tarsier2-7B also sets new state-of-the-art results across 15 public benchmarks, spanning tasks such as video question-answering, video grounding, hallucination test, and embodied question-answering, demonstrating its versatility as a robust generalist vision-language model.', 'score': 5, 'issue_id': 1674, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '54780a4b6f93fb10', 'authors': ['Liping Yuan', 'Jiawei Wang', 'Haomiao Sun', 'Yuchen Zhang', 'Yuan Lin'], 'affiliations': ['ByteDance Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.07888.jpg', 'data': {'categories': ['#dataset', '#training', '#cv', '#hallucinations', '#optimization', '#video', '#benchmark'], 'emoji': '🎥', 'ru': {'title': 'Tarsier2: Революция в понимании видео искусственным интеллектом', 'desc': 'Tarsier2 - это современная крупномасштабная модель для понимания видео и языка (LVLM), разработанная для создания детальных и точных описаний видео. Модель достигает значительных улучшений благодаря увеличению объема обучающих данных, точной временной синхронизации при тонкой настройке и применению обучения с предпочтениями (DPO). Tarsier2-7B превосходит ведущие проприетарные модели, такие как GPT-4o и Gemini 1.5 Pro, в задачах детального описания видео. 
Модель также устанавливает новые рекорды в 15 публичных бенчмарках, демонстрируя свою универсальность как надежная модель общего назначения для понимания видео и языка.'}, 'en': {'title': 'Tarsier2: Redefining Video Understanding with Advanced LVLM Technology', 'desc': "Tarsier2 is a cutting-edge large vision-language model (LVLM) that excels in generating precise and detailed descriptions of videos while showcasing advanced video comprehension skills. The model's improvements stem from three main enhancements: increasing the pre-training dataset from 11 million to 40 million video-text pairs, implementing fine-grained temporal alignment during fine-tuning, and utilizing model-based sampling for preference data construction with DPO training for optimization. Extensive testing reveals that Tarsier2-7B surpasses top proprietary models like GPT-4o and Gemini 1.5 Pro in video description tasks, achieving notable F1 score improvements on the DREAM-1K benchmark. Additionally, Tarsier2-7B sets new records across 15 public benchmarks, proving its effectiveness in various tasks such as video question-answering and video grounding."}, 'zh': {'title': 'Tarsier2:视频描述的新标杆', 'desc': 'Tarsier2是一种先进的大型视觉语言模型,专门用于生成详细且准确的视频描述,同时具备出色的视频理解能力。该模型通过三个关键升级实现了显著进步:首先,预训练数据从1100万对视频文本扩展到4000万对,增加了数据的数量和多样性;其次,在监督微调过程中进行精细的时间对齐;最后,采用基于模型的采样自动构建偏好数据,并应用DPO训练进行优化。实验结果表明,Tarsier2-7B在视频描述任务中持续超越领先的专有模型,展现出其作为强大通用视觉语言模型的多样性。'}}}, {'id': 'https://huggingface.co/papers/2501.08292', 'title': 'HALoGEN: Fantastic LLM Hallucinations and Where to Find Them', 'url': 'https://huggingface.co/papers/2501.08292', 'abstract': 'Despite their impressive ability to generate high-quality and fluent text, generative large language models (LLMs) also produce hallucinations: statements that are misaligned with established world knowledge or provided input context. However, measuring hallucination can be challenging, as having humans verify model generations on-the-fly is both expensive and time-consuming. In this work, we release HALoGEN, a comprehensive hallucination benchmark consisting of: (1) 10,923 prompts for generative models spanning nine domains including programming, scientific attribution, and summarization, and (2) automatic high-precision verifiers for each use case that decompose LLM generations into atomic units, and verify each unit against a high-quality knowledge source. We use this framework to evaluate ~150,000 generations from 14 language models, finding that even the best-performing models are riddled with hallucinations (sometimes up to 86% of generated atomic facts depending on the domain). We further define a novel error classification for LLM hallucinations based on whether they likely stem from incorrect recollection of training data (Type A errors), or incorrect knowledge in training data (Type B errors), or are fabrication (Type C errors). 
We hope our framework provides a foundation to enable the principled study of why generative models hallucinate, and advances the development of trustworthy large language models.', 'score': 5, 'issue_id': 1673, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f6751d682ff824ed', 'authors': ['Abhilasha Ravichander', 'Shrusti Ghela', 'David Wadden', 'Yejin Choi'], 'affiliations': ['Google', 'NVIDIA', 'University of Washington'], 'pdf_title_img': 'assets/pdf/title_img/2501.08292.jpg', 'data': {'categories': ['#dataset', '#hallucinations', '#benchmark'], 'emoji': '🔍', 'ru': {'title': 'HALoGEN: Автоматическая проверка галлюцинаций в языковых моделях', 'desc': 'Эта статья представляет HALoGEN - комплексный инструмент для оценки галлюцинаций в больших языковых моделях (LLM). Авторы создали набор из 10,923 промптов в девяти различных областях и автоматические верификаторы высокой точности для проверки генераций LLM. Исследование выявило, что даже лучшие модели страдают от галлюцинаций, иногда до 86% сгенерированных фактов оказываются неверными. Авторы также предложили новую классификацию ошибок LLM, разделив их на три типа в зависимости от источника галлюцинаций.'}, 'en': {'title': 'HALoGEN: A Benchmark for Measuring Hallucinations in Language Models', 'desc': 'This paper introduces HALoGEN, a new benchmark designed to measure hallucinations in generative large language models (LLMs). Hallucinations refer to incorrect statements generated by these models that do not align with known facts or the given context. The benchmark includes over 10,000 prompts across various domains and employs automatic verifiers to assess the accuracy of model outputs. The study reveals that even top-performing models exhibit significant hallucinations, prompting a classification system for different types of errors to better understand their origins and improve model reliability.'}, 'zh': {'title': '揭示生成模型的幻觉问题', 'desc': '尽管生成性大型语言模型(LLMs)能够生成高质量和流畅的文本,但它们也会产生幻觉,即与已知世界知识或输入上下文不一致的陈述。测量幻觉的难度在于,实时验证模型生成的内容既昂贵又耗时。为此,我们推出了HALoGEN,这是一个全面的幻觉基准,包含10,923个跨越九个领域的提示和自动高精度验证器。我们的研究发现,即使是表现最好的模型,其生成的原子事实中也有高达86%可能存在幻觉,这为理解生成模型的幻觉提供了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.08284', 'title': 'AfriHate: A Multilingual Collection of Hate Speech and Abusive Language Datasets for African Languages', 'url': 'https://huggingface.co/papers/2501.08284', 'abstract': 'Hate speech and abusive language are global phenomena that need socio-cultural background knowledge to be understood, identified, and moderated. However, in many regions of the Global South, there have been several documented occurrences of (1) absence of moderation and (2) censorship due to the reliance on keyword spotting out of context. Further, high-profile individuals have frequently been at the center of the moderation process, while large and targeted hate speech campaigns against minorities have been overlooked. These limitations are mainly due to the lack of high-quality data in the local languages and the failure to include local communities in the collection, annotation, and moderation processes. To address this issue, we present AfriHate: a multilingual collection of hate speech and abusive language datasets in 15 African languages. Each instance in AfriHate is annotated by native speakers familiar with the local culture. We report the challenges related to the construction of the datasets and present various classification baseline results with and without using LLMs. 
The datasets, individual annotations, and hate speech and offensive language lexicons are available on https://github.com/AfriHate/AfriHate', 'score': 3, 'issue_id': 1676, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '8c76dd102740009c', 'authors': ['Shamsuddeen Hassan Muhammad', 'Idris Abdulmumin', 'Abinew Ali Ayele', 'David Ifeoluwa Adelani', 'Ibrahim Said Ahmad', 'Saminu Mohammad Aliyu', 'Nelson Odhiambo Onyango', 'Lilian D. A. Wanzare', 'Samuel Rutunda', 'Lukman Jibril Aliyu', 'Esubalew Alemneh', 'Oumaima Hourrane', 'Hagos Tesfahun Gebremichael', 'Elyas Abdi Ismail', 'Meriem Beloucif', 'Ebrahim Chekol Jibril', 'Andiswa Bukula', 'Rooweither Mabuya', 'Salomey Osei', 'Abigail Oppong', 'Tadesse Destaw Belay', 'Tadesse Kebede Guge', 'Tesfa Tegegne Asfaw', 'Chiamaka Ijeoma Chukwuneke', 'Paul Röttger', 'Seid Muhie Yimam', 'Nedjma Ousidhoum'], 'affiliations': ['Addis Ababa University', 'Al Akhawayn University', 'Bahir Dar University', 'Bayero University Kano', 'Bocconi University', 'Cardiff University', 'DSFSI, University of Pretoria', 'Digital Umuganda', 'Haramaya University', 'HausaNLP', 'Imperial College London', 'Independent Researcher', 'Instituto Politécnico Nacional', 'Istanbul Technical University', 'Lancaster University', 'Maseno University', 'Mila, McGill University & Canada CIFAR AI Chair', 'Northeastern University', 'SADiLaR', 'University of Deusto', 'University of Hamburg', 'Uppsala University', 'Wollo University'], 'pdf_title_img': 'assets/pdf/title_img/2501.08284.jpg', 'data': {'categories': ['#dataset', '#ethics', '#multilingual', '#data', '#low_resource', '#open_source'], 'emoji': '🌍', 'ru': {'title': 'AfriHate: борьба с языком вражды в Африке с помощью локальных данных и экспертизы', 'desc': 'Статья представляет AfriHate - многоязычный набор данных по языку вражды и оскорбительной лексике на 15 африканских языках. Датасет создан для решения проблемы недостатка качественных данных на местных языках и отсутствия вовлечения локальных сообществ в процессы сбора, разметки и модерации контента. Каждый пример в AfriHate размечен носителями языка, знакомыми с местной культурой. Авторы описывают трудности, связанные с созданием датасетов, и представляют результаты базовых классификационных моделей, в том числе с использованием языковых моделей.'}, 'en': {'title': 'Empowering Local Voices Against Hate Speech with AfriHate', 'desc': 'This paper addresses the challenges of identifying and moderating hate speech in the Global South, particularly in African languages. It highlights the limitations of existing moderation techniques that rely on keyword spotting without cultural context, leading to ineffective censorship and oversight of targeted hate campaigns. To combat this, the authors introduce AfriHate, a multilingual dataset of hate speech and abusive language in 15 African languages, annotated by native speakers. 
The paper also discusses the difficulties faced during dataset construction and presents baseline classification results, demonstrating the potential of using large language models (LLMs) for this task.'}, 'zh': {'title': '构建多语言仇恨言论数据集,助力社会文化理解', 'desc': '本论文介绍了AfriHate,这是一个包含15种非洲语言的仇恨言论和辱骂语言数据集。该数据集由熟悉当地文化的母语者进行标注,以解决全球南方地区在仇恨言论管理中的数据缺乏问题。研究还探讨了数据集构建过程中的挑战,并展示了使用和不使用大型语言模型(LLMs)进行分类的基线结果。所有数据集、标注和相关词汇表均可在指定网站上获取。'}}}, {'id': 'https://huggingface.co/papers/2501.08120', 'title': 'In-situ graph reasoning and knowledge expansion using Graph-PReFLexOR', 'url': 'https://huggingface.co/papers/2501.08120', 'abstract': "The pursuit of automated scientific discovery has fueled progress from symbolic logic to modern AI, forging new frontiers in reasoning and pattern recognition. Transformers function as potential systems, where every possible relationship remains latent potentiality until tasks impose constraints, akin to measurement. Yet, refining their sampling requires more than probabilistic selection: solutions must conform to specific structures or rules, ensuring consistency and the invocation of general principles. We present Graph-PReFLexOR (Graph-based Preference-based Recursive Language Modeling for Exploratory Optimization of Reasoning), a framework that combines graph reasoning with symbolic abstraction to dynamically expand domain knowledge. Inspired by reinforcement learning, Graph-PReFLexOR defines reasoning as a structured mapping, where tasks yield knowledge graphs, abstract patterns, and ultimately, final answers. Inspired by category theory, it encodes concepts as nodes and their relationships as edges, supporting hierarchical inference and adaptive learning through isomorphic representations. Demonstrations include hypothesis generation, materials design, and creative reasoning, such as discovering relationships between mythological concepts like 'thin places' with materials science. We propose a 'knowledge garden growth' strategy that integrates insights across domains, promoting interdisciplinary connections. Results with a 3-billion-parameter Graph-PReFLexOR model show superior reasoning depth and adaptability, underscoring the potential for transparent, multidisciplinary AI-driven discovery. It lays the groundwork for general autonomous reasoning solutions.", 'score': 1, 'issue_id': 1683, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': 'f8f5360d1fb8fb75', 'authors': ['Markus J. Buehler'], 'affiliations': ['Laboratory for Atomistic and Molecular Mechanics, MIT, Cambridge, MA 02139, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.08120.jpg', 'data': {'categories': ['#multimodal', '#reasoning', '#agents', '#graphs', '#rl', '#science', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Graph-PReFLexOR: Новый горизонт в автономном научном открытии', 'desc': 'Статья представляет Graph-PReFLexOR - фреймворк, объединяющий графовые рассуждения с символьной абстракцией для динамического расширения предметных знаний. Вдохновленный теорией категорий, он кодирует концепции как узлы, а их отношения как ребра, поддерживая иерархический вывод и адаптивное обучение. Демонстрации включают генерацию гипотез, дизайн материалов и творческие рассуждения, такие как обнаружение связей между мифологическими концепциями и материаловедением. 
Результаты с 3-миллиардной моделью Graph-PReFLexOR показывают превосходную глубину рассуждений и адаптивность, подчеркивая потенциал для прозрачных, междисциплинарных решений на основе ИИ.'}, 'en': {'title': 'Empowering AI with Graph-Based Reasoning for Scientific Discovery', 'desc': 'This paper introduces Graph-PReFLexOR, a novel framework that enhances automated scientific discovery by integrating graph reasoning with symbolic abstraction. It utilizes a structured mapping approach inspired by reinforcement learning, allowing for the generation of knowledge graphs and abstract patterns from various tasks. The framework supports hierarchical inference and adaptive learning, enabling it to explore interdisciplinary connections effectively. Demonstrations of its capabilities include hypothesis generation and creative reasoning, showcasing its potential for deep and adaptable reasoning in AI-driven discovery.'}, 'zh': {'title': '知识花园的成长:跨领域的智能推理', 'desc': '这篇论文介绍了一种名为Graph-PReFLexOR的框架,它结合了图推理和符号抽象,以动态扩展领域知识。该框架通过结构化映射定义推理,利用知识图谱和抽象模式来生成最终答案。它的灵感来自强化学习和范畴理论,将概念编码为节点,关系编码为边,支持层次推理和自适应学习。实验结果表明,Graph-PReFLexOR在推理深度和适应性方面表现优越,为自动化推理解决方案奠定了基础。'}}}, {'id': 'https://huggingface.co/papers/2501.07556', 'title': 'MatchAnything: Universal Cross-Modality Image Matching with Large-Scale Pre-Training', 'url': 'https://huggingface.co/papers/2501.07556', 'abstract': 'Image matching, which aims to identify corresponding pixel locations between images, is crucial in a wide range of scientific disciplines, aiding in image registration, fusion, and analysis. In recent years, deep learning-based image matching algorithms have dramatically outperformed humans in rapidly and accurately finding large amounts of correspondences. However, when dealing with images captured under different imaging modalities that result in significant appearance changes, the performance of these algorithms often deteriorates due to the scarcity of annotated cross-modal training data. This limitation hinders applications in various fields that rely on multiple image modalities to obtain complementary information. To address this challenge, we propose a large-scale pre-training framework that utilizes synthetic cross-modal training signals, incorporating diverse data from various sources, to train models to recognize and match fundamental structures across images. This capability is transferable to real-world, unseen cross-modality image matching tasks. Our key finding is that the matching model trained with our framework achieves remarkable generalizability across more than eight unseen cross-modality registration tasks using the same network weight, substantially outperforming existing methods, whether designed for generalization or tailored for specific tasks. 
This advancement significantly enhances the applicability of image matching technologies across various scientific disciplines and paves the way for new applications in multi-modality human and artificial intelligence analysis and beyond.', 'score': 0, 'issue_id': 1688, 'pub_date': '2025-01-13', 'pub_date_card': {'ru': '13 января', 'en': 'January 13', 'zh': '1月13日'}, 'hash': 'ad0c408491c545d5', 'authors': ['Xingyi He', 'Hao Yu', 'Sida Peng', 'Dongli Tan', 'Zehong Shen', 'Hujun Bao', 'Xiaowei Zhou'], 'affiliations': ['Shandong University', 'State Key Lab of CAD&CG, Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07556.jpg', 'data': {'categories': ['#synthetic', '#dataset', '#multimodal', '#transfer_learning', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Универсальное сопоставление изображений разных модальностей с помощью глубокого обучения', 'desc': 'Статья представляет новый подход к сопоставлению изображений разных модальностей с использованием глубокого обучения. Авторы предлагают фреймворк для предварительного обучения на синтетических кросс-модальных данных, что позволяет модели распознавать фундаментальные структуры в изображениях. Обученная модель демонстрирует впечатляющую обобщаемость на более чем восемь новых задач кросс-модальной регистрации, значительно превосходя существующие методы. Это достижение открывает новые возможности для применения технологий сопоставления изображений в различных научных дисциплинах.'}, 'en': {'title': 'Enhancing Image Matching Across Modalities with Synthetic Training', 'desc': "This paper presents a new framework for image matching that helps identify corresponding pixel locations between images taken in different ways. Traditional deep learning methods struggle with this task due to a lack of annotated training data for different image types. The proposed solution uses synthetic training signals from diverse sources to improve the model's ability to recognize and match structures across various images. As a result, the model shows excellent performance in unseen cross-modal tasks, making it highly useful for applications in many scientific fields."}, 'zh': {'title': '跨模态图像匹配的新突破', 'desc': '本文提出了一种大规模预训练框架,用于解决图像匹配中的跨模态问题。该框架利用合成的跨模态训练信号,结合来自不同来源的多样化数据,训练模型识别和匹配图像中的基本结构。研究发现,使用该框架训练的匹配模型在超过八个未见的跨模态配准任务中表现出显著的泛化能力,远超现有方法。此进展大大增强了图像匹配技术在各科学领域的适用性,并为多模态人类和人工智能分析的新应用铺平了道路。'}}}, {'id': 'https://huggingface.co/papers/2501.13200', 'title': 'SRMT: Shared Memory for Multi-agent Lifelong Pathfinding', 'url': 'https://huggingface.co/papers/2501.13200', 'abstract': "Multi-agent reinforcement learning (MARL) demonstrates significant progress in solving cooperative and competitive multi-agent problems in various environments. One of the principal challenges in MARL is the need for explicit prediction of the agents' behavior to achieve cooperation. To resolve this issue, we propose the Shared Recurrent Memory Transformer (SRMT) which extends memory transformers to multi-agent settings by pooling and globally broadcasting individual working memories, enabling agents to exchange information implicitly and coordinate their actions. We evaluate SRMT on the Partially Observable Multi-Agent Pathfinding problem in a toy Bottleneck navigation task that requires agents to pass through a narrow corridor and on a POGEMA benchmark set of tasks. In the Bottleneck task, SRMT consistently outperforms a variety of reinforcement learning baselines, especially under sparse rewards, and generalizes effectively to longer corridors than those seen during training. 
On POGEMA maps, including Mazes, Random, and MovingAI, SRMT is competitive with recent MARL, hybrid, and planning-based algorithms. These results suggest that incorporating shared recurrent memory into the transformer-based architectures can enhance coordination in decentralized multi-agent systems. The source code for training and evaluation is available on GitHub: https://github.com/Aloriosa/srmt.", 'score': 53, 'issue_id': 1846, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '52d8b3716543aa42', 'authors': ['Alsu Sagirova', 'Yuri Kuratov', 'Mikhail Burtsev'], 'affiliations': ['AIRI, Moscow, Russia', 'London Institute for Mathematical Sciences, London, UK', 'Neural Networks and Deep Learning Lab, MIPT, Dolgoprudny, Russia'], 'pdf_title_img': 'assets/pdf/title_img/2501.13200.jpg', 'data': {'categories': ['#training', '#games', '#rl', '#agents', '#benchmark', '#optimization'], 'emoji': '🤖', 'ru': {'title': 'SRMT: Улучшение координации в децентрализованных мультиагентных системах', 'desc': 'В статье представлен новый подход к мультиагентному обучению с подкреплением (MARL) - Shared Recurrent Memory Transformer (SRMT). SRMT расширяет возможности трансформеров с памятью для мультиагентных систем, объединяя и глобально транслируя индивидуальную рабочую память агентов. Этот метод позволяет агентам неявно обмениваться информацией и координировать свои действия. SRMT показал превосходные результаты на задаче частично наблюдаемого мультиагентного поиска пути, превзойдя базовые алгоритмы обучения с подкреплением и продемонстрировав эффективную генерализацию.'}, 'en': {'title': 'Enhancing Agent Coordination with Shared Memory Transformers', 'desc': 'This paper introduces the Shared Recurrent Memory Transformer (SRMT), a novel approach in multi-agent reinforcement learning (MARL) that enhances cooperation among agents. SRMT utilizes a memory transformer architecture to allow agents to share and broadcast their individual memories, facilitating implicit communication and coordination. The effectiveness of SRMT is demonstrated through experiments on the Partially Observable Multi-Agent Pathfinding problem, where it outperforms traditional reinforcement learning methods, particularly in scenarios with sparse rewards. The results indicate that integrating shared memory into transformer models significantly improves the performance of decentralized multi-agent systems.'}, 'zh': {'title': '共享记忆提升多智能体协调能力', 'desc': '多智能体强化学习(MARL)在解决合作和竞争的多智能体问题上取得了显著进展。本文提出了一种共享递归记忆变换器(SRMT),通过汇聚和全局广播个体工作记忆,帮助智能体隐式交换信息并协调行动。我们在部分可观察的多智能体路径规划问题上评估了SRMT,结果显示其在稀疏奖励下表现优于多种强化学习基线,并且在训练时未见过的更长走廊上也能有效泛化。SRMT在多个基准任务中与最新的MARL、混合和基于规划的算法具有竞争力,表明共享递归记忆的引入可以增强去中心化多智能体系统的协调能力。'}}}, {'id': 'https://huggingface.co/papers/2501.13629', 'title': 'Sigma: Differential Rescaling of Query, Key and Value for Efficient Language Models', 'url': 'https://huggingface.co/papers/2501.13629', 'abstract': "We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data. DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, based on their varying impacts on the model performance and efficiency indicators. 
Specifically, we (1) conduct extensive experiments that demonstrate the model's varying sensitivity to the compression of K and V components, leading to the development of differentially compressed KV, and (2) propose augmented Q to expand the Q head dimension, which enhances the model's representation capacity with minimal impact on the inference speed. Rigorous theoretical and empirical analyses reveal that DiffQKV attention significantly enhances efficiency, achieving up to a 33.36% improvement in inference speed over the conventional grouped-query attention (GQA) in long-context scenarios. We pre-train Sigma on 6T tokens from various sources, including 19.5B system domain data that we carefully collect and 1T tokens of synthesized and rewritten data. In general domains, Sigma achieves comparable performance to other state-of-the-art models. In the system domain, we introduce the first comprehensive benchmark AIMicius, where Sigma demonstrates remarkable performance across all tasks, significantly outperforming GPT-4 with an absolute improvement of up to 52.5%.", 'score': 37, 'issue_id': 1842, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'd036f75a81877ded', 'authors': ['Zhenghao Lin', 'Zihao Tang', 'Xiao Liu', 'Yeyun Gong', 'Yi Cheng', 'Qi Chen', 'Hang Li', 'Ying Xin', 'Ziyue Yang', 'Kailai Yang', 'Yu Yan', 'Xiao Liang', 'Shuai Lu', 'Yiming Huang', 'Zheheng Luo', 'Lei Qu', 'Xuan Feng', 'Yaoxiang Wang', 'Yuqing Xia', 'Feiyang Chen', 'Yuting Jiang', 'Yasen Hu', 'Hao Ni', 'Binyang Li', 'Guoshuai Zhao', 'Jui-Hao Chiang', 'Zhongxin Guo', 'Chen Lin', 'Kun Kuang', 'Wenjie Li', 'Yelong Shen', 'Jian Jiao', 'Peng Cheng', 'Mao Yang'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.13629.jpg', 'data': {'categories': ['#optimization', '#architecture', '#dataset', '#benchmark', '#long_context', '#training', '#synthetic', '#data', '#inference'], 'emoji': '🖥️', 'ru': {'title': 'Sigma: эффективная ЯМ для системной области с инновационным механизмом внимания', 'desc': 'Исследователи представили Sigma - эффективную большую языковую модель, специализированную для системной области. Модель использует новую архитектуру с DiffQKV-вниманием, которая оптимизирует компоненты Q, K и V механизма внимания для повышения эффективности. Sigma предобучена на тщательно собранных данных системной области объемом 6T токенов. На общих задачах модель показывает результаты на уровне современных аналогов, а в системной области значительно превосходит GPT-4 на новом бенчмарке AIMicius.'}, 'en': {'title': 'Sigma: Revolutionizing System Domain Language Models with DiffQKV Attention', 'desc': 'The paper presents Sigma, a specialized large language model designed for the system domain, utilizing a new architecture called DiffQKV attention. This innovative attention mechanism optimizes the Query, Key, and Value components to improve inference efficiency, particularly in long-context scenarios. Through extensive experiments, the authors show that Sigma achieves significant speed improvements, outperforming traditional models like GPT-4 in various tasks. 
The model is pre-trained on a vast dataset, including 19.5 billion tokens from the system domain, establishing a new benchmark for performance in this area.'}, 'zh': {'title': 'Sigma:系统领域的高效语言模型', 'desc': '我们介绍了Sigma,这是一个高效的大型语言模型,专门针对系统领域。它采用了一种新颖的架构,包括DiffQKV注意力机制,并在我们精心收集的系统领域数据上进行了预训练。DiffQKV注意力通过优化注意力机制中的查询(Q)、键(K)和值(V)组件,显著提高了推理效率。实验结果表明,Sigma在系统领域的表现优于GPT-4,绝对提升幅度可达52.5%。'}}}, {'id': 'https://huggingface.co/papers/2501.13918', 'title': 'Improving Video Generation with Human Feedback', 'url': 'https://huggingface.co/papers/2501.13918', 'abstract': 'Video generation has achieved significant advances through rectified flow techniques, but issues like unsmooth motion and misalignment between videos and prompts persist. In this work, we develop a systematic pipeline that harnesses human feedback to mitigate these problems and refine the video generation model. Specifically, we begin by constructing a large-scale human preference dataset focused on modern video generation models, incorporating pairwise annotations across multi-dimensions. We then introduce VideoReward, a multi-dimensional video reward model, and examine how annotations and various design choices impact its rewarding efficacy. From a unified reinforcement learning perspective aimed at maximizing reward with KL regularization, we introduce three alignment algorithms for flow-based models by extending those from diffusion models. These include two training-time strategies: direct preference optimization for flow (Flow-DPO) and reward weighted regression for flow (Flow-RWR), and an inference-time technique, Flow-NRG, which applies reward guidance directly to noisy videos. Experimental results indicate that VideoReward significantly outperforms existing reward models, and Flow-DPO demonstrates superior performance compared to both Flow-RWR and standard supervised fine-tuning methods. Additionally, Flow-NRG lets users assign custom weights to multiple objectives during inference, meeting personalized video quality needs. Project page: https://gongyeliu.github.io/videoalign.', 'score': 34, 'issue_id': 1849, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '933a6a47d8d5e20a', 'authors': ['Jie Liu', 'Gongye Liu', 'Jiajun Liang', 'Ziyang Yuan', 'Xiaokun Liu', 'Mingwu Zheng', 'Xiele Wu', 'Qiulin Wang', 'Wenyu Qin', 'Menghan Xia', 'Xintao Wang', 'Xiaohong Liu', 'Fei Yang', 'Pengfei Wan', 'Di Zhang', 'Kun Gai', 'Yujiu Yang', 'Wanli Ouyang'], 'affiliations': ['Kuaishou Technology', 'Shanghai AI Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13918.jpg', 'data': {'categories': ['#dataset', '#training', '#optimization', '#alignment', '#video', '#rlhf'], 'emoji': '🎬', 'ru': {'title': 'Улучшение генерации видео с помощью человеческой обратной связи и обучения с подкреплением', 'desc': 'Данная работа представляет систематический подход к улучшению генерации видео с использованием обратной связи от людей. Авторы создали большой датасет человеческих предпочтений для современных моделей генерации видео и разработали многомерную модель оценки видео VideoReward. Они также предложили три алгоритма выравнивания для моделей на основе потоков: Flow-DPO, Flow-RWR и Flow-NRG. 
Эксперименты показали, что VideoReward значительно превосходит существующие модели оценки, а Flow-DPO демонстрирует лучшие результаты по сравнению с другими методами.'}, 'en': {'title': 'Enhancing Video Generation with Human Feedback and Reward Models', 'desc': 'This paper addresses challenges in video generation, particularly issues of motion smoothness and alignment with prompts. The authors propose a new pipeline that utilizes human feedback to enhance video generation models. They create a large dataset of human preferences and introduce VideoReward, a model that evaluates video quality based on these preferences. The study also presents three innovative algorithms for improving flow-based video generation, demonstrating that their methods outperform existing models and allow for personalized video quality adjustments.'}, 'zh': {'title': '优化视频生成,提升用户体验', 'desc': '本研究提出了一种系统化的视频生成模型优化方法,旨在解决视频生成中的不平滑运动和视频与提示之间的错位问题。我们构建了一个大规模的人类偏好数据集,专注于现代视频生成模型,并进行了多维度的成对注释。引入的VideoReward模型通过强化学习最大化奖励,并提出了三种对齐算法,以提高流模型的性能。实验结果表明,VideoReward在奖励模型中表现优异,Flow-DPO在性能上优于其他方法,满足用户个性化的视频质量需求。'}}}, {'id': 'https://huggingface.co/papers/2501.13926', 'title': "Can We Generate Images with CoT? Let's Verify and Reinforce Image Generation Step by Step", 'url': 'https://huggingface.co/papers/2501.13926', 'abstract': 'Chain-of-Thought (CoT) reasoning has been extensively explored in large models to tackle complex understanding tasks. However, it still remains an open question whether such strategies can be applied to verifying and reinforcing image generation scenarios. In this paper, we provide the first comprehensive investigation of the potential of CoT reasoning to enhance autoregressive image generation. We focus on three techniques: scaling test-time computation for verification, aligning model preferences with Direct Preference Optimization (DPO), and integrating these techniques for complementary effects. Our results demonstrate that these approaches can be effectively adapted and combined to significantly improve image generation performance. Furthermore, given the pivotal role of reward models in our findings, we propose the Potential Assessment Reward Model (PARM) and PARM++, specialized for autoregressive image generation. PARM adaptively assesses each generation step through a potential assessment approach, merging the strengths of existing reward models, and PARM++ further introduces a reflection mechanism to self-correct the generated unsatisfactory image. Using our investigated reasoning strategies, we enhance a baseline model, Show-o, to achieve superior results, with a significant +24% improvement on the GenEval benchmark, surpassing Stable Diffusion 3 by +15%. We hope our study provides unique insights and paves a new path for integrating CoT reasoning with autoregressive image generation. 
Code and models are released at https://github.com/ZiyuGuo99/Image-Generation-CoT', 'score': 21, 'issue_id': 1841, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '61611cbe661736ff', 'authors': ['Ziyu Guo', 'Renrui Zhang', 'Chengzhuo Tong', 'Zhizheng Zhao', 'Peng Gao', 'Hongsheng Li', 'Pheng-Ann Heng'], 'affiliations': ['CUHK', 'MMLab', 'MiuLar Lab', 'Peking University', 'Shanghai AI Lab'], 'pdf_title_img': 'assets/pdf/title_img/2501.13926.jpg', 'data': {'categories': ['#rlhf', '#games', '#dataset', '#cv', '#reasoning', '#optimization', '#benchmark'], 'emoji': '🖼️', 'ru': {'title': 'Рассуждения по цепочке мыслей открывают новые горизонты в генерации изображений', 'desc': 'Статья исследует применение рассуждений по цепочке мыслей (Chain-of-Thought) для улучшения автореграссивной генерации изображений. Авторы предлагают три метода: масштабирование вычислений во время тестирования, оптимизацию предпочтений модели и интеграцию этих техник. Они также представляют новые модели вознаграждения PARM и PARM++, специально разработанные для генерации изображений. Результаты показывают значительное улучшение производительности базовой модели Show-o на 24% по сравнению с эталоном GenEval.'}, 'en': {'title': 'Enhancing Image Generation with Chain-of-Thought Reasoning', 'desc': 'This paper explores the use of Chain-of-Thought (CoT) reasoning to improve autoregressive image generation models. It investigates three main techniques: enhancing verification through increased computation, aligning model preferences using Direct Preference Optimization (DPO), and combining these methods for better outcomes. The authors introduce the Potential Assessment Reward Model (PARM) and its enhanced version PARM++, which help assess and correct image generation steps. The results show a significant performance boost, achieving a 24% improvement on the GenEval benchmark compared to previous models.'}, 'zh': {'title': '链式思维提升图像生成性能', 'desc': '本文探讨了链式思维(CoT)推理在自回归图像生成中的应用潜力。我们提出了三种技术:测试时计算的扩展、与直接偏好优化(DPO)对齐模型偏好,以及这些技术的整合。研究结果表明,这些方法可以有效结合,显著提升图像生成性能。此外,我们提出了潜力评估奖励模型(PARM)和PARM++,专门用于自回归图像生成,进一步提高了生成质量。'}}}, {'id': 'https://huggingface.co/papers/2501.13826', 'title': 'Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos', 'url': 'https://huggingface.co/papers/2501.13826', 'abstract': "Humans acquire knowledge through three cognitive stages: perceiving information, comprehending knowledge, and adapting knowledge to solve novel problems. Videos serve as an effective medium for this learning process, facilitating a progression through these cognitive stages. However, existing video benchmarks fail to systematically evaluate the knowledge acquisition capabilities in Large Multimodal Models (LMMs). To address this gap, we introduce Video-MMMU, a multi-modal, multi-disciplinary benchmark designed to assess LMMs' ability to acquire and utilize knowledge from videos. Video-MMMU features a curated collection of 300 expert-level videos and 900 human-annotated questions across six disciplines, evaluating knowledge acquisition through stage-aligned question-answer pairs: Perception, Comprehension, and Adaptation. A proposed knowledge gain metric, {\\Delta}knowledge, quantifies improvement in performance after video viewing. 
Evaluation of LMMs reveals a steep decline in performance as cognitive demands increase and highlights a significant gap between human and model knowledge acquisition, underscoring the need for methods to enhance LMMs' capability to learn and adapt from videos.", 'score': 18, 'issue_id': 1848, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '4475243a608bc988', 'authors': ['Kairui Hu', 'Penghao Wu', 'Fanyi Pu', 'Wang Xiao', 'Yuanhan Zhang', 'Xiang Yue', 'Bo Li', 'Ziwei Liu'], 'affiliations': ['Carnegie Mellon University', 'S-Lab, Nanyang Technological University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13826.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#science', '#multimodal', '#video'], 'emoji': '🎓', 'ru': {'title': 'Новый рубеж в оценке мультимодального обучения ИИ', 'desc': 'Статья представляет новый бенчмарк Video-MMMU для оценки способности больших мультимодальных моделей (LMM) приобретать знания из видео. Бенчмарк включает 300 экспертных видео и 900 вопросов по шести дисциплинам, оценивая восприятие, понимание и адаптацию знаний. Введена метрика ∆knowledge для измерения прироста знаний после просмотра видео. Результаты показывают значительный разрыв между человеческим и машинным обучением, подчеркивая необходимость улучшения LMM в области обучения на основе видео.'}, 'en': {'title': 'Enhancing Knowledge Acquisition in LMMs through Video Learning', 'desc': 'This paper introduces Video-MMMU, a benchmark designed to evaluate Large Multimodal Models (LMMs) in their ability to learn from videos. It focuses on three cognitive stages: perception, comprehension, and adaptation, using a set of 300 expert videos and 900 questions. The benchmark assesses how well LMMs can acquire knowledge through these stages, revealing a significant performance gap compared to humans. A new metric, Δknowledge, measures the improvement in LMM performance after watching videos, highlighting the need for better learning methods in these models.'}, 'zh': {'title': '提升多模态模型的视频知识获取能力', 'desc': '本文探讨了人类通过三个认知阶段获取知识的过程:感知信息、理解知识和适应知识以解决新问题。视频作为一种有效的学习媒介,能够促进这些认知阶段的进展。然而,现有的视频基准未能系统地评估大型多模态模型(LMMs)在知识获取方面的能力。为此,我们提出了Video-MMMU,这是一个多模态、多学科的基准,旨在评估LMMs从视频中获取和利用知识的能力。'}}}, {'id': 'https://huggingface.co/papers/2501.13919', 'title': 'Temporal Preference Optimization for Long-Form Video Understanding', 'url': 'https://huggingface.co/papers/2501.13919', 'abstract': 'Despite significant advancements in video large multimodal models (video-LMMs), achieving effective temporal grounding in long-form videos remains a challenge for existing models. To address this limitation, we propose Temporal Preference Optimization (TPO), a novel post-training framework designed to enhance the temporal grounding capabilities of video-LMMs through preference learning. TPO adopts a self-training approach that enables models to differentiate between well-grounded and less accurate temporal responses by leveraging curated preference datasets at two granularities: localized temporal grounding, which focuses on specific video segments, and comprehensive temporal grounding, which captures extended temporal dependencies across entire video sequences. By optimizing on these preference datasets, TPO significantly enhances temporal understanding while reducing reliance on manually annotated data. 
Extensive experiments on three long-form video understanding benchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness of TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO establishes itself as the leading 7B model on the Video-MME benchmark, underscoring the potential of TPO as a scalable and efficient solution for advancing temporal reasoning in long-form video understanding. Project page: https://ruili33.github.io/tpo_website.', 'score': 17, 'issue_id': 1843, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '6e08b56893fb98a9', 'authors': ['Rui Li', 'Xiaohan Wang', 'Yuhui Zhang', 'Zeyu Wang', 'Serena Yeung-Levy'], 'affiliations': ['Stanford University', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2501.13919.jpg', 'data': {'categories': ['#multimodal', '#long_context', '#reasoning', '#training', '#optimization', '#video', '#benchmark'], 'emoji': '⏳', 'ru': {'title': 'TPO: Улучшение временного понимания в видео-LMM без ручной разметки', 'desc': 'Статья представляет новый метод под названием Temporal Preference Optimization (TPO) для улучшения временной привязки в видео-LMM моделях. TPO использует самообучение на основе предпочтений для различения хорошо и плохо привязанных во времени ответов. Метод работает на двух уровнях: локальная временная привязка для конкретных сегментов видео и комплексная для всей последовательности. Эксперименты на трех бенчмарках для длинных видео показали эффективность TPO для улучшения временного понимания в видео-LMM.'}, 'en': {'title': 'Enhancing Temporal Understanding in Long Videos with TPO', 'desc': "This paper introduces Temporal Preference Optimization (TPO), a new framework aimed at improving how video large multimodal models (video-LMMs) understand time in long videos. TPO uses a self-training method that helps models learn to tell the difference between accurate and inaccurate timing responses by using specially curated preference datasets. These datasets focus on both specific video segments and the overall flow of the entire video, enhancing the model's ability to grasp temporal relationships. The results show that TPO significantly boosts performance on various benchmarks, making it a promising approach for better temporal reasoning in video analysis."}, 'zh': {'title': '时间偏好优化:提升视频理解的关键', 'desc': '尽管视频大型多模态模型(video-LMMs)取得了显著进展,但在长视频中实现有效的时间定位仍然是一个挑战。为了解决这个问题,我们提出了一种新的后训练框架——时间偏好优化(TPO),旨在通过偏好学习增强视频-LMMs的时间定位能力。TPO采用自我训练的方法,利用精心策划的偏好数据集,使模型能够区分准确的时间响应和不太准确的时间响应。通过在这两个层次上优化偏好数据集,TPO显著提高了时间理解能力,同时减少了对手动标注数据的依赖。'}}}, {'id': 'https://huggingface.co/papers/2501.13920', 'title': 'IMAGINE-E: Image Generation Intelligence Evaluation of State-of-the-art Text-to-Image Models', 'url': 'https://huggingface.co/papers/2501.13920', 'abstract': "With the rapid development of diffusion models, text-to-image(T2I) models have made significant progress, showcasing impressive abilities in prompt following and image generation. Recently launched models such as FLUX.1 and Ideogram2.0, along with others like Dall-E3 and Stable Diffusion 3, have demonstrated exceptional performance across various complex tasks, raising questions about whether T2I models are moving towards general-purpose applicability. 
Beyond traditional image generation, these models exhibit capabilities across a range of fields, including controllable generation, image editing, video, audio, 3D, and motion generation, as well as computer vision tasks like semantic segmentation and depth estimation. However, current evaluation frameworks are insufficient to comprehensively assess these models' performance across expanding domains. To thoroughly evaluate these models, we developed IMAGINE-E and tested six prominent models: FLUX.1, Ideogram2.0, Midjourney, Dall-E3, Stable Diffusion 3, and Jimeng. Our evaluation is divided into five key domains: structured output generation, realism and physical consistency, specific domain generation, challenging scenario generation, and multi-style creation tasks. This comprehensive assessment highlights each model's strengths and limitations, particularly the outstanding performance of FLUX.1 and Ideogram2.0 in structured and specific domain tasks, underscoring the expanding applications and potential of T2I models as foundational AI tools. This study provides valuable insights into the current state and future trajectory of T2I models as they evolve towards general-purpose usability. Evaluation scripts will be released at https://github.com/jylei16/Imagine-e.", 'score': 12, 'issue_id': 1843, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '837193826ae51376', 'authors': ['Jiayi Lei', 'Renrui Zhang', 'Xiangfei Hu', 'Weifeng Lin', 'Zhen Li', 'Wenjian Sun', 'Ruoyi Du', 'Le Zhuo', 'Zhongyu Li', 'Xinyue Li', 'Shitian Zhao', 'Ziyu Guo', 'Yiting Lu', 'Peng Gao', 'Hongsheng Li'], 'affiliations': ['CUHK MMLab', 'Shanghai AI Laboratory', 'Shanghai Jiaotong University'], 'pdf_title_img': 'assets/pdf/title_img/2501.13920.jpg', 'data': {'categories': ['#audio', '#multimodal', '#cv', '#3d', '#diffusion', '#video', '#benchmark', '#survey'], 'emoji': '🎨', 'ru': {'title': 'Новый рубеж в оценке моделей текст-изображение: путь к универсальному ИИ', 'desc': 'Эта статья посвящена оценке современных моделей преобразования текста в изображение (T2I). Авторы разработали новую систему оценки IMAGINE-E для тестирования шести ведущих моделей в пяти ключевых областях. Исследование выявило выдающиеся способности моделей FLUX.1 и Ideogram2.0 в структурированных задачах и задачах специфических доменов. Результаты подчеркивают растущий потенциал моделей T2I как универсальных инструментов искусственного интеллекта.'}, 'en': {'title': 'Evaluating the Future of Text-to-Image Models', 'desc': 'This paper discusses the advancements in text-to-image (T2I) models, particularly focusing on recent models like FLUX.1 and Ideogram2.0. These models not only excel in generating images from text prompts but also show versatility in various tasks such as image editing and video generation. The authors introduce a new evaluation framework called IMAGINE-E to assess the performance of six leading T2I models across multiple domains. 
The findings reveal that while some models perform exceptionally well in specific tasks, there is a need for better evaluation methods to fully understand their capabilities and limitations.'}, 'zh': {'title': '文本到图像模型的未来:通用性与评估的挑战', 'desc': '随着扩散模型的快速发展,文本到图像(T2I)模型在提示跟随和图像生成方面取得了显著进展。新推出的模型如FLUX.1和Ideogram2.0,以及Dall-E3和Stable Diffusion 3等,展示了在各种复杂任务中的卓越表现,提出了T2I模型是否朝着通用适用性发展的疑问。除了传统的图像生成,这些模型在可控生成、图像编辑、视频、音频、3D和运动生成等多个领域也展现了能力。为了全面评估这些模型的性能,我们开发了IMAGINE-E,并对六个主要模型进行了测试,强调了它们在不同领域的优势和局限性,特别是FLUX.1和Ideogram2.0在结构化和特定领域任务中的出色表现。'}}}, {'id': 'https://huggingface.co/papers/2501.10799', 'title': 'Step-KTO: Optimizing Mathematical Reasoning through Stepwise Binary Feedback', 'url': 'https://huggingface.co/papers/2501.10799', 'abstract': 'Large language models (LLMs) have recently demonstrated remarkable success in mathematical reasoning. Despite progress in methods like chain-of-thought prompting and self-consistency sampling, these advances often focus on final correctness without ensuring that the underlying reasoning process is coherent and reliable. This paper introduces Step-KTO, a training framework that combines process-level and outcome-level binary feedback to guide LLMs toward more trustworthy reasoning trajectories. By providing binary evaluations for both the intermediate reasoning steps and the final answer, Step-KTO encourages the model to adhere to logical progressions rather than relying on superficial shortcuts. Our experiments on challenging mathematical benchmarks show that Step-KTO significantly improves both final answer accuracy and the quality of intermediate reasoning steps. For example, on the MATH-500 dataset, Step-KTO achieves a notable improvement in Pass@1 accuracy over strong baselines. These results highlight the promise of integrating stepwise process feedback into LLM training, paving the way toward more interpretable and dependable reasoning capabilities.', 'score': 11, 'issue_id': 1842, 'pub_date': '2025-01-18', 'pub_date_card': {'ru': '18 января', 'en': 'January 18', 'zh': '1月18日'}, 'hash': 'd43b005a69156930', 'authors': ['Yen-Ting Lin', 'Di Jin', 'Tengyu Xu', 'Tianhao Wu', 'Sainbayar Sukhbaatar', 'Chen Zhu', 'Yun He', 'Yun-Nung Chen', 'Jason Weston', 'Yuandong Tian', 'Arash Rahnama', 'Sinong Wang', 'Hao Ma', 'Han Fang'], 'affiliations': ['Meta FAIR', 'Meta GenAI', 'National Taiwan University', 'UC Berkeley'], 'pdf_title_img': 'assets/pdf/title_img/2501.10799.jpg', 'data': {'categories': ['#interpretability', '#training', '#math', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Шаг за шагом к надежным математическим рассуждениям ИИ', 'desc': 'Статья представляет новый подход к обучению больших языковых моделей (LLM) для математических рассуждений. Метод Step-KTO использует бинарную обратную связь как для промежуточных шагов рассуждения, так и для конечного результата. Это позволяет модели следовать логичному ходу мыслей, а не полагаться на поверхностные шаблоны. Эксперименты на сложных математических тестах показали значительное улучшение как точности конечного ответа, так и качества промежуточных шагов рассуждения.'}, 'en': {'title': 'Enhancing Trustworthy Reasoning in LLMs with Step-KTO', 'desc': 'This paper presents Step-KTO, a new training framework for large language models (LLMs) that enhances their mathematical reasoning abilities. Unlike previous methods that focus solely on the final answer, Step-KTO provides feedback on both the reasoning process and the outcome, promoting logical coherence. 
By evaluating intermediate reasoning steps alongside the final result, the framework helps LLMs avoid shortcuts and develop more reliable reasoning paths. Experiments show that Step-KTO significantly boosts accuracy and improves the quality of reasoning in challenging mathematical tasks, indicating its potential for creating more interpretable AI systems.'}, 'zh': {'title': '提升推理可信度的Step-KTO框架', 'desc': '大型语言模型(LLMs)在数学推理方面取得了显著成功。尽管链式思维提示和自一致性采样等方法有所进展,但这些方法往往只关注最终结果的正确性,而未能确保推理过程的连贯性和可靠性。本文提出了Step-KTO,这是一种结合过程级和结果级二元反馈的训练框架,旨在引导LLMs朝着更可信的推理轨迹发展。实验结果表明,Step-KTO显著提高了最终答案的准确性和中间推理步骤的质量,展示了逐步过程反馈在LLM训练中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2501.10018', 'title': 'DiffuEraser: A Diffusion Model for Video Inpainting', 'url': 'https://huggingface.co/papers/2501.10018', 'abstract': 'Recent video inpainting algorithms integrate flow-based pixel propagation with transformer-based generation to leverage optical flow for restoring textures and objects using information from neighboring frames, while completing masked regions through visual Transformers. However, these approaches often encounter blurring and temporal inconsistencies when dealing with large masks, highlighting the need for models with enhanced generative capabilities. Recently, diffusion models have emerged as a prominent technique in image and video generation due to their impressive performance. In this paper, we introduce DiffuEraser, a video inpainting model based on stable diffusion, designed to fill masked regions with greater details and more coherent structures. We incorporate prior information to provide initialization and weak conditioning,which helps mitigate noisy artifacts and suppress hallucinations. Additionally, to improve temporal consistency during long-sequence inference, we expand the temporal receptive fields of both the prior model and DiffuEraser, and further enhance consistency by leveraging the temporal smoothing property of Video Diffusion Models. Experimental results demonstrate that our proposed method outperforms state-of-the-art techniques in both content completeness and temporal consistency while maintaining acceptable efficiency.', 'score': 10, 'issue_id': 1846, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '8ebb9334e60b0dd7', 'authors': ['Xiaowen Li', 'Haolan Xue', 'Peiran Ren', 'Liefeng Bo'], 'affiliations': ['Tongyi Lab, Alibaba Group'], 'pdf_title_img': 'assets/pdf/title_img/2501.10018.jpg', 'data': {'categories': ['#diffusion', '#video', '#long_context', '#hallucinations', '#cv'], 'emoji': '🎬', 'ru': {'title': 'DiffuEraser: Улучшенное восстановление видео с помощью диффузионных моделей', 'desc': 'DiffuEraser - это новая модель для восстановления видео на основе стабильной диффузии. Она использует предварительную информацию для инициализации и слабого кондиционирования, что помогает уменьшить шумовые артефакты. Модель расширяет временные рецептивные поля для улучшения временной согласованности при выводе длинных последовательностей. Экспериментальные результаты показывают, что DiffuEraser превосходит современные методы по полноте содержания и временной согласованности.'}, 'en': {'title': 'Enhancing Video Inpainting with Diffusion Models for Better Consistency and Detail', 'desc': 'This paper presents DiffuEraser, a novel video inpainting model that utilizes stable diffusion techniques to improve the restoration of masked regions in videos. 
By integrating prior information for initialization and weak conditioning, the model effectively reduces noise and visual artifacts. The authors enhance temporal consistency by expanding the temporal receptive fields and utilizing the smoothing properties of Video Diffusion Models. Experimental results show that DiffuEraser surpasses existing methods in terms of content completeness and temporal coherence, while also being efficient.'}, 'zh': {'title': 'DiffuEraser:提升视频修复的细节与一致性', 'desc': '本文介绍了一种名为DiffuEraser的视频修复模型,基于稳定扩散技术,旨在用更丰富的细节和更连贯的结构填补被遮挡的区域。我们通过引入先验信息来提供初始化和弱条件,从而减少噪声伪影和抑制幻觉现象。为了提高长序列推理过程中的时间一致性,我们扩展了先验模型和DiffuEraser的时间感受野,并利用视频扩散模型的时间平滑特性进一步增强一致性。实验结果表明,我们的方法在内容完整性和时间一致性方面优于现有的最先进技术,同时保持了可接受的效率。'}}}, {'id': 'https://huggingface.co/papers/2501.13554', 'title': 'One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation Using a Single Prompt', 'url': 'https://huggingface.co/papers/2501.13554', 'abstract': 'Text-to-image generation models can create high-quality images from input prompts. However, they struggle to support the consistent generation of identity-preserving requirements for storytelling. Existing approaches to this problem typically require extensive training in large datasets or additional modifications to the original model architectures. This limits their applicability across different domains and diverse diffusion model configurations. In this paper, we first observe the inherent capability of language models, coined context consistency, to comprehend identity through context with a single prompt. Drawing inspiration from the inherent context consistency, we propose a novel training-free method for consistent text-to-image (T2I) generation, termed "One-Prompt-One-Story" (1Prompt1Story). Our approach 1Prompt1Story concatenates all prompts into a single input for T2I diffusion models, initially preserving character identities. We then refine the generation process using two novel techniques: Singular-Value Reweighting and Identity-Preserving Cross-Attention, ensuring better alignment with the input description for each frame. In our experiments, we compare our method against various existing consistent T2I generation approaches to demonstrate its effectiveness through quantitative metrics and qualitative assessments. Code is available at https://github.com/byliutao/1Prompt1Story.', 'score': 8, 'issue_id': 1852, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': '15ba8f8e21d0e703', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#training', '#cv', '#story_generation', '#open_source', '#optimization', '#diffusion', '#dataset'], 'emoji': '🎬', 'ru': {'title': 'Один промпт - одна история: последовательная генерация изображений без дообучения', 'desc': "Статья представляет новый метод генерации последовательных изображений из текста под названием '1Prompt1Story'. Этот подход объединяет все промпты в единый вход для диффузионных моделей, сохраняя идентичность персонажей. Метод использует две новые техники: переоценку сингулярных значений и сохраняющее идентичность кросс-внимание. '1Prompt1Story' не требует дополнительного обучения и применим к различным конфигурациям диффузионных моделей."}, 'en': {'title': 'Consistent Storytelling in Text-to-Image Generation', 'desc': 'This paper addresses the challenge of generating consistent images that preserve character identities in text-to-image (T2I) models. 
The authors introduce a novel method called "One-Prompt-One-Story" (1Prompt1Story), which allows for the concatenation of multiple prompts into a single input, enhancing the model\'s ability to maintain character consistency. They employ two innovative techniques, Singular-Value Reweighting and Identity-Preserving Cross-Attention, to refine the image generation process and ensure alignment with the input descriptions. The proposed method is evaluated against existing approaches, showing improved performance in both quantitative metrics and qualitative assessments.'}, 'zh': {'title': '一提示一故事:提升文本到图像生成的一致性', 'desc': '本文提出了一种新的文本到图像生成方法,称为"一提示一故事"(1Prompt1Story),旨在解决生成过程中角色身份一致性的问题。该方法通过将所有提示合并为单个输入,初步保持角色身份,并利用两种新技术进行生成过程的优化。我们的方法不需要大量训练数据或对模型架构的修改,具有更广泛的适用性。实验结果表明,1Prompt1Story在定量和定性评估中均优于现有的一致性生成方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.13824', 'title': 'Hallucinations Can Improve Large Language Models in Drug Discovery', 'url': 'https://huggingface.co/papers/2501.13824', 'abstract': 'Concerns about hallucinations in Large Language Models (LLMs) have been raised by researchers, yet their potential in areas where creativity is vital, such as drug discovery, merits exploration. In this paper, we come up with the hypothesis that hallucinations can improve LLMs in drug discovery. To verify this hypothesis, we use LLMs to describe the SMILES string of molecules in natural language and then incorporate these descriptions as part of the prompt to address specific tasks in drug discovery. Evaluated on seven LLMs and five classification tasks, our findings confirm the hypothesis: LLMs can achieve better performance with text containing hallucinations. Notably, Llama-3.1-8B achieves an 18.35% gain in ROC-AUC compared to the baseline without hallucination. Furthermore, hallucinations generated by GPT-4o provide the most consistent improvements across models. Additionally, we conduct empirical analyses and a case study to investigate key factors affecting performance and the underlying reasons. Our research sheds light on the potential use of hallucinations for LLMs and offers new perspectives for future research leveraging LLMs in drug discovery.', 'score': 5, 'issue_id': 1853, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'bd66442952551d3e', 'authors': ['Shuzhou Yuan', 'Michael Färber'], 'affiliations': ['Center for Scalable Data Analytics and Artificial Intelligence (ScaDS.AI), Germany', 'Dresden University of Technology, Germany'], 'pdf_title_img': 'assets/pdf/title_img/2501.13824.jpg', 'data': {'categories': ['#healthcare', '#rlhf', '#science', '#hallucinations'], 'emoji': '💊', 'ru': {'title': 'Галлюцинации LLM ускоряют разработку лекарств', 'desc': 'Исследователи изучили потенциал галлюцинаций в больших языковых моделях (LLM) для улучшения процесса открытия новых лекарств. Они использовали LLM для описания молекул на естественном языке и включили эти описания в промпты для решения задач в области разработки лекарств. Эксперименты на семи LLM и пяти задачах классификации подтвердили гипотезу: модели показали лучшие результаты с текстами, содержащими галлюцинации. Исследование открывает новые перспективы использования LLM в фармацевтике.'}, 'en': {'title': 'Harnessing Hallucinations: Boosting Drug Discovery with LLMs', 'desc': "This paper explores the idea that hallucinations in Large Language Models (LLMs) can enhance their performance in drug discovery tasks. 
The authors hypothesize that by using LLMs to generate natural language descriptions of molecular SMILES strings, they can improve the models' effectiveness in specific classification tasks. Their experiments show that LLMs, particularly Llama-3.1-8B, achieve significant performance gains when incorporating these hallucinated descriptions, with an 18.35% increase in ROC-AUC. The study provides insights into how hallucinations can be beneficial in creative applications like drug discovery, suggesting new avenues for future research."}, 'zh': {'title': '利用幻觉提升药物发现中的大型语言模型表现', 'desc': '本研究探讨了大型语言模型(LLMs)在药物发现中的潜力,尤其是它们的幻觉现象。我们提出假设,幻觉可以提升LLMs在药物发现任务中的表现。通过将LLMs生成的分子SMILES字符串描述作为提示的一部分,我们在七个LLMs和五个分类任务上进行了评估。结果表明,包含幻觉的文本能显著提高模型性能,尤其是Llama-3.1-8B在ROC-AUC上比基线提高了18.35%。'}}}, {'id': 'https://huggingface.co/papers/2501.13452', 'title': 'EchoVideo: Identity-Preserving Human Video Generation by Multimodal Feature Fusion', 'url': 'https://huggingface.co/papers/2501.13452', 'abstract': 'Recent advancements in video generation have significantly impacted various downstream applications, particularly in identity-preserving video generation (IPT2V). However, existing methods struggle with "copy-paste" artifacts and low similarity issues, primarily due to their reliance on low-level facial image information. This dependence can result in rigid facial appearances and artifacts reflecting irrelevant details. To address these challenges, we propose EchoVideo, which employs two key strategies: (1) an Identity Image-Text Fusion Module (IITF) that integrates high-level semantic features from text, capturing clean facial identity representations while discarding occlusions, poses, and lighting variations to avoid the introduction of artifacts; (2) a two-stage training strategy, incorporating a stochastic method in the second phase to randomly utilize shallow facial information. The objective is to balance the enhancements in fidelity provided by shallow features while mitigating excessive reliance on them. This strategy encourages the model to utilize high-level features during training, ultimately fostering a more robust representation of facial identities. EchoVideo effectively preserves facial identities and maintains full-body integrity. Extensive experiments demonstrate that it achieves excellent results in generating high-quality, controllability and fidelity videos.', 'score': 5, 'issue_id': 1846, 'pub_date': '2025-01-23', 'pub_date_card': {'ru': '23 января', 'en': 'January 23', 'zh': '1月23日'}, 'hash': 'b98d987f7439b94b', 'authors': ['Jiangchuan Wei', 'Shiyue Yan', 'Wenfeng Lin', 'Boyuan Liu', 'Renjie Chen', 'Mingyu Guo'], 'affiliations': ['ByteDance'], 'pdf_title_img': 'assets/pdf/title_img/2501.13452.jpg', 'data': {'categories': ['#video'], 'emoji': '🎭', 'ru': {'title': 'EchoVideo: Новый подход к генерации видео с сохранением идентичности', 'desc': 'EchoVideo - это новый метод генерации видео с сохранением идентичности (IPT2V). Он использует модуль слияния изображения и текста (IITF) для интеграции семантических признаков и получения чистых представлений лиц. Применяется двухэтапная стратегия обучения со стохастическим использованием поверхностной информации о лицах. 
EchoVideo эффективно сохраняет идентичность лиц и целостность всего тела, демонстрируя отличные результаты в генерации качественных и контролируемых видео.'}, 'en': {'title': 'EchoVideo: Enhancing Identity Preservation in Video Generation', 'desc': "The paper introduces EchoVideo, a novel approach to identity-preserving video generation that addresses common issues like 'copy-paste' artifacts and low similarity in generated videos. It utilizes an Identity Image-Text Fusion Module (IITF) to merge high-level semantic features from text, ensuring clean facial identity representations while avoiding irrelevant details. Additionally, a two-stage training strategy is implemented, which includes a stochastic method to balance the use of shallow facial information with high-level features. This results in improved fidelity and robustness in facial identity representation, leading to high-quality video generation with better controllability."}, 'zh': {'title': 'EchoVideo:提升视频生成的身份保留与质量', 'desc': '近年来,视频生成技术的进步对身份保留视频生成(IPT2V)产生了重要影响。然而,现有方法在生成过程中常常出现“复制粘贴”伪影和低相似度的问题,这主要是因为它们依赖于低级别的面部图像信息。为了解决这些挑战,我们提出了EchoVideo,采用了身份图像-文本融合模块(IITF)和两阶段训练策略,旨在平衡浅层特征的增强与高层特征的利用。实验表明,EchoVideo在生成高质量、可控性和保真度的视频方面表现出色,有效保留了面部身份和全身完整性。'}}}, {'id': 'https://huggingface.co/papers/2501.10979', 'title': 'Control LLM: Controlled Evolution for Intelligence Retention in LLM', 'url': 'https://huggingface.co/papers/2501.10979', 'abstract': "Large Language Models (LLMs) demand significant computational resources, making it essential to enhance their capabilities without retraining from scratch. A key challenge in this domain is catastrophic forgetting (CF), which hampers performance during Continuous Pre-training (CPT) and Continuous Supervised Fine-Tuning (CSFT). We propose Control LLM, a novel approach that leverages parallel pre-trained and expanded transformer blocks, aligning their hidden-states through interpolation strategies. This method effectively preserves performance on existing tasks while seamlessly integrating new knowledge. Extensive experiments demonstrate the effectiveness of Control LLM in both CPT and CSFT. On Llama3.1-8B-Instruct, it achieves significant improvements in mathematical reasoning (+14.4% on Math-Hard) and coding performance (+10% on MBPP-PLUS). On Llama3.1-8B, it enhances multilingual capabilities (+10.6% on C-Eval, +6.8% on CMMLU, and +30.2% on CMMLU-0shot-CoT). It surpasses existing methods and achieves SOTA among open-source models tuned from the same base model, using substantially less data and compute. Crucially, these gains are realized while preserving strong original capabilities, with minimal degradation (<4.3% on MMLU) compared to >35% in open-source Math and Coding models. This approach has been successfully deployed in LinkedIn's GenAI-powered job seeker and Ads unit products. 
To support further research, we release the training and evaluation code (https://github.com/linkedin/ControlLLM) along with models trained on public datasets ( https://huggingface.co/ControlLLM) to the community.", 'score': 3, 'issue_id': 1858, 'pub_date': '2025-01-19', 'pub_date_card': {'ru': '19 января', 'en': 'January 19', 'zh': '1月19日'}, 'hash': 'dd48db75ab08337c', 'authors': ['Haichao Wei', 'Yunxiang Ren', 'Zhoutong Fu', 'Aman Lunia', 'Yi-Lin Chen', 'Alice Leung', 'Ya Xu'], 'affiliations': ['LinkedIn'], 'pdf_title_img': 'assets/pdf/title_img/2501.10979.jpg', 'data': {'categories': ['#dataset', '#training', '#open_source', '#math', '#optimization', '#multilingual'], 'emoji': '🧠', 'ru': {'title': 'Контроль над забыванием: новый метод обучения языковых моделей', 'desc': 'Control LLM - это новый подход к обучению больших языковых моделей, который решает проблему катастрофического забывания при непрерывном предобучении и дообучении. Метод использует параллельные предобученные и расширенные блоки трансформера, интерполируя их скрытые состояния. Эксперименты показали значительное улучшение производительности в математических рассуждениях, программировании и многоязычных задачах без существенной потери изначальных возможностей. Подход успешно применен в продуктах LinkedIn и открыт для исследовательского сообщества.'}, 'en': {'title': 'Enhancing LLMs Without Starting Over!', 'desc': 'This paper introduces Control LLM, a new method designed to improve the performance of Large Language Models (LLMs) without the need for complete retraining. It addresses the issue of catastrophic forgetting that occurs during Continuous Pre-training (CPT) and Continuous Supervised Fine-Tuning (CSFT) by using parallel pre-trained transformer blocks and interpolation strategies to align hidden states. The results show that Control LLM significantly enhances performance in various tasks, including mathematical reasoning and coding, while maintaining strong original capabilities. The method has been validated through extensive experiments and is made available for further research, demonstrating its effectiveness in real-world applications.'}, 'zh': {'title': '提升大型语言模型能力的新方法', 'desc': '大型语言模型(LLMs)需要大量计算资源,因此在不从头开始重新训练的情况下提升其能力至关重要。本文提出了一种新方法Control LLM,通过并行预训练和扩展的变换器块,利用插值策略对齐其隐藏状态,从而有效地保留现有任务的性能并无缝整合新知识。实验结果表明,Control LLM在连续预训练和连续监督微调中表现出色,显著提高了数学推理和编码性能,同时在多语言能力上也有显著提升。该方法在保持原有强大能力的同时,减少了数据和计算的需求,展示了其在开源模型中的领先地位。'}}}, {'id': 'https://huggingface.co/papers/2501.13075', 'title': 'Evolution and The Knightian Blindspot of Machine Learning', 'url': 'https://huggingface.co/papers/2501.13075', 'abstract': "This paper claims that machine learning (ML) largely overlooks an important facet of general intelligence: robustness to a qualitatively unknown future in an open world. Such robustness relates to Knightian uncertainty (KU) in economics, i.e. uncertainty that cannot be quantified, which is excluded from consideration in ML's key formalisms. This paper aims to identify this blind spot, argue its importance, and catalyze research into addressing it, which we believe is necessary to create truly robust open-world AI. To help illuminate the blind spot, we contrast one area of ML, reinforcement learning (RL), with the process of biological evolution. Despite staggering ongoing progress, RL still struggles in open-world situations, often failing under unforeseen situations. For example, the idea of zero-shot transferring a self-driving car policy trained only in the US to the UK currently seems exceedingly ambitious. 
In dramatic contrast, biological evolution routinely produces agents that thrive within an open world, sometimes even to situations that are remarkably out-of-distribution (e.g. invasive species; or humans, who do undertake such zero-shot international driving). Interestingly, evolution achieves such robustness without explicit theory, formalisms, or mathematical gradients. We explore the assumptions underlying RL's typical formalisms, showing how they limit RL's engagement with the unknown unknowns characteristic of an ever-changing complex world. Further, we identify mechanisms through which evolutionary processes foster robustness to novel and unpredictable challenges, and discuss potential pathways to algorithmically embody them. The conclusion is that the intriguing remaining fragility of ML may result from blind spots in its formalisms, and that significant gains may result from direct confrontation with the challenge of KU.", 'score': 3, 'issue_id': 1845, 'pub_date': '2025-01-22', 'pub_date_card': {'ru': '22 января', 'en': 'January 22', 'zh': '1月22日'}, 'hash': '5be12844b33bd729', 'authors': ['Joel Lehman', 'Elliot Meyerson', 'Tarek El-Gaaly', 'Kenneth O. Stanley', 'Tarin Ziyaee'], 'affiliations': ['Cognizant AI Labs', 'Second Nature AI'], 'pdf_title_img': 'assets/pdf/title_img/2501.13075.jpg', 'data': {'categories': ['#rl', '#agi', '#agents', '#reasoning', '#math'], 'emoji': '🧬', 'ru': {'title': 'Преодоление неизвестного: уроки эволюции для машинного обучения', 'desc': 'Статья утверждает, что машинное обучение упускает важный аспект общего интеллекта: устойчивость к качественно неизвестному будущему в открытом мире. Авторы сравнивают обучение с подкреплением (RL) и биологическую эволюцию, показывая, что RL часто не справляется с непредвиденными ситуациями. В статье исследуются предположения, лежащие в основе формализмов RL, и выявляются механизмы, с помощью которых эволюционные процессы способствуют устойчивости к новым и непредсказуемым вызовам. Авторы приходят к выводу, что хрупкость машинного обучения может быть результатом слепых пятен в его формализмах, и значительные улучшения могут быть достигнуты путем прямого противостояния проблеме неопределенности Найта.'}, 'en': {'title': 'Bridging the Gap: Enhancing ML Robustness through Evolutionary Insights', 'desc': 'This paper highlights a critical gap in machine learning (ML) regarding its ability to handle unknown future scenarios, which is essential for general intelligence. It draws parallels between reinforcement learning (RL) and biological evolution, emphasizing that while RL struggles with unforeseen situations, evolution naturally adapts to them. The authors argue that current ML formalisms overlook Knightian uncertainty, which limits the robustness of AI systems in open-world environments. They propose that by understanding and integrating evolutionary mechanisms, ML can improve its resilience to unpredictable challenges.'}, 'zh': {'title': '机器学习需面对未知不确定性挑战', 'desc': '这篇论文指出,机器学习(ML)在处理开放世界中的未知未来时,忽视了一个重要方面:对未知不确定性的鲁棒性。作者将这种鲁棒性与经济学中的奈特不确定性(Knightian Uncertainty)相联系,认为这是机器学习关键形式化中被排除的因素。通过对比强化学习(RL)与生物进化过程,论文强调了RL在开放世界情境中的局限性,并探讨了生物进化如何在没有明确理论的情况下,培养出适应复杂环境的能力。最后,作者认为,机器学习的脆弱性可能源于其形式化中的盲点,直接面对奈特不确定性挑战可能会带来显著的进步。'}}}, {'id': 'https://huggingface.co/papers/2501.13124', 'title': 'Debate Helps Weak-to-Strong Generalization', 'url': 'https://huggingface.co/papers/2501.13124', 'abstract': 'Common methods for aligning already-capable models with desired behavior rely on the ability of humans to provide supervision. 
However, future superhuman models will surpass the capability of humans. Therefore, humans will only be able to weakly supervise superhuman models. This expected deficiency of human evaluation would weaken the safety of future AI systems. Scalable oversight and weak-to-strong generalization are two complementary approaches to tackle this issue. In this paper, we attempt to combine the strengths of these two approaches to further improve alignment. Specifically, we investigate ways of improving human supervision with a strong pretrained model and then supervise the strong model with enhanced weak human supervision. To make iterative empirical progress, we consider an analogy: can we use a strong model to improve weak model supervision and then use it to supervise the strong model? We empirically test it by finetuning a small weak model on ground truth labels with the additional help from a large strong model, and then finetuning the strong model on labels generated by the weak model. We find that debate can assist a weak model in extracting trustworthy information from an untrustworthy strong model, which provides leverage as context on samples when training a weak model. We also show that an ensemble of weak models helps exploit long arguments generated by strong model debaters and obtain a more robust supervision estimate. Extensive experiments on the OpenAI weak-to-strong NLP benchmarks show that the combination approach leads to better alignment, which indicates that debate has the potential to help weak-to-strong generalization.', 'score': 3, 'issue_id': 1843, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'cacd0d01e3d119ee', 'authors': ['Hao Lang', 'Fei Huang', 'Yongbin Li'], 'affiliations': ['Tongyi Lab, Alibaba Inc.'], 'pdf_title_img': 'assets/pdf/title_img/2501.13124.jpg', 'data': {'categories': ['#alignment', '#training', '#rlhf'], 'emoji': '🤖', 'ru': {'title': 'Улучшение контроля над ИИ: от слабого к сильному', 'desc': 'Эта статья исследует методы улучшения контроля над сверхчеловеческими моделями искусственного интеллекта. Авторы предлагают комбинированный подход, используя сильную предобученную модель для улучшения слабого человеческого надзора, а затем применяя этот улучшенный надзор для обучения сильной модели. Эксперименты показывают, что метод дебатов помогает слабой модели извлекать достоверную информацию из ненадежной сильной модели. Результаты на бенчмарках OpenAI демонстрируют, что комбинированный подход приводит к лучшему выравниванию моделей с желаемым поведением.'}, 'en': {'title': 'Enhancing AI Alignment through Model Debate and Supervision', 'desc': 'This paper addresses the challenge of aligning superhuman AI models with desired behaviors, given that human supervision may be insufficient. It proposes a novel approach that combines scalable oversight with weak-to-strong generalization to enhance model alignment. The authors explore how a strong pretrained model can improve the supervision of a weak model, and in turn, how the weak model can provide valuable feedback to the strong model. 
Their experiments demonstrate that using debate between models can help extract reliable information, leading to improved alignment and performance on NLP tasks.'}, 'zh': {'title': '利用辩论提升AI模型的监督能力', 'desc': '本文探讨了如何在未来超人类模型的监督下改善人类的监督能力。由于人类的监督能力有限,未来的AI系统可能会面临安全性问题。我们提出了一种结合可扩展监督和弱到强泛化的方法,通过强大的预训练模型来增强人类的监督。实验结果表明,辩论可以帮助弱模型从强模型中提取可靠信息,从而提高监督的有效性。'}}}, {'id': 'https://huggingface.co/papers/2501.11858', 'title': 'EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents', 'url': 'https://huggingface.co/papers/2501.11858', 'abstract': 'Multimodal Large Language Models (MLLMs) have shown significant advancements, providing a promising future for embodied agents. Existing benchmarks for evaluating MLLMs primarily utilize static images or videos, limiting assessments to non-interactive scenarios. Meanwhile, existing embodied AI benchmarks are task-specific and not diverse enough, which do not adequately evaluate the embodied capabilities of MLLMs. To address this, we propose EmbodiedEval, a comprehensive and interactive evaluation benchmark for MLLMs with embodied tasks. EmbodiedEval features 328 distinct tasks within 125 varied 3D scenes, each of which is rigorously selected and annotated. It covers a broad spectrum of existing embodied AI tasks with significantly enhanced diversity, all within a unified simulation and evaluation framework tailored for MLLMs. The tasks are organized into five categories: navigation, object interaction, social interaction, attribute question answering, and spatial question answering to assess different capabilities of the agents. We evaluated the state-of-the-art MLLMs on EmbodiedEval and found that they have a significant shortfall compared to human level on embodied tasks. Our analysis demonstrates the limitations of existing MLLMs in embodied capabilities, providing insights for their future development. We open-source all evaluation data and simulation framework at https://github.com/thunlp/EmbodiedEval.', 'score': 2, 'issue_id': 1862, 'pub_date': '2025-01-21', 'pub_date_card': {'ru': '21 января', 'en': 'January 21', 'zh': '1月21日'}, 'hash': 'af76793f3055f7e0', 'authors': ['Zhili Cheng', 'Yuge Tu', 'Ran Li', 'Shiqi Dai', 'Jinyi Hu', 'Shengding Hu', 'Jiahao Li', 'Yang Shi', 'Tianyu Yu', 'Weize Chen', 'Lei Shi', 'Maosong Sun'], 'affiliations': ['Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.11858.jpg', 'data': {'categories': ['#3d', '#multimodal', '#benchmark', '#games', '#open_source', '#agents'], 'emoji': '🤖', 'ru': {'title': 'EmbodiedEval: Новый рубеж в оценке воплощенных возможностей MLLM', 'desc': 'Статья представляет новый комплексный бенчмарк EmbodiedEval для оценки мультимодальных больших языковых моделей (MLLM) в задачах воплощенного искусственного интеллекта. EmbodiedEval включает 328 разнообразных задач в 125 трехмерных сценах, охватывающих навигацию, взаимодействие с объектами, социальное взаимодействие и ответы на вопросы. Оценка современных MLLM на EmbodiedEval выявила значительное отставание от человеческого уровня в воплощенных задачах. Результаты демонстрируют ограничения существующих MLLM и предоставляют insights для их дальнейшего развития.'}, 'en': {'title': 'Empowering MLLMs with Interactive Evaluation for Embodied Tasks', 'desc': "This paper introduces EmbodiedEval, a new evaluation benchmark designed for Multimodal Large Language Models (MLLMs) in the context of embodied tasks. 
Unlike previous benchmarks that rely on static images or videos, EmbodiedEval offers a diverse set of 328 interactive tasks across 125 3D scenes, allowing for a more comprehensive assessment of MLLMs' capabilities. The tasks are categorized into five areas: navigation, object interaction, social interaction, attribute question answering, and spatial question answering, which helps evaluate different aspects of embodied AI. The findings reveal that current MLLMs fall short of human performance in these tasks, highlighting the need for further advancements in their embodied capabilities."}, 'zh': {'title': '全面评估多模态大型语言模型的具身能力', 'desc': '多模态大型语言模型(MLLMs)在智能体领域取得了显著进展,但现有的评估基准主要依赖静态图像或视频,限制了对交互场景的评估。为了更全面地评估MLLMs的能力,我们提出了EmbodiedEval,这是一个包含328个任务和125个多样化3D场景的互动评估基准。该基准涵盖了导航、物体交互、社交互动等五大类任务,旨在全面评估智能体的不同能力。通过对最先进的MLLMs进行评估,我们发现它们在具身任务上与人类水平存在显著差距,揭示了现有模型的局限性,为未来的发展提供了重要见解。'}}}, {'id': 'https://huggingface.co/papers/2501.10283', 'title': 'GSTAR: Gaussian Surface Tracking and Reconstruction', 'url': 'https://huggingface.co/papers/2501.10283', 'abstract': '3D Gaussian Splatting techniques have enabled efficient photo-realistic rendering of static scenes. Recent works have extended these approaches to support surface reconstruction and tracking. However, tracking dynamic surfaces with 3D Gaussians remains challenging due to complex topology changes, such as surfaces appearing, disappearing, or splitting. To address these challenges, we propose GSTAR, a novel method that achieves photo-realistic rendering, accurate surface reconstruction, and reliable 3D tracking for general dynamic scenes with changing topology. Given multi-view captures as input, GSTAR binds Gaussians to mesh faces to represent dynamic objects. For surfaces with consistent topology, GSTAR maintains the mesh topology and tracks the meshes using Gaussians. In regions where topology changes, GSTAR adaptively unbinds Gaussians from the mesh, enabling accurate registration and the generation of new surfaces based on these optimized Gaussians. Additionally, we introduce a surface-based scene flow method that provides robust initialization for tracking between frames. Experiments demonstrate that our method effectively tracks and reconstructs dynamic surfaces, enabling a range of applications. Our project page with the code release is available at https://eth-ait.github.io/GSTAR/.', 'score': 1, 'issue_id': 1847, 'pub_date': '2025-01-17', 'pub_date_card': {'ru': '17 января', 'en': 'January 17', 'zh': '1月17日'}, 'hash': '2ce1394526d61cff', 'authors': ['Chengwei Zheng', 'Lixin Xue', 'Juan Zarate', 'Jie Song'], 'affiliations': ['ETH Zurich', 'HKUST', 'HKUST(GZ)'], 'pdf_title_img': 'assets/pdf/title_img/2501.10283.jpg', 'data': {'categories': ['#3d'], 'emoji': '🌊', 'ru': {'title': 'GSTAR: Гауссово сплаттинг для динамических 3D-сцен', 'desc': 'GSTAR - это новый метод, который позволяет достичь фотореалистичного рендеринга, точной реконструкции поверхности и надежного 3D-трекинга для динамических сцен с изменяющейся топологией. Метод связывает гауссианы с гранями меша для представления динамических объектов и адаптивно отвязывает их в областях с изменяющейся топологией. GSTAR также вводит метод поверхностного потока сцены для надежной инициализации трекинга между кадрами. 
Эксперименты показывают эффективность метода в отслеживании и реконструкции динамических поверхностей.'}, 'en': {'title': 'GSTAR: Dynamic Surface Tracking with 3D Gaussian Splatting', 'desc': 'The paper presents GSTAR, a new method for rendering and tracking dynamic surfaces using 3D Gaussian Splatting. It effectively handles changes in surface topology, such as when surfaces appear or disappear, by binding Gaussians to mesh faces. For consistent topologies, GSTAR maintains the mesh structure, while it adaptively unbinds Gaussians in areas with topology changes to ensure accurate surface reconstruction. The method also includes a surface-based scene flow technique for improved tracking across frames, demonstrating its effectiveness in various applications.'}, 'zh': {'title': 'GSTAR:动态场景中的高效3D跟踪与重建', 'desc': '3D高斯点技术使得静态场景的照片级真实感渲染变得高效。最近的研究扩展了这些方法,以支持表面重建和跟踪。然而,使用3D高斯点跟踪动态表面仍然面临挑战,因为表面可能出现、消失或分裂。为了解决这些问题,我们提出了GSTAR,这是一种新方法,能够在拓扑变化的动态场景中实现照片级真实感渲染、准确的表面重建和可靠的3D跟踪。'}}}, {'id': 'https://huggingface.co/papers/2501.08828', 'title': 'MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents', 'url': 'https://huggingface.co/papers/2501.08828', 'abstract': 'Multi-modal document retrieval is designed to identify and retrieve various forms of multi-modal content, such as figures, tables, charts, and layout information from extensive documents. Despite its significance, there is a notable lack of a robust benchmark to effectively evaluate the performance of systems in multi-modal document retrieval. To address this gap, this work introduces a new benchmark, named as MMDocIR, encompassing two distinct tasks: page-level and layout-level retrieval. The former focuses on localizing the most relevant pages within a long document, while the latter targets the detection of specific layouts, offering a more fine-grained granularity than whole-page analysis. A layout can refer to a variety of elements such as textual paragraphs, equations, figures, tables, or charts. The MMDocIR benchmark comprises a rich dataset featuring expertly annotated labels for 1,685 questions and bootstrapped labels for 173,843 questions, making it a pivotal resource for advancing multi-modal document retrieval for both training and evaluation. Through rigorous experiments, we reveal that (i) visual retrievers significantly outperform their text counterparts, (ii) MMDocIR train set can effectively benefit the training process of multi-modal document retrieval and (iii) text retrievers leveraging on VLM-text perform much better than those using OCR-text. These findings underscore the potential advantages of integrating visual elements for multi-modal document retrieval.', 'score': 17, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'bf9a6df8fecd4ec1', 'authors': ['Kuicai Dong', 'Yujing Chang', 'Xin Deik Goh', 'Dexun Li', 'Ruiming Tang', 'Yong Liu'], 'affiliations': ['Noahs Ark Lab, Huawei'], 'pdf_title_img': 'assets/pdf/title_img/2501.08828.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#dataset'], 'emoji': '🔍', 'ru': {'title': 'MMDocIR: Новый стандарт для мультимодального поиска документов', 'desc': 'Статья представляет новый бенчмарк MMDocIR для оценки систем мультимодального поиска документов. Бенчмарк включает две задачи: поиск на уровне страниц и на уровне макетов. Датасет содержит экспертные аннотации для 1,685 вопросов и автоматически сгенерированные метки для 173,843 вопросов. 
Эксперименты показали, что визуальные ретриверы превосходят текстовые, а использование визуально-языковых моделей дает лучшие результаты, чем OCR-текст.'}, 'en': {'title': 'Unlocking Multi-Modal Document Retrieval with MMDocIR', 'desc': 'This paper addresses the challenge of multi-modal document retrieval, which involves finding various types of content like figures and tables in large documents. It introduces a new benchmark called MMDocIR, which includes two tasks: page-level retrieval for finding relevant pages and layout-level retrieval for identifying specific layouts within those pages. The benchmark is supported by a comprehensive dataset with thousands of annotated questions, facilitating better training and evaluation of retrieval systems. The results show that visual retrieval methods outperform text-based methods, highlighting the importance of incorporating visual information in multi-modal retrieval tasks.'}, 'zh': {'title': '多模态文档检索的新基准MMDocIR', 'desc': '多模态文档检索旨在从大量文档中识别和提取各种形式的内容,如图形、表格、图表和布局信息。尽管其重要性显著,但目前缺乏有效评估多模态文档检索系统性能的基准。为了解决这一问题,本文提出了一个新的基准MMDocIR,包含页面级和布局级检索两个任务。通过严格的实验,我们发现视觉检索器的表现显著优于文本检索器,且MMDocIR训练集能有效促进多模态文档检索的训练过程。'}}}, {'id': 'https://huggingface.co/papers/2501.08365', 'title': 'Towards Best Practices for Open Datasets for LLM Training', 'url': 'https://huggingface.co/papers/2501.08365', 'abstract': 'Many AI companies are training their large language models (LLMs) on data without the permission of the copyright owners. The permissibility of doing so varies by jurisdiction: in countries like the EU and Japan, this is allowed under certain restrictions, while in the United States, the legal landscape is more ambiguous. Regardless of the legal status, concerns from creative producers have led to several high-profile copyright lawsuits, and the threat of litigation is commonly cited as a reason for the recent trend towards minimizing the information shared about training datasets by both corporate and public interest actors. This trend in limiting data information causes harm by hindering transparency, accountability, and innovation in the broader ecosystem by denying researchers, auditors, and impacted individuals access to the information needed to understand AI models. While this could be mitigated by training language models on open access and public domain data, at the time of writing, there are no such models (trained at a meaningful scale) due to the substantial technical and sociological challenges in assembling the necessary corpus. These challenges include incomplete and unreliable metadata, the cost and complexity of digitizing physical records, and the diverse set of legal and technical skills required to ensure relevance and responsibility in a quickly changing landscape. 
Building towards a future where AI systems can be trained on openly licensed data that is responsibly curated and governed requires collaboration across legal, technical, and policy domains, along with investments in metadata standards, digitization, and fostering a culture of openness.', 'score': 16, 'issue_id': 1702, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '90686080aa439157', 'authors': ['Stefan Baack', 'Stella Biderman', 'Kasia Odrozek', 'Aviya Skowron', 'Ayah Bdeir', 'Jillian Bommarito', 'Jennifer Ding', 'Maximilian Gahntz', 'Paul Keller', 'Pierre-Carl Langlais', 'Greg Lindahl', 'Sebastian Majstorovic', 'Nik Marda', 'Guilherme Penedo', 'Maarten Van Segbroeck', 'Jennifer Wang', 'Leandro von Werra', 'Mitchell Baker', 'Julie Belião', 'Kasia Chmielinski', 'Marzieh Fadaee', 'Lisa Gutermuth', 'Hynek Kydlíček', 'Greg Leppert', 'EM Lewis-Jong', 'Solana Larsen', 'Shayne Longpre', 'Angela Oduor Lungati', 'Cullen Miller', 'Victor Miller', 'Max Ryabinin', 'Kathleen Siminyu', 'Andrew Strait', 'Mark Surman', 'Anna Tumadóttir', 'Maurice Weber', 'Rebecca Weiss', 'Lee White', 'Thomas Wolf'], 'affiliations': [], 'pdf_title_img': 'assets/pdf/title_img/2501.08365.jpg', 'data': {'categories': ['#open_source', '#ethics', '#data', '#dataset'], 'emoji': '📚', 'ru': {'title': 'Открытые данные для ответственного ИИ: вызовы и перспективы', 'desc': 'Статья рассматривает проблему обучения больших языковых моделей (LLM) на данных без разрешения правообладателей. Анализируются юридические аспекты этой практики в разных странах и связанные с ней судебные иски. Отмечается тенденция к ограничению информации о наборах данных для обучения, что негативно влияет на прозрачность и подотчетность в сфере ИИ. Обсуждаются вызовы создания моделей на основе открытых данных, включая технические и социологические аспекты.'}, 'en': {'title': 'Towards Transparent AI: The Need for Open Data Collaboration', 'desc': 'This paper discusses the legal and ethical challenges surrounding the training of large language models (LLMs) using copyrighted data without permission. It highlights the varying legal frameworks across different countries, particularly the ambiguity in the United States compared to more defined rules in the EU and Japan. The authors argue that the trend of limiting information about training datasets undermines transparency and innovation in AI, making it difficult for researchers and stakeholders to understand the models. They propose that a shift towards using open access and public domain data is necessary, but emphasize the need for collaboration and investment in infrastructure to overcome the technical and sociological barriers involved.'}, 'zh': {'title': '推动开放许可数据的AI训练未来', 'desc': '许多人工智能公司在没有版权拥有者许可的情况下训练大型语言模型(LLMs)。不同国家对这种做法的合法性有不同的规定,欧盟和日本在某些限制下允许,而美国的法律环境则较为模糊。这种限制数据共享的信息趋势,妨碍了透明度、问责制和创新,影响了研究人员和受影响个体获取理解AI模型所需的信息。为了实现未来能够在开放许可数据上训练AI系统,需要在法律、技术和政策领域进行合作,并投资于元数据标准和数字化。'}}}, {'id': 'https://huggingface.co/papers/2501.08983', 'title': 'CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities', 'url': 'https://huggingface.co/papers/2501.08983', 'abstract': '3D scene generation has garnered growing attention in recent years and has made significant progress. Generating 4D cities is more challenging than 3D scenes due to the presence of structurally complex, visually diverse objects like buildings and vehicles, and heightened human sensitivity to distortions in urban environments. 
To tackle these issues, we propose CityDreamer4D, a compositional generative model specifically tailored for generating unbounded 4D cities. Our main insights are 1) 4D city generation should separate dynamic objects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2) all objects in the 4D scene should be composed of different types of neural fields for buildings, vehicles, and background stuff. Specifically, we propose Traffic Scenario Generator and Unbounded Layout Generator to produce dynamic traffic scenarios and static city layouts using a highly compact BEV representation. Objects in 4D cities are generated by combining stuff-oriented and instance-oriented neural fields for background stuff, buildings, and vehicles. To suit the distinct characteristics of background stuff and instances, the neural fields employ customized generative hash grids and periodic positional embeddings as scene parameterizations. Furthermore, we offer a comprehensive suite of datasets for city generation, including OSM, GoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world city layouts, while the Google Earth and CityTopia datasets deliver large-scale, high-quality city imagery complete with 3D instance annotations. Leveraging its compositional design, CityDreamer4D supports a range of downstream applications, such as instance editing, city stylization, and urban simulation, while delivering state-of-the-art performance in generating realistic 4D cities.', 'score': 11, 'issue_id': 1698, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '39cd0826d4232170', 'authors': ['Haozhe Xie', 'Zhaoxi Chen', 'Fangzhou Hong', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore 637335'], 'pdf_title_img': 'assets/pdf/title_img/2501.08983.jpg', 'data': {'categories': ['#3d', '#dataset'], 'emoji': '🏙️', 'ru': {'title': 'Композиционная генерация 4D-городов с разделением динамики и статики', 'desc': 'CityDreamer4D - это генеративная модель для создания неограниченных 4D-городов. Она разделяет генерацию динамических объектов (например, транспорта) и статических сцен (зданий, дорог). Модель использует разные типы нейронных полей для зданий, транспорта и фона, применяя специализированные генеративные хеш-сетки и периодические позиционные эмбеддинги. CityDreamer4D демонстрирует передовые результаты в генерации реалистичных 4D-городов и поддерживает различные приложения, включая редактирование объектов и городское моделирование.'}, 'en': {'title': 'Revolutionizing Urban Landscapes: CityDreamer4D for Dynamic City Generation', 'desc': "This paper introduces CityDreamer4D, a generative model designed for creating unbounded 4D cities, which include both static and dynamic elements. The model distinguishes between dynamic objects like vehicles and static structures such as buildings, using specialized neural fields for each type. It employs a compact bird's-eye view (BEV) representation to generate realistic traffic scenarios and city layouts. 
Additionally, the paper provides extensive datasets for training, enabling various applications like instance editing and urban simulation while achieving high-quality results in 4D city generation."}, 'zh': {'title': 'CityDreamer4D:无限4D城市生成的新突破', 'desc': '近年来,3D场景生成受到了越来越多的关注,并取得了显著进展。生成4D城市比3D场景更具挑战性,因为城市环境中存在结构复杂、视觉多样的物体,如建筑和车辆。为了解决这些问题,我们提出了CityDreamer4D,这是一种专门用于生成无限4D城市的组合生成模型。该模型通过将动态物体与静态场景分离,并使用不同类型的神经场来组合城市中的所有物体,从而实现高质量的城市生成。'}}}, {'id': 'https://huggingface.co/papers/2501.08994', 'title': 'RepVideo: Rethinking Cross-Layer Representation for Video Generation', 'url': 'https://huggingface.co/papers/2501.08994', 'abstract': 'Video generation has achieved remarkable progress with the introduction of diffusion models, which have significantly improved the quality of generated videos. However, recent research has primarily focused on scaling up model training, while offering limited insights into the direct impact of representations on the video generation process. In this paper, we initially investigate the characteristics of features in intermediate layers, finding substantial variations in attention maps across different layers. These variations lead to unstable semantic representations and contribute to cumulative differences between features, which ultimately reduce the similarity between adjacent frames and negatively affect temporal coherence. To address this, we propose RepVideo, an enhanced representation framework for text-to-video diffusion models. By accumulating features from neighboring layers to form enriched representations, this approach captures more stable semantic information. These enhanced representations are then used as inputs to the attention mechanism, thereby improving semantic expressiveness while ensuring feature consistency across adjacent frames. Extensive experiments demonstrate that our RepVideo not only significantly enhances the ability to generate accurate spatial appearances, such as capturing complex spatial relationships between multiple objects, but also improves temporal consistency in video generation.', 'score': 10, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '0d164d45ba2a5c71', 'authors': ['Chenyang Si', 'Weichen Fan', 'Zhengyao Lv', 'Ziqi Huang', 'Yu Qiao', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University, Singapore, 639798', 'Shanghai Artificial Intelligence Laboratory, China'], 'pdf_title_img': 'assets/pdf/title_img/2501.08994.jpg', 'data': {'categories': ['#video', '#diffusion', '#architecture'], 'emoji': '🎬', 'ru': {'title': 'RepVideo: стабильные представления для качественной генерации видео', 'desc': 'Статья представляет RepVideo - улучшенную систему представлений для диффузионных моделей генерации видео на основе текста. Авторы обнаружили, что вариации в картах внимания между слоями приводят к нестабильным семантическим представлениям и снижают согласованность соседних кадров. RepVideo решает эту проблему путем накопления признаков из соседних слоев для создания обогащенных представлений. Эксперименты показывают, что RepVideo значительно улучшает способность генерировать точные пространственные образы и повышает временную согласованность при генерации видео.'}, 'en': {'title': 'Enhancing Video Generation with Stable Representations', 'desc': "This paper presents RepVideo, a new framework designed to improve video generation using text-to-video diffusion models. 
It identifies issues with unstable semantic representations caused by variations in attention maps across different layers of the model. By accumulating features from neighboring layers, RepVideo creates more stable and enriched representations that enhance the model's ability to maintain consistency between adjacent frames. The results show that RepVideo significantly improves both the spatial accuracy of generated videos and their temporal coherence, leading to more realistic video outputs."}, 'zh': {'title': '提升视频生成质量的RepVideo框架', 'desc': '本论文探讨了扩散模型在视频生成中的应用,提出了RepVideo框架以改善视频生成的质量。研究发现中间层特征的注意力图存在显著差异,这导致语义表示的不稳定性,进而影响相邻帧之间的相似性和时间一致性。RepVideo通过从相邻层累积特征,形成更丰富的表示,从而捕捉更稳定的语义信息。实验结果表明,RepVideo显著提高了生成视频的空间表现能力和时间一致性。'}}}, {'id': 'https://huggingface.co/papers/2501.07783', 'title': 'Parameter-Inverted Image Pyramid Networks for Visual Perception and Multimodal Understanding', 'url': 'https://huggingface.co/papers/2501.07783', 'abstract': 'Image pyramids are widely adopted in top-performing methods to obtain multi-scale features for precise visual perception and understanding. However, current image pyramids use the same large-scale model to process multiple resolutions of images, leading to significant computational cost. To address this challenge, we propose a novel network architecture, called Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses pretrained models (ViTs or CNNs) as branches to process multi-scale images, where images of higher resolutions are processed by smaller network branches to balance computational cost and performance. To integrate information from different spatial scales, we further propose a novel cross-branch feature interaction mechanism. To validate PIIP, we apply it to various perception models and a representative multimodal large language model called LLaVA, and conduct extensive experiments on various tasks such as object detection, segmentation, image classification and multimodal understanding. PIIP achieves superior performance compared to single-branch and existing multi-resolution approaches with lower computational cost. When applied to InternViT-6B, a large-scale vision foundation model, PIIP can improve its performance by 1%-2% on detection and segmentation with only 40%-60% of the original computation, finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and 74.5% on MMBench with only 2.8M training data. Our code is released at https://github.com/OpenGVLab/PIIP.', 'score': 5, 'issue_id': 1701, 'pub_date': '2025-01-14', 'pub_date_card': {'ru': '14 января', 'en': 'January 14', 'zh': '1月14日'}, 'hash': '87295e912b5b0670', 'authors': ['Zhaokai Wang', 'Xizhou Zhu', 'Xue Yang', 'Gen Luo', 'Hao Li', 'Changyao Tian', 'Wenhan Dou', 'Junqi Ge', 'Lewei Lu', 'Yu Qiao', 'Jifeng Dai'], 'affiliations': ['Sensetime', 'Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiao Tong University', 'The Chinese University of Hong Kong', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2501.07783.jpg', 'data': {'categories': ['#architecture', '#multimodal', '#cv'], 'emoji': '🔍', 'ru': {'title': 'Эффективные многомасштабные сети для точного визуального восприятия', 'desc': 'Статья представляет новую архитектуру нейронных сетей под названием Parameter-Inverted Image Pyramid Networks (PIIP). 
PIIP использует предобученные модели (ViT или CNN) в качестве ветвей для обработки многомасштабных изображений, где изображения с более высоким разрешением обрабатываются меньшими сетевыми ветвями для баланса вычислительных затрат и производительности. Авторы также предлагают новый механизм взаимодействия признаков между ветвями. PIIP демонстрирует превосходную производительность по сравнению с одноветвенными и существующими многоразрешающими подходами при меньших вычислительных затратах в задачах обнаружения объектов, сегментации, классификации изображений и мультимодального понимания.'}, 'en': {'title': 'Efficient Multi-Scale Processing with PIIP Networks', 'desc': 'This paper introduces Parameter-Inverted Image Pyramid Networks (PIIP), a new architecture designed to efficiently process multi-scale images for visual tasks. Unlike traditional methods that use a single large model for all resolutions, PIIP employs smaller branches for higher resolution images, reducing computational costs while maintaining performance. The architecture also features a unique cross-branch interaction mechanism to enhance feature integration across different scales. Experimental results demonstrate that PIIP outperforms existing methods in various tasks, achieving significant accuracy improvements with lower resource usage.'}, 'zh': {'title': '高效多尺度图像处理的新方法', 'desc': '本文提出了一种新的网络架构,称为参数反转图像金字塔网络(PIIP),旨在提高多尺度图像处理的效率。PIIP利用预训练模型作为分支,处理不同分辨率的图像,从而在性能和计算成本之间取得平衡。通过引入跨分支特征交互机制,PIIP能够有效整合来自不同空间尺度的信息。实验结果表明,PIIP在目标检测、分割和多模态理解等任务上表现优于现有方法,同时显著降低了计算成本。'}}}, {'id': 'https://huggingface.co/papers/2501.09012', 'title': 'Multimodal LLMs Can Reason about Aesthetics in Zero-Shot', 'url': 'https://huggingface.co/papers/2501.09012', 'abstract': "We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability shall be elicited to evaluate the aesthetics of artworks. To facilitate this investigation, we construct MM-StyleBench, a novel high-quality dataset for benchmarking artistic stylization. We then develop a principled method for human preference modeling and perform a systematic correlation analysis between MLLMs' responses and human preference. Our experiments reveal an inherent hallucination issue of MLLMs in art evaluation, associated with response subjectivity. ArtCoT is proposed, demonstrating that art-specific task decomposition and the use of concrete language boost MLLMs' reasoning ability for aesthetics. Our findings offer valuable insights into MLLMs for art and can benefit a wide range of downstream applications, such as style transfer and artistic image generation. Code available at https://github.com/songrise/MLLM4Art.", 'score': 5, 'issue_id': 1699, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'e516a920b6534cc0', 'authors': ['Ruixiang Jiang', 'Changwen Chen'], 'affiliations': ['The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2501.09012.jpg', 'data': {'categories': ['#artificial intelligence', '#reasoning', '#hallucinations', '#multimodal', '#benchmark', '#dataset'], 'emoji': '🎨', 'ru': {'title': 'Искусственный интеллект учится оценивать искусство', 'desc': 'Исследование посвящено использованию мультимодальных языковых моделей (MLLM) для оценки эстетики произведений искусства. Авторы создали набор данных MM-StyleBench для тестирования художественной стилизации и разработали метод моделирования человеческих предпочтений. 
Эксперименты выявили проблему галлюцинаций MLLM при оценке искусства, связанную с субъективностью ответов. Предложенный метод ArtCoT улучшает способность MLLM к рассуждениям об эстетике путем декомпозиции задач и использования конкретного языка.'}, 'en': {'title': 'Enhancing MLLMs for Art Evaluation through Structured Reasoning', 'desc': "This paper investigates how Multimodal Large Language Models (MLLMs) can assess the aesthetics of artworks. The authors introduce MM-StyleBench, a new dataset designed to benchmark artistic stylization. They also create a method for modeling human preferences and analyze the correlation between MLLMs' evaluations and human judgments. The study highlights a hallucination problem in MLLMs when evaluating art and proposes ArtCoT, which improves reasoning by using task decomposition and specific language, providing insights for applications like style transfer and artistic image generation."}, 'zh': {'title': '提升多模态大语言模型的艺术推理能力', 'desc': '本研究首次探讨了多模态大语言模型(MLLMs)在评估艺术作品美学时的推理能力。我们构建了一个新的高质量数据集MM-StyleBench,用于艺术风格化的基准测试。通过系统的相关性分析,我们发现MLLMs在艺术评估中存在固有的幻觉问题,且与人类偏好存在主观性关联。我们提出了ArtCoT方法,表明艺术特定任务分解和使用具体语言可以提升MLLMs的美学推理能力。'}}}, {'id': 'https://huggingface.co/papers/2501.09019', 'title': 'Ouroboros-Diffusion: Exploring Consistent Content Generation in Tuning-free Long Video Diffusion', 'url': 'https://huggingface.co/papers/2501.09019', 'abstract': "The first-in-first-out (FIFO) video diffusion, built on a pre-trained text-to-video model, has recently emerged as an effective approach for tuning-free long video generation. This technique maintains a queue of video frames with progressively increasing noise, continuously producing clean frames at the queue's head while Gaussian noise is enqueued at the tail. However, FIFO-Diffusion often struggles to keep long-range temporal consistency in the generated videos due to the lack of correspondence modeling across frames. In this paper, we propose Ouroboros-Diffusion, a novel video denoising framework designed to enhance structural and content (subject) consistency, enabling the generation of consistent videos of arbitrary length. Specifically, we introduce a new latent sampling technique at the queue tail to improve structural consistency, ensuring perceptually smooth transitions among frames. To enhance subject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA) mechanism, which aligns subjects across frames within short segments to achieve better visual coherence. Furthermore, we introduce self-recurrent guidance. This technique leverages information from all previous cleaner frames at the front of the queue to guide the denoising of noisier frames at the end, fostering rich and contextual global information interaction. 
Extensive experiments of long video generation on the VBench benchmark demonstrate the superiority of our Ouroboros-Diffusion, particularly in terms of subject consistency, motion smoothness, and temporal consistency.", 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'c4c991699f684865', 'authors': ['Jingyuan Chen', 'Fuchen Long', 'Jie An', 'Zhaofan Qiu', 'Ting Yao', 'Jiebo Luo', 'Tao Mei'], 'affiliations': ['HiDream.ai Inc.', 'University of Rochester, Rochester, NY USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.09019.jpg', 'data': {'categories': ['#benchmark', '#video', '#long_context', '#diffusion'], 'emoji': '🐍', 'ru': {'title': 'Бесконечное видео: Ouroboros-Diffusion для непрерывной генерации согласованного контента', 'desc': 'Эта статья представляет новый метод генерации видео произвольной длины под названием Ouroboros-Diffusion. Метод улучшает структурную и сюжетную согласованность видео с помощью нового подхода к выборке латентного пространства и механизма Subject-Aware Cross-Frame Attention. Авторы также вводят самоповторяющееся руководство, использующее информацию из предыдущих очищенных кадров для улучшения шумных кадров. Эксперименты на бенчмарке VBench показывают превосходство Ouroboros-Diffusion в сохранении согласованности субъектов, плавности движения и временной согласованности.'}, 'en': {'title': 'Ouroboros-Diffusion: Enhancing Long Video Consistency and Coherence', 'desc': 'The paper introduces Ouroboros-Diffusion, a new framework for improving long video generation using a pre-trained text-to-video model. It addresses the limitations of FIFO-Diffusion, particularly in maintaining long-range temporal consistency across video frames. The proposed method enhances structural consistency through a novel latent sampling technique and improves subject consistency with a Subject-Aware Cross-Frame Attention mechanism. Additionally, self-recurrent guidance is implemented to utilize information from previous frames, resulting in videos with better visual coherence and smoother transitions.'}, 'zh': {'title': 'Ouroboros-Diffusion:提升视频生成一致性的创新框架', 'desc': 'FIFO视频扩散是一种基于预训练文本到视频模型的长视频生成方法,但在生成视频时常常缺乏长时间的一致性。本文提出了Ouroboros-Diffusion框架,通过引入新的潜在采样技术和主题感知跨帧注意机制,增强了视频的结构和内容一致性。该方法确保了帧之间的平滑过渡,并通过自递归引导技术利用前面清晰帧的信息来改善后面噪声帧的去噪效果。实验结果表明,Ouroboros-Diffusion在主题一致性、运动平滑性和时间一致性方面优于现有方法。'}}}, {'id': 'https://huggingface.co/papers/2501.08809', 'title': 'XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework', 'url': 'https://huggingface.co/papers/2501.08809', 'abstract': 'In recent years, remarkable advancements in artificial intelligence-generated content (AIGC) have been achieved in the fields of image synthesis and text generation, generating content comparable to that produced by humans. However, the quality of AI-generated music has not yet reached this standard, primarily due to the challenge of effectively controlling musical emotions and ensuring high-quality outputs. This paper presents a generalized symbolic music generation framework, XMusic, which supports flexible prompts (i.e., images, videos, texts, tags, and humming) to generate emotionally controllable and high-quality symbolic music. XMusic consists of two core components, XProjector and XComposer. XProjector parses the prompts of various modalities into symbolic music elements (i.e., emotions, genres, rhythms and notes) within the projection space to generate matching music. 
XComposer contains a Generator and a Selector. The Generator generates emotionally controllable and melodious music based on our innovative symbolic music representation, whereas the Selector identifies high-quality symbolic music by constructing a multi-task learning scheme involving quality assessment, emotion recognition, and genre recognition tasks. In addition, we build XMIDI, a large-scale symbolic music dataset that contains 108,023 MIDI files annotated with precise emotion and genre labels. Objective and subjective evaluations show that XMusic significantly outperforms the current state-of-the-art methods with impressive music quality. Our XMusic has been awarded as one of the nine Highlights of Collectibles at WAIC 2023. The project homepage of XMusic is https://xmusic-project.github.io.', 'score': 4, 'issue_id': 1697, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': 'd4d018c9adb2579c', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#audio', '#story_generation', '#multimodal', '#dataset'], 'emoji': '🎵', 'ru': {'title': 'XMusic: ИИ-композитор нового поколения с управляемыми эмоциями', 'desc': 'Статья представляет XMusic - генерализованный фреймворк для генерации символической музыки, поддерживающий различные типы промптов. XMusic состоит из двух ключевых компонентов: XProjector для обработки промптов и XComposer для генерации музыки. Авторы также создали датасет XMIDI, содержащий более 100 тысяч MIDI-файлов с аннотациями эмоций и жанров. Согласно оценкам, XMusic значительно превосходит современные методы по качеству генерируемой музыки.'}, 'en': {'title': 'XMusic: Emotionally Controlled Music Generation Made Easy!', 'desc': 'This paper introduces XMusic, a new framework for generating symbolic music that can be controlled by emotional prompts. It includes two main components: XProjector, which converts various input types into musical elements, and XComposer, which generates and selects high-quality music. The framework uses a multi-task learning approach to ensure the generated music meets quality, emotional, and genre standards. Additionally, the authors created a large dataset, XMIDI, to support their research and demonstrate that XMusic outperforms existing methods in music generation.'}, 'zh': {'title': 'XMusic:情感可控的高质量音乐生成', 'desc': '近年来,人工智能生成内容(AIGC)在图像合成和文本生成领域取得了显著进展,但在音乐生成方面仍面临挑战。本文提出了一种通用的符号音乐生成框架XMusic,能够通过灵活的提示生成可控情感和高质量的符号音乐。XMusic由两个核心组件组成:XProjector和XComposer,前者将多种模态的提示解析为音乐元素,后者则生成和选择高质量的音乐。通过构建大规模的XMIDI数据集和多任务学习方案,XMusic在音乐质量上显著优于现有方法。'}}, 'authors': [], 'affiliations': []}, {'id': 'https://huggingface.co/papers/2501.08970', 'title': 'Trusted Machine Learning Models Unlock Private Inference for Problems Currently Infeasible with Cryptography', 'url': 'https://huggingface.co/papers/2501.08970', 'abstract': 'We often interact with untrusted parties. Prioritization of privacy can limit the effectiveness of these interactions, as achieving certain goals necessitates sharing private data. Traditionally, addressing this challenge has involved either seeking trusted intermediaries or constructing cryptographic protocols that restrict how much data is revealed, such as multi-party computations or zero-knowledge proofs. While significant advances have been made in scaling cryptographic approaches, they remain limited in terms of the size and complexity of applications they can be used for. 
In this paper, we argue that capable machine learning models can fulfill the role of a trusted third party, thus enabling secure computations for applications that were previously infeasible. In particular, we describe Trusted Capable Model Environments (TCMEs) as an alternative approach for scaling secure computation, where capable machine learning model(s) interact under input/output constraints, with explicit information flow control and explicit statelessness. This approach aims to achieve a balance between privacy and computational efficiency, enabling private inference where classical cryptographic solutions are currently infeasible. We describe a number of use cases that are enabled by TCME, and show that even some simple classic cryptographic problems can already be solved with TCME. Finally, we outline current limitations and discuss the path forward in implementing them.', 'score': 3, 'issue_id': 1702, 'pub_date': '2025-01-15', 'pub_date_card': {'ru': '15 января', 'en': 'January 15', 'zh': '1月15日'}, 'hash': '858fc03ac78b66c1', 'authors': ['Ilia Shumailov', 'Daniel Ramage', 'Sarah Meiklejohn', 'Peter Kairouz', 'Florian Hartmann', 'Borja Balle', 'Eugene Bagdasarian'], 'affiliations': ['Google', 'Google DeepMind', 'Google Research'], 'pdf_title_img': 'assets/pdf/title_img/2501.08970.jpg', 'data': {'categories': ['#data', '#ethics', '#architecture', '#security', '#inference'], 'emoji': '🔐', 'ru': {'title': 'Машинное обучение как доверенный посредник для безопасных вычислений', 'desc': 'Статья представляет новый подход к безопасным вычислениям с использованием машинного обучения - Trusted Capable Model Environments (TCME). TCME предлагается как альтернатива традиционным криптографическим методам для обеспечения конфиденциальности при взаимодействии с ненадежными сторонами. Авторы утверждают, что мощные модели машинного обучения могут выполнять роль доверенной третьей стороны, позволяя проводить безопасные вычисления для приложений, которые ранее были невозможны. В статье описываются возможные применения TCME и обсуждаются текущие ограничения и перспективы развития этого подхода.'}, 'en': {'title': 'Empowering Privacy with Trusted Machine Learning Models', 'desc': 'This paper introduces Trusted Capable Model Environments (TCMEs) as a novel solution for secure computations involving untrusted parties. It suggests that advanced machine learning models can act as trusted intermediaries, allowing for private data sharing while maintaining privacy. The authors highlight how TCMEs can efficiently manage input/output constraints and control information flow, making them suitable for applications where traditional cryptographic methods fall short. They also present various use cases and acknowledge the limitations of their approach, paving the way for future developments in secure machine learning applications.'}, 'zh': {'title': '利用机器学习实现安全计算的新方法', 'desc': '本文探讨了在与不可信方互动时如何平衡隐私和计算效率。我们提出了可信能力模型环境(TCME),作为一种新的安全计算方法,利用机器学习模型充当可信第三方。TCME在输入/输出约束下进行交互,并通过显式的信息流控制和无状态性来保护隐私。我们展示了TCME在解决一些经典密码学问题上的潜力,并讨论了未来的实施路径。'}}}, {'id': 'https://huggingface.co/papers/2501.04693', 'title': 'Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous Sensors via Language Grounding', 'url': 'https://huggingface.co/papers/2501.04693', 'abstract': 'Interacting with the world is a multi-sensory experience: achieving effective general-purpose interaction requires making use of all available modalities -- including vision, touch, and audio -- to fill in gaps from partial observation. 
For example, when vision is occluded reaching into a bag, a robot should rely on its senses of touch and sound. However, state-of-the-art generalist robot policies are typically trained on large datasets to predict robot actions solely from visual and proprioceptive observations. In this work, we propose FuSe, a novel approach that enables finetuning visuomotor generalist policies on heterogeneous sensor modalities for which large datasets are not readily available by leveraging natural language as a common cross-modal grounding. We combine a multimodal contrastive loss with a sensory-grounded language generation loss to encode high-level semantics. In the context of robot manipulation, we show that FuSe enables performing challenging tasks that require reasoning jointly over modalities such as vision, touch, and sound in a zero-shot setting, such as multimodal prompting, compositional cross-modal prompting, and descriptions of objects it interacts with. We show that the same recipe is applicable to widely different generalist policies, including both diffusion-based generalist policies and large vision-language-action (VLA) models. Extensive experiments in the real world show that FuSe is able to increase success rates by over 20% compared to all considered baselines.', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-01-08', 'pub_date_card': {'ru': '8 января', 'en': 'January 8', 'zh': '1月8日'}, 'hash': '1612a7343aff595b', 'authors': ['Joshua Jones', 'Oier Mees', 'Carmelo Sferrazza', 'Kyle Stachowicz', 'Pieter Abbeel', 'Sergey Levine'], 'affiliations': ['Berkeley AI Research (BAIR), UC Berkeley, USA'], 'pdf_title_img': 'assets/pdf/title_img/2501.04693.jpg', 'data': {'categories': ['#transfer_learning', '#multimodal', '#robotics', '#reasoning'], 'emoji': '🤖', 'ru': {'title': 'Мультисенсорный ИИ: объединение зрения, осязания и звука для улучшения взаимодействия роботов с миром', 'desc': 'Статья представляет FuSe - новый подход к обучению роботов, использующий мультимодальные сенсорные данные. FuSe использует естественный язык как общую основу для объединения различных модальностей, таких как зрение, осязание и звук. Метод сочетает мультимодальную контрастивную функцию потерь с генерацией языка на основе сенсорных данных для кодирования высокоуровневой семантики. Эксперименты показывают, что FuSe позволяет роботам выполнять сложные задачи, требующие рассуждений на основе нескольких модальностей, повышая успешность на 20% по сравнению с базовыми методами.'}, 'en': {'title': 'FuSe: Bridging Sensory Gaps for Smarter Robot Interaction', 'desc': 'This paper introduces FuSe, a method that enhances robot interaction by integrating multiple sensory modalities like vision, touch, and sound. Traditional robot policies often rely solely on visual data, but FuSe allows for fine-tuning these policies using natural language to bridge gaps in sensory information. By employing a multimodal contrastive loss and a sensory-grounded language generation loss, FuSe effectively encodes high-level semantics for better decision-making. 
The results demonstrate that FuSe significantly improves the success rates of robots in complex tasks, showcasing its versatility across different generalist policies.'}, 'zh': {'title': '多模态交互,提升机器人智能', 'desc': '本论文提出了一种名为FuSe的新方法,旨在通过多模态传感器数据来微调通用机器人策略。FuSe利用自然语言作为跨模态的共同基础,结合多模态对比损失和感知基础的语言生成损失,以编码高层语义。通过这种方法,机器人能够在视觉、触觉和听觉等多种感官信息的共同推理下,完成复杂的操作任务。实验结果表明,FuSe在实际应用中成功率提高了超过20%。'}}}, {'id': 'https://huggingface.co/papers/2412.19412', 'title': 'MINIMA: Modality Invariant Image Matching', 'url': 'https://huggingface.co/papers/2412.19412', 'abstract': 'Image matching for both cross-view and cross-modality plays a critical role in multimodal perception. In practice, the modality gap caused by different imaging systems/styles poses great challenges to the matching task. Existing works try to extract invariant features for specific modalities and train on limited datasets, showing poor generalization. In this paper, we present MINIMA, a unified image matching framework for multiple cross-modal cases. Without pursuing fancy modules, our MINIMA aims to enhance universal performance from the perspective of data scaling up. For such purpose, we propose a simple yet effective data engine that can freely produce a large dataset containing multiple modalities, rich scenarios, and accurate matching labels. Specifically, we scale up the modalities from cheap but rich RGB-only matching data, by means of generative models. Under this setting, the matching labels and rich diversity of the RGB dataset are well inherited by the generated multimodal data. Benefiting from this, we construct MD-syn, a new comprehensive dataset that fills the data gap for general multimodal image matching. With MD-syn, we can directly train any advanced matching pipeline on randomly selected modality pairs to obtain cross-modal ability. Extensive experiments on in-domain and zero-shot matching tasks, including 19 cross-modal cases, demonstrate that our MINIMA can significantly outperform the baselines and even surpass modality-specific methods. The dataset and code are available at https://github.com/LSXI7/MINIMA .', 'score': 0, 'issue_id': 1709, 'pub_date': '2025-12-27', 'pub_date_card': {'ru': '27 декабря', 'en': 'December 27', 'zh': '12月27日'}, 'hash': 'fa772dead5453f7b', 'authors': ['Xingyu Jiang', 'Jiangwei Ren', 'Zizhuo Li', 'Xin Zhou', 'Dingkang Liang', 'Xiang Bai'], 'affiliations': ['Huazhong University of Science and Technology', 'Wuhan University'], 'pdf_title_img': 'assets/pdf/title_img/2412.19412.jpg', 'data': {'categories': ['#dataset', '#data', '#multimodal', '#open_source', '#synthetic'], 'emoji': '🔀', 'ru': {'title': 'Универсальное сопоставление изображений через масштабирование данных', 'desc': 'Статья представляет MINIMA - универсальную систему сопоставления изображений для различных кросс-модальных случаев. Авторы предлагают эффективный механизм генерации большого набора данных с несколькими модальностями, разнообразными сценариями и точными метками сопоставления. Используя этот подход, они создают новый комплексный датасет MD-syn для обучения нейросетей кросс-модальному сопоставлению изображений. Эксперименты показывают, что MINIMA значительно превосходит базовые модели и даже специализированные методы для конкретных модальностей в 19 кросс-модальных задачах.'}, 'en': {'title': 'MINIMA: Bridging the Gap in Cross-Modal Image Matching', 'desc': 'This paper introduces MINIMA, a framework designed for image matching across different views and modalities, addressing the challenges posed by varying imaging systems. 
The authors highlight the limitations of existing methods that rely on invariant features and small datasets, which often lead to poor performance. MINIMA enhances image matching by scaling up data through a generative model that creates a large, diverse dataset with accurate matching labels. The new dataset, MD-syn, allows for effective training of matching algorithms, resulting in improved performance in both in-domain and zero-shot scenarios compared to traditional methods.'}, 'zh': {'title': 'MINIMA:跨模态图像匹配的新突破', 'desc': '本文提出了一种名为MINIMA的统一图像匹配框架,旨在解决跨视角和跨模态的图像匹配问题。现有方法在特定模态上提取不变特征,但在有限数据集上训练,导致泛化能力差。MINIMA通过一个简单有效的数据引擎,生成包含多种模态和丰富场景的大型数据集,从而提升通用性能。通过构建MD-syn数据集,MINIMA能够在随机选择的模态对上直接训练,显著提高跨模态匹配能力。'}}}]; const articlesContainer = document.getElementById('articles-container'); const sortDropdown = document.getElementById('sort-dropdown'); const categoryFiltersContainer = document.getElementById('category-filters'); @@ -1184,7 +1184,7 @@ function updateTimeDiffs() { const timeDiff = document.getElementById('timeDiff'); - timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 22:09',lang=currentLang); + timeDiff.innerHTML = '🔄 ' + getTimeDiff('2025-01-29 23:09',lang=currentLang); } function updateSortingOptions() { const sortingLabels = { @@ -1238,14 +1238,14 @@ } function hideNextLink(format) { if (format === 'monthly') { - if (isCurrentMonth('2025-01-29 22:09')) { + if (isCurrentMonth('2025-01-29 23:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none'; } } } else { - if (isToday('2025-01-29 22:09')) { + if (isToday('2025-01-29 23:09')) { const element = document.getElementById('nav-next'); if (element) { element.style.display = 'none';