Apply code formatting
openhands-agent committed Feb 13, 2025
1 parent d4685ba commit 8f3e5c6
Showing 4 changed files with 138 additions and 138 deletions.
55 changes: 22 additions & 33 deletions examples/data_preprocess/multiturn.py
@@ -18,39 +18,30 @@ def process_conversation(example, idx, split, tokenizer, max_tokens=32000):
"""Convert a conversation into the expected format"""
messages = []
total_tokens = 0

# Add system message
system_msg = {"role": "system", "content": "You are a helpful assistant that can understand and generate code."}
total_tokens += count_tokens(system_msg["content"], tokenizer)
messages.append(system_msg)

# Process each turn
for i in range(len(example['human'])):
# Add human message
human_msg = {"role": "user", "content": example['human'][i]}
human_tokens = count_tokens(human_msg["content"], tokenizer)

# Add assistant message
assistant_msg = {"role": "assistant", "content": example['assistant'][i]}
assistant_tokens = count_tokens(assistant_msg["content"], tokenizer)

# Check if adding these messages would exceed token limit
if total_tokens + human_tokens + assistant_tokens > max_tokens:
break

total_tokens += human_tokens + assistant_tokens
messages.append(human_msg)
messages.append(assistant_msg)

# Only return if we have at least one complete turn
if len(messages) >= 3: # system + at least one human-assistant pair
return {
@@ -71,49 +62,47 @@ def process_conversation(example, idx, split, tokenizer, max_tokens=32000):
parser.add_argument('--local_dir', default='~/data/multiturn')
parser.add_argument('--hdfs_dir', default=None)
parser.add_argument('--max_tokens', type=int, default=32000)

args = parser.parse_args()

# Load tokenizer for token counting
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

# Load OpenHands dataset
dataset = datasets.load_dataset('SWE-Gym/OpenHands-SFT-Trajectories')

# Split into train/test (90/10 split)
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Process the datasets
train_dataset = train_dataset.map(
function=lambda x, i: process_conversation(x, i, 'train', tokenizer, args.max_tokens),
with_indices=True,
remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(
function=lambda x, i: process_conversation(x, i, 'test', tokenizer, args.max_tokens),
with_indices=True,
remove_columns=test_dataset.column_names)

# Filter out None values (conversations that were too long)
train_dataset = train_dataset.filter(lambda x: x is not None)
test_dataset = test_dataset.filter(lambda x: x is not None)

# Create output directory
local_dir = os.path.expanduser(args.local_dir)
os.makedirs(local_dir, exist_ok=True)

# Save to parquet files
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

if args.hdfs_dir is not None:
makedirs(args.hdfs_dir)
copy(src=local_dir, dst=args.hdfs_dir)

# Print statistics
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(f"Data saved to {local_dir}")
print(f"Data saved to {local_dir}")
108 changes: 60 additions & 48 deletions tests/verl/utils/dataset/test_multiturn_sft_dataset.py
@@ -12,158 +12,170 @@ def test_multiturn_sft_dataset():
print("Starting test...")
# Create a temporary parquet file with test data
test_data = {
'messages': [[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role": "assistant",
"content": "2+2 equals 4."
}, {
"role": "user",
"content": "And what is 4+4?"
}, {
"role": "assistant",
"content": "4+4 equals 8."
}],
[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Tell me a joke."
}, {
"role": "assistant",
"content": "Why did the chicken cross the road?"
}, {
"role": "user",
"content": "Why?"
}, {
"role": "assistant",
"content": "To get to the other side!"
}]]
}

# Create test directory if it doesn't exist
os.makedirs('test_data', exist_ok=True)
test_file = 'test_data/test.parquet'

# Save test data to parquet
df = pd.DataFrame(test_data)
df.to_parquet(test_file)

# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-7B-Instruct')
dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, max_length=512)

# Test 1: Dataset Length
assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"

# Get items for testing
item0 = dataset[0] # Math conversation
item1 = dataset[1] # Joke conversation

# Test 2: Required Keys and Types
required_keys = ['input_ids', 'attention_mask', 'position_ids', 'loss_mask']
for key in required_keys:
assert key in item0, f"Missing key {key} in dataset item"
assert isinstance(item0[key], torch.Tensor), f"Expected torch.Tensor for {key}"
assert item0[key].dtype == torch.long, f"Expected torch.long for {key}, got {item0[key].dtype}"

# Test 3: Shape Consistency
assert item0['loss_mask'].shape == item0['input_ids'].shape, \
"Loss mask shape doesn't match input_ids shape"
assert item0['attention_mask'].shape == item0['input_ids'].shape, \
"Attention mask shape doesn't match input_ids shape"
assert item0['position_ids'].shape == item0['input_ids'].shape, \
"Position IDs shape doesn't match input_ids shape"

# Test 4: Loss Mask Pattern - Math Conversation
loss_mask0 = item0['loss_mask']
input_ids0 = item0['input_ids']

# Find assistant response positions
assistant_positions0 = torch.where(loss_mask0 == 1)[0]
assert len(assistant_positions0) > 0, "No assistant positions found in loss mask"

# Decode and verify assistant responses
assistant_text0 = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"Math conversation assistant text: {assistant_text0}")
assert "2+2 equals 4" in assistant_text0, "First assistant response not found"
assert "4+4 equals 8" in assistant_text0, "Second assistant response not found"

# Test 5: Loss Mask Pattern - Joke Conversation
loss_mask1 = item1['loss_mask']
input_ids1 = item1['input_ids']

# Find assistant response positions
assistant_positions1 = torch.where(loss_mask1 == 1)[0]
assert len(assistant_positions1) > 0, "No assistant positions found in loss mask"

# Decode and verify assistant responses
assistant_text1 = tokenizer.decode(input_ids1[loss_mask1 == 1])
print(f"Joke conversation assistant text: {assistant_text1}")
assert "chicken cross the road" in assistant_text1, "First assistant response not found"
assert "other side" in assistant_text1, "Second assistant response not found"

# Test 6: Attention Mask Pattern
attention_mask0 = item0['attention_mask']
sequence_length = torch.sum(attention_mask0)
assert sequence_length > 0, "No tokens marked as attended in attention mask"
assert torch.all(attention_mask0[:sequence_length] == 1), "Incorrect attention mask pattern"
if sequence_length < len(attention_mask0):
assert torch.all(attention_mask0[sequence_length:] == 0), "Padding not properly masked"

# Test 7: Position IDs Pattern
position_ids0 = item0['position_ids']
assert torch.equal(position_ids0[:sequence_length], torch.arange(sequence_length)), \
"Position IDs not sequential for non-padded tokens"
if sequence_length < len(position_ids0):
assert torch.all(position_ids0[sequence_length:] == 0), "Padding position IDs not zero"

# Test 8: Verify loss mask for assistant responses
# Get the full conversation text
full_text = tokenizer.decode(input_ids0)
print(f"\nFull conversation text:\n{full_text}")

# Get the assistant responses
assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"\nAssistant responses (from loss mask):\n{assistant_text}")

# Verify that loss mask is set for all assistant responses
for msg in test_data['messages'][0]: # First conversation
if msg['role'] == 'assistant':
# The content should appear in the masked text
assert msg['content'] in assistant_text, \
f"Assistant message '{msg['content']}' not found in masked text"

# The content should NOT appear in the non-masked text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
assert msg['content'] not in non_assistant_text, \
f"Assistant message '{msg['content']}' found in non-assistant text"

# Test 9: Verify non-assistant parts have loss_mask=0
# Get non-assistant text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
print(f"\nNon-assistant text (from loss mask):\n{non_assistant_text}")

# Verify that system and user messages are in the non-assistant text
for msg in test_data['messages'][0]: # First conversation
if msg['role'] in ['system', 'user']:
assert msg['content'] in non_assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' not found in non-assistant text"

# And verify they're NOT in the assistant text
assert msg['content'] not in assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' found in assistant text"

# Test 10: Verify padding behavior
small_dataset = MultiTurnSFTDataset(
parquet_files=test_file,
tokenizer=tokenizer,
max_length=1024 # Larger than needed to test padding
)
padded_item = small_dataset[0]

# Get actual sequence length (before padding)
actual_length = torch.sum(padded_item['attention_mask'])

# Verify padding tokens
assert torch.all(padded_item['input_ids'][actual_length:] == tokenizer.pad_token_id), \
"Padding tokens not set correctly"
assert torch.all(padded_item['attention_mask'][actual_length:] == 0), \
"Attention mask not set correctly for padding"
assert torch.all(padded_item['loss_mask'][actual_length:] == 0), \
"Loss mask not set correctly for padding"
print("All tests passed!")

print("All tests passed!")