Apply code formatting
openhands-agent committed Feb 13, 2025
1 parent d4685ba commit 8f3e5c6
Showing 4 changed files with 138 additions and 138 deletions.
55 changes: 22 additions & 33 deletions examples/data_preprocess/multiturn.py
@@ -18,39 +18,30 @@ def process_conversation(example, idx, split, tokenizer, max_tokens=32000):
"""Convert a conversation into the expected format"""
messages = []
total_tokens = 0

# Add system message
system_msg = {"role": "system", "content": "You are a helpful assistant that can understand and generate code."}
total_tokens += count_tokens(system_msg["content"], tokenizer)
messages.append(system_msg)

# Process each turn
for i in range(len(example['human'])):
# Add human message
human_msg = {"role": "user", "content": example['human'][i]}
human_tokens = count_tokens(human_msg["content"], tokenizer)

# Add assistant message
assistant_msg = {"role": "assistant", "content": example['assistant'][i]}
assistant_tokens = count_tokens(assistant_msg["content"], tokenizer)

# Check if adding these messages would exceed token limit
if total_tokens + human_tokens + assistant_tokens > max_tokens:
break

total_tokens += human_tokens + assistant_tokens
messages.append(human_msg)
messages.append(assistant_msg)

# Only return if we have at least one complete turn
if len(messages) >= 3: # system + at least one human-assistant pair
return {
@@ -71,49 +62,47 @@ def process_conversation(example, idx, split, tokenizer, max_tokens=32000):
parser.add_argument('--local_dir', default='~/data/multiturn')
parser.add_argument('--hdfs_dir', default=None)
parser.add_argument('--max_tokens', type=int, default=32000)

args = parser.parse_args()

# Load tokenizer for token counting
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

# Load OpenHands dataset
dataset = datasets.load_dataset('SWE-Gym/OpenHands-SFT-Trajectories')

# Split into train/test (90/10 split)
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Process the datasets
train_dataset = train_dataset.map(
function=lambda x, i: process_conversation(x, i, 'train', tokenizer, args.max_tokens),
with_indices=True,
remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(
function=lambda x, i: process_conversation(x, i, 'test', tokenizer, args.max_tokens),
with_indices=True,
remove_columns=test_dataset.column_names)

# Filter out None values (conversations that were too long)
train_dataset = train_dataset.filter(lambda x: x is not None)
test_dataset = test_dataset.filter(lambda x: x is not None)

# Create output directory
local_dir = os.path.expanduser(args.local_dir)
os.makedirs(local_dir, exist_ok=True)

# Save to parquet files
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

if args.hdfs_dir is not None:
makedirs(args.hdfs_dir)
copy(src=local_dir, dst=args.hdfs_dir)

# Print statistics
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(f"Data saved to {local_dir}")
print(f"Data saved to {local_dir}")
108 changes: 60 additions & 48 deletions tests/verl/utils/dataset/test_multiturn_sft_dataset.py
@@ -12,158 +12,170 @@ def test_multiturn_sft_dataset():
print("Starting test...")
# Create a temporary parquet file with test data
test_data = {
'messages': [[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role": "assistant",
"content": "2+2 equals 4."
}, {
"role": "user",
"content": "And what is 4+4?"
}, {
"role": "assistant",
"content": "4+4 equals 8."
}],
[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Tell me a joke."
}, {
"role": "assistant",
"content": "Why did the chicken cross the road?"
}, {
"role": "user",
"content": "Why?"
}, {
"role": "assistant",
"content": "To get to the other side!"
}]]
}

# Create test directory if it doesn't exist
os.makedirs('test_data', exist_ok=True)
test_file = 'test_data/test.parquet'

# Save test data to parquet
df = pd.DataFrame(test_data)
df.to_parquet(test_file)

# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-7B-Instruct')
dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, max_length=512)

# Test 1: Dataset Length
assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"

# Get items for testing
item0 = dataset[0] # Math conversation
item1 = dataset[1] # Joke conversation

# Test 2: Required Keys and Types
required_keys = ['input_ids', 'attention_mask', 'position_ids', 'loss_mask']
for key in required_keys:
assert key in item0, f"Missing key {key} in dataset item"
assert isinstance(item0[key], torch.Tensor), f"Expected torch.Tensor for {key}"
assert item0[key].dtype == torch.long, f"Expected torch.long for {key}, got {item0[key].dtype}"

# Test 3: Shape Consistency
assert item0['loss_mask'].shape == item0['input_ids'].shape, \
"Loss mask shape doesn't match input_ids shape"
assert item0['attention_mask'].shape == item0['input_ids'].shape, \
"Attention mask shape doesn't match input_ids shape"
assert item0['position_ids'].shape == item0['input_ids'].shape, \
"Position IDs shape doesn't match input_ids shape"

# Test 4: Loss Mask Pattern - Math Conversation
loss_mask0 = item0['loss_mask']
input_ids0 = item0['input_ids']

# Find assistant response positions
assistant_positions0 = torch.where(loss_mask0 == 1)[0]
assert len(assistant_positions0) > 0, "No assistant positions found in loss mask"

# Decode and verify assistant responses
assistant_text0 = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"Math conversation assistant text: {assistant_text0}")
assert "2+2 equals 4" in assistant_text0, "First assistant response not found"
assert "4+4 equals 8" in assistant_text0, "Second assistant response not found"

# Test 5: Loss Mask Pattern - Joke Conversation
loss_mask1 = item1['loss_mask']
input_ids1 = item1['input_ids']

# Find assistant response positions
assistant_positions1 = torch.where(loss_mask1 == 1)[0]
assert len(assistant_positions1) > 0, "No assistant positions found in loss mask"

# Decode and verify assistant responses
assistant_text1 = tokenizer.decode(input_ids1[loss_mask1 == 1])
print(f"Joke conversation assistant text: {assistant_text1}")
assert "chicken cross the road" in assistant_text1, "First assistant response not found"
assert "other side" in assistant_text1, "Second assistant response not found"

# Test 6: Attention Mask Pattern
attention_mask0 = item0['attention_mask']
sequence_length = torch.sum(attention_mask0)
assert sequence_length > 0, "No tokens marked as attended in attention mask"
assert torch.all(attention_mask0[:sequence_length] == 1), "Incorrect attention mask pattern"
if sequence_length < len(attention_mask0):
assert torch.all(attention_mask0[sequence_length:] == 0), "Padding not properly masked"

# Test 7: Position IDs Pattern
position_ids0 = item0['position_ids']
assert torch.equal(position_ids0[:sequence_length], torch.arange(sequence_length)), \
"Position IDs not sequential for non-padded tokens"
if sequence_length < len(position_ids0):
assert torch.all(position_ids0[sequence_length:] == 0), "Padding position IDs not zero"

# Test 8: Verify loss mask for assistant responses
# Get the full conversation text
full_text = tokenizer.decode(input_ids0)
print(f"\nFull conversation text:\n{full_text}")

# Get the assistant responses
assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"\nAssistant responses (from loss mask):\n{assistant_text}")

# Verify that loss mask is set for all assistant responses
for msg in test_data['messages'][0]: # First conversation
if msg['role'] == 'assistant':
# The content should appear in the masked text
assert msg['content'] in assistant_text, \
f"Assistant message '{msg['content']}' not found in masked text"

# The content should NOT appear in the non-masked text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
assert msg['content'] not in non_assistant_text, \
f"Assistant message '{msg['content']}' found in non-assistant text"

# Test 9: Verify non-assistant parts have loss_mask=0
# Get non-assistant text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
print(f"\nNon-assistant text (from loss mask):\n{non_assistant_text}")

# Verify that system and user messages are in the non-assistant text
for msg in test_data['messages'][0]: # First conversation
if msg['role'] in ['system', 'user']:
assert msg['content'] in non_assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' not found in non-assistant text"

# And verify they're NOT in the assistant text
assert msg['content'] not in assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' found in assistant text"

# Test 10: Verify padding behavior
small_dataset = MultiTurnSFTDataset(
parquet_files=test_file,
tokenizer=tokenizer,
max_length=1024 # Larger than needed to test padding
)
padded_item = small_dataset[0]

# Get actual sequence length (before padding)
actual_length = torch.sum(padded_item['attention_mask'])

# Verify padding tokens
assert torch.all(padded_item['input_ids'][actual_length:] == tokenizer.pad_token_id), \
"Padding tokens not set correctly"
assert torch.all(padded_item['attention_mask'][actual_length:] == 0), \
"Attention mask not set correctly for padding"
assert torch.all(padded_item['loss_mask'][actual_length:] == 0), \
"Loss mask not set correctly for padding"
print("All tests passed!")

print("All tests passed!")