Skip to content

Commit

Permalink
Merge pull request #40 from carsdotcom/configurable_spot_strategy
Browse files Browse the repository at this point in the history
Add a configurable spot strategy
  • Loading branch information
Macr0Nerd authored Feb 7, 2024
2 parents 0159d2d + 91f3802 commit 98a738c
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 10 deletions.
1 change: 1 addition & 0 deletions docs/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Each forge command requires certain parameters. A yaml file with all the parameters can b
- Use the `--all` flag to run the script on all the instances in a cluster.
- E.g. `run_cmd: scripts/run.sh {env} {date} {ip}`
- **service** - `cluster` or `single`
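- **spot_strategy** - Select the [spot allocation strategy](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2/client/create_fleet.html). Must be one of `lowest-price`, `diversified`, `capacity-optimized`, `capacity-optimized-prioritized`, or `price-capacity-optimized`. The default is `price-capacity-optimized`.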
- **spot_retries** - If using engine mode, sets the number of times to retry a spot instance. Only retries if either market is spot.
- **user_data** - Custom script passed to instance. Will be run only once when the instance starts up.
- **valid_time** - How many hours the fleet will stay up. After this time, all EC2s will be destroyed. The default is 8.
3 changes: 2 additions & 1 deletion src/forge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
'destroy_after_failure': True,
'default_ratio': [8, 8],
'valid_time': 8,
'ec2_max': 768
'ec2_max': 768,
'spot_strategy': 'price-capacity-optimized'
}

# Required arguments for each Forge job
Expand Down
10 changes: 9 additions & 1 deletion src/forge/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys

import yaml
from schema import Schema, And, Optional, SchemaError, Use
from schema import Schema, And, Optional, Or, SchemaError, Use

from .common import set_config_dir

Expand Down Expand Up @@ -65,6 +65,14 @@ def check_env_yaml(env_yaml):
Optional('excluded_ec2s'): And(list),
Optional('additional_config'): And(list),
Optional('ec2_max'): And(int),
Optional('spot_strategy'): And(str, len,
Or(
'lowest-price',
'diversified',
'capacity-optimized',
'capacity-optimized-prioritized',
'price-capacity-optimized'),
error='Invalid spot allocation strategy'),
Optional('on_demand_failover'): And(bool),
Optional('spot_retries'): And(Use(int), lambda x: x > 0),
})
Expand Down
3 changes: 2 additions & 1 deletion src/forge/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,7 @@ def create_fleet(n, config, task, instance_details):

gpu = config.get('gpu_flag', False)
market = config.get('market', DEFAULT_ARG_VALS['market'])
strategy = config.get('spot_strategy')

market = market[-1] if 'cluster-worker' in n else market[0]

Expand All @@ -582,7 +583,7 @@ def create_fleet(n, config, task, instance_details):
'AllocationStrategy': 'lowest-price'
},
'SpotOptions': {
'AllocationStrategy': 'capacity-optimized',
'AllocationStrategy': strategy,
'InstanceInterruptionBehavior': 'terminate',
'MaintenanceStrategies': {
'CapacityRebalance': {
Expand Down
15 changes: 8 additions & 7 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _load_admin_cfg(env):
'home_dir': os.path.dirname(FORGE_DIR), 'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user',
'ami': 'single_ami', 'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'),
'region': 'us-east-1', 'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8],
'valid_time': 8, 'ec2_max': 768}),
'valid_time': 8, 'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
# Destroy job; passing the relative path to a yaml
(['forge', 'destroy', '--yaml', os.path.join(TEST_DIR_REL, 'data', 'single_intermediate.yaml')],
{'name': 'test-single-intermediate', 'log_level': 'INFO',
Expand All @@ -53,7 +53,7 @@ def _load_admin_cfg(env):
'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user', 'ami': 'single_ami',
'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'), 'region': 'us-east-1',
'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8], 'valid_time': 8,
'ec2_max': 768}),
'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
# Destroy job; overriding log_level
(['forge', 'destroy', '--yaml', os.path.join(TEST_DIR, 'data', 'single_intermediate.yaml'), '--log_level', 'debug'],
{'name': 'test-single-intermediate', 'log_level': 'DEBUG',
Expand All @@ -63,7 +63,7 @@ def _load_admin_cfg(env):
'home_dir': os.path.dirname(FORGE_DIR), 'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user',
'ami': 'single_ami', 'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'),
'region': 'us-east-1', 'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8],
'valid_time': 8, 'ec2_max': 768}),
'valid_time': 8, 'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
# Destroy job; overriding market
(['forge', 'destroy', '--yaml', os.path.join(TEST_DIR, 'data', 'single_intermediate.yaml'),
'--market', 'on-demand'],
Expand All @@ -73,7 +73,8 @@ def _load_admin_cfg(env):
'gpu_flag': False, 'app_dir': TEST_DIR, 'src_dir': FORGE_DIR, 'home_dir': os.path.dirname(FORGE_DIR),
'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user', 'ami': 'single_ami', 'forge_version': False,
'config_dir': os.path.join(TEST_CFG_DIR, 'dev'), 'region': 'us-east-1', 'destroy_after_success': True,
'destroy_after_failure': True, 'default_ratio': [8, 8], 'valid_time': 8, 'ec2_max': 768}),
'destroy_after_failure': True, 'default_ratio': [8, 8], 'valid_time': 8, 'ec2_max': 768,
'spot_strategy': 'price-capacity-optimized'}),
# Destroy job; no market
(['forge', 'destroy', '--yaml', os.path.join(TEST_DIR, 'data', 'single_basic.yaml'), '--forge_env', 'dev'],
{'name': 'test-single-basic', 'log_level': 'INFO', 'yaml': os.path.join(TEST_DIR, 'data', 'single_basic.yaml'),
Expand All @@ -82,7 +83,7 @@ def _load_admin_cfg(env):
'home_dir': os.path.dirname(FORGE_DIR), 'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user',
'ami': 'single_ami', 'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'),
'region': 'us-east-1', 'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8],
'valid_time': 8, 'ec2_max': 768}),
'valid_time': 8, 'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
# Configure job
(['forge', 'configure'],
{'forge_version': False, 'job': 'configure', 'log_level': 'INFO'}),
Expand All @@ -95,7 +96,7 @@ def _load_admin_cfg(env):
'home_dir': os.path.dirname(FORGE_DIR), 'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user',
'ami': 'single_ami', 'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'),
'region': 'us-east-1', 'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8],
'valid_time': 8, 'ec2_max': 768}),
'valid_time': 8, 'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
# Create job; setting gpu
(['forge', 'create', '--yaml', os.path.join(TEST_DIR, 'data', 'single_basic.yaml'), '--forge_env', 'dev', '--gpu'],
{'name': 'test-single-basic', 'log_level': 'INFO', 'yaml': os.path.join(TEST_DIR, 'data', 'single_basic.yaml'),
Expand All @@ -104,7 +105,7 @@ def _load_admin_cfg(env):
'home_dir': os.path.dirname(FORGE_DIR), 'yaml_dir': os.path.join(TEST_DIR, 'data'), 'user': 'test_user',
'ami': 'single_ami', 'forge_version': False, 'config_dir': os.path.join(TEST_CFG_DIR, 'dev'),
'region': 'us-east-1', 'destroy_after_success': True, 'destroy_after_failure': True, 'default_ratio': [8, 8],
'valid_time': 8, 'ec2_max': 768}),
'valid_time': 8, 'ec2_max': 768, 'spot_strategy': 'price-capacity-optimized'}),
])
def test_forge_main(mock_pass, mock_execute, mock_keys, mock_config_dir, cli_call, exp_config, load_admin_cfg):
"""Test the config after calling forge via the command line."""
Expand Down

0 comments on commit 98a738c

Please sign in to comment.