Skip to content

Commit

Permalink
Merge branch 'dev' into configurable_spot_strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
Macr0Nerd authored Feb 7, 2024
2 parents 594ffc3 + 0159d2d commit 91f3802
Show file tree
Hide file tree
Showing 15 changed files with 287 additions and 91 deletions.
16 changes: 13 additions & 3 deletions docs/environmental_yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,18 @@ https://github.com/carsdotcom/cars-forge/blob/main/examples/env_yaml_example/exa
constraints: [2.3, 3.0, 3.1]
error: "Invalid Spark version. Only 2.3, 3.0, and 3.1 are supported."
```
- **aws_az** - The [AWS availability zone](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) where Forge will create the EC2 instance. Currently, Forge can run only in one AZ
- **aws_profile** - [AWS CLI profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use
- **aws_az** - The [AWS availability zone](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) where Forge will create the EC2 instance. If set, multi-az placement will be disabled.
- **aws_region** - The AWS region for Forge to run in
- **aws_profile** - [AWS CLI profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use
- **aws_security_group** - [AWS Security Group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-security-groups.html) for the instance
- **aws_subnet** - [AWS subnet](https://docs.aws.amazon.com/vpc/latest/userguide/configure-subnets.html) where the EC2s will run
- **aws_subnet** - [AWS subnet](https://docs.aws.amazon.com/vpc/latest/userguide/configure-subnets.html) where the EC2s will run
- **aws_multi_az** - A mapping of availability zone to the [AWS subnet](https://docs.aws.amazon.com/vpc/latest/userguide/configure-subnets.html) where the EC2s will run in that AZ
- E.g.
```yaml
aws_multi_az:
us-east-1a: subnet-aaaaaaaaaaaaaaaaa
us-east-1b: subnet-bbbbbbbbbbbbbbbbb
us-east-1c: subnet-ccccccccccccccccc
```
- **default_ratio** - Override the default ratio of RAM to CPU if the user does not provide one. Must be a list of the minimum and maximum.
- default is [8, 8]
- **ec2_amis** - A dictionary of dictionaries to store [AMI](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) info.
Expand Down Expand Up @@ -95,6 +103,8 @@ https://github.com/carsdotcom/cars-forge/blob/main/examples/env_yaml_example/exa
```
- **forge_env** - Name of the Forge environment. The user will refer to this in their yaml.
- **forge_pem_secret** - The secret name where the `ec2_key` is stored
- **on_demand_failover** - If using engine mode and all spot attempts (market: spot + spot retries) have failed, run a final attempt using on-demand.
- **spot_retries** - If using engine mode, sets the number of times to retry a spot instance. Only retries if either market is spot.
- **tags** - [Tags](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html) to apply to instances created by Forge. Follows the AWS tag format.
- Forge also exposes all string, numeric, and some extra variables from the combined user and environmental configs that will be replaced at runtime by the matching values (e.g. `{name}` for job name, `{date}` for job date, etc.) See the [variables](variables.md) page for more details.
- E.g.
Expand Down
2 changes: 2 additions & 0 deletions docs/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Each forge command certain parameters. A yaml file with all the parameters can b
```
- If running via the command line, a range of values is passed as: ``--market on-demand spot``.
- **name** - Name of the instance/cluster
- **on_demand_failover** - If using engine mode and all spot attempts (market: spot + spot retries) have failed, run a final attempt using on-demand.
- **ram** - Minimum amount of RAM required. Can be a range e.g. [16, 32].
- If using a cluster, you must specify both the master and worker. Master first, worker second.
```yaml
Expand Down Expand Up @@ -77,5 +78,6 @@ Each forge command certain parameters. A yaml file with all the parameters can b
- E.g. `run_cmd: scripts/run.sh {env} {date} {ip}`
- **service** - `cluster` or `single`
- **spot_strategy** - Select the [spot allocation strategy](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2/client/create_fleet.html).
- **spot_retries** - If using engine mode, sets the number of times to retry a spot instance. Only retries if either market is spot.
- **user_data** - Custom script passed to instance. Will be run only once when the instance starts up.
- **valid_time** - How many hours the fleet will stay up. After this time, all EC2s will be destroyed. The default is 8.
27 changes: 24 additions & 3 deletions src/forge/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from botocore.exceptions import ClientError, NoCredentialsError

from . import DEFAULT_ARG_VALS, ADDITIONAL_KEYS
from .exceptions import ExitHandlerException

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -117,7 +118,8 @@ def ec2_ip(n, config):
'instance_type': i.get('InstanceType'),
'state': i.get('State').get('Name'),
'launch_time': i.get('LaunchTime'),
'fleet_id': check_fleet_id(n, config)
'fleet_id': check_fleet_id(n, config),
'az': i.get('Placement')['AvailabilityZone']
}
details.append(x)
logger.debug('ec2_ip details is %s', details)
Expand Down Expand Up @@ -320,6 +322,14 @@ def normalize_config(config):
if config.get('aws_az'):
config['region'] = config['aws_az'][:-1]

if config.get('aws_subnet') and not config.get('aws_multi_az'):
config['aws_multi_az'] = {config.get('aws_az'): config.get('aws_subnet')}
elif config.get('aws_subnet') and config.get('aws_multi_az'):
logger.warning('Both aws_multi_az and aws_subnet exist, defaulting to aws_multi_az')

if config.get('aws_region'):
config['region'] = config['aws_region']

if not config.get('ram') and not config.get('cpu') and config.get('ratio'):
DEFAULT_ARG_VALS['default_ratio'] = config.pop('ratio')

Expand Down Expand Up @@ -492,8 +502,8 @@ def get_ec2_pricing(ec2_type, market, config):
float
Hourly price of given EC2 type in given market.
"""
region = config.get('region')
az = config.get('aws_az')
region = config['region']
az = config['aws_az']

if market == 'spot':
client = boto3.client('ec2')
Expand Down Expand Up @@ -529,3 +539,14 @@ def get_ec2_pricing(ec2_type, market, config):
price = float(price)

return price


def exit_callback(config, exit: bool = False):
    """Report a failure by raising to the handler or exiting the process.

    Parameters
    ----------
    config : dict
        Forge configuration. The keys read here are ``job``,
        ``spot_retries``, ``on_demand_failover`` and ``market_failover``.
    exit : bool
        If True and the error is not bubbled up, terminate the process
        with exit status 1. The name shadows the builtin but is kept
        because it is part of the existing interface.

    Raises
    ------
    ExitHandlerException
        When the job is ``engine`` and any of ``spot_retries``,
        ``on_demand_failover`` or ``market_failover`` is truthy.
    SystemExit
        When the exception above is not raised and ``exit`` is True.
    """
    failover_enabled = (
        config.get('spot_retries')
        or config.get('on_demand_failover')
        or config.get('market_failover')
    )

    # Use .get() for 'job': this runs on an error path, and a KeyError here
    # would hide the original failure and skip the requested exit.
    if config.get('job') == 'engine' and failover_enabled:
        logger.error('Error occurred, bubbling up error to handler.')
        raise ExitHandlerException

    if exit:
        sys.exit(1)
12 changes: 8 additions & 4 deletions src/forge/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys

import yaml
from schema import Schema, And, Optional, Or, SchemaError
from schema import Schema, And, Optional, Or, SchemaError, Use

from .common import set_config_dir

Expand Down Expand Up @@ -50,11 +50,13 @@ def check_env_yaml(env_yaml):
"""
schema = Schema({
'forge_env': And(str, len, error='Invalid Environment Name'),
'aws_az': And(str, len, error='Invalid AWS availability zone'),
Optional('aws_region'): And(str, len, error='Invalid AWS region'),
Optional('aws_az'): And(str, len, error='Invalid AWS availability zone'),
Optional('aws_subnet'): And(str, len, error='Invalid AWS Subnet'),
'ec2_amis': And(dict, len, error='Invalid AMI Dictionary'),
'aws_subnet': And(str, len, error='Invalid AWS Subnet'),
Optional('aws_multi_az'): And(dict, len, error='Invalid AWS Subnet'),
'ec2_key': And(str, len, error='Invalid AWS key'),
'aws_security_group': And(str, len, error='Invalid AWS Security Group'),
Optional('aws_security_group'): And(str, len, error='Invalid AWS Security Group'),
'forge_pem_secret': And(str, len, error='Invalid Name of Secret'),
Optional('aws_profile'): And(str, len, error='Invalid AWS profile'),
Optional('ratio'): And(list, len, error='Invalid default ratio'),
Expand All @@ -71,6 +73,8 @@ def check_env_yaml(env_yaml):
'capacity-optimized-prioritized',
'price-capacity-optimized'),
error='Invalid spot allocation strategy'),
Optional('on_demand_failover'): And(bool),
Optional('spot_retries'): And(Use(int), lambda x: x > 0),
})
try:
validated = schema.validate(env_yaml)
Expand Down
Loading

0 comments on commit 91f3802

Please sign in to comment.