#253 - Ingest AWS tags for multiple resource types using resourcegrou…

…pstaggingapi (#276) * Initial commit * Adjust indices * Use `id` as primary identifier in S3Buckets and EC2Instances for consistency * Add EC2 instance integration tests * Update schema * Add S3 bucket integration tests, update schema * Fix schema * Linter happy * Incremental commit * Implement feature with resource type mappings * Add EC2 NICs, SGs, subnets, VPCs. Update tests. * Schema docs and update Readme with supported datatypes * Typo * Add missing cleanup jobs, test data
cartography-cncf · Apr 8, 2020 · d515f6c · d515f6c
1 parent 5860753
commit d515f6c
Show file tree

Hide file tree

Showing 9 changed files with 426 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -33,8 +33,8 @@ You can learn more about the story behind Cartography in our [presentation at BS
 Start [here](docs/setup/install.md).
 
 ## Supported platforms
-- [Amazon Web Services](docs/setup/config/aws.md) -  EC2, Elasticsearch, DynamoDB, IAM,  RDS, Route53, S3, STS
-- [Google Cloud Platform](docs/setup/config/gcp.md) - Cloud Resource Manager, Compute, Storage
+- [Amazon Web Services](docs/setup/config/aws.md) -  EC2, Elasticsearch, Elastic Kubernetes Service, DynamoDB, IAM,  RDS, Route53, S3, STS, Tags
+- [Google Cloud Platform](docs/setup/config/gcp.md) - Cloud Resource Manager, Compute, Storage, Google Kubernetes Engine
 - [Google GSuite](docs/setup/config/gsuite.md) - users, groups
 - [Duo CRXcavator](docs/setup/config/crxcavator.md) - Chrome extensions, GSuite users
 - [Okta](docs/setup/config/okta.md) - users, groups, organizations, roles, applications, factors, trusted origins, reply URIs

diff --git a/cartography/data/indexes.cypher b/cartography/data/indexes.cypher
@@ -9,6 +9,7 @@ CREATE INDEX ON :AWSIpv6CidrBlock(id);
 CREATE INDEX ON :AWSPolicy(arn);
 CREATE INDEX ON :AWSPrincipal(arn);
 CREATE INDEX ON :AWSRole(arn);
+CREATE INDEX ON :AWSTag(id);
 CREATE INDEX ON :AWSUser(arn);
 CREATE INDEX ON :AWSUser(name);
 CREATE INDEX ON :AWSVpc(id);

diff --git a/cartography/data/jobs/cleanup/aws_import_tags_cleanup.json b/cartography/data/jobs/cleanup/aws_import_tags_cleanup.json
@@ -0,0 +1,95 @@
+{
+    "statements": [
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:EC2Instance) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:EC2Instance) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:NetworkInterface) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:NetworkInterface) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:EC2SecurityGroup) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:EC2SecurityGroup) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:EC2Subnet) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:EC2Subnet) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:AWSVpc) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:AWSVpc) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:ESDomain) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:ESDomain) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:RDSInstance) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:RDSInstance) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:DBSubnetGroup) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:DBSubnetGroup) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (n:AWSTag)<-[:TAGGED]-(:S3Bucket) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        },
+        {
+            "query": "MATCH (:AWSTag)<-[r:TAGGED]-(:S3Bucket) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
+            "iterative": true,
+            "iterationsize": 100
+        }
+    ],
+    "name": "cleanup AWS Tags"
+}
diff --git a/cartography/intel/aws/__init__.py b/cartography/intel/aws/__init__.py
@@ -10,6 +10,7 @@
 from . import iam
 from . import organizations
 from . import rds
+from . import resourcegroupstaggingapi
 from . import route53
 from . import s3
 from cartography.util import run_analysis_job
@@ -47,6 +48,9 @@ def _sync_one_account(neo4j_session, boto3_session, account_id, sync_tag, common
     # NOTE clean up all DNS records, regardless of which job created them
     run_cleanup_job('aws_account_dns_cleanup.json', neo4j_session, common_job_parameters)
 
+    # AWS Tags - Must always be last: tag ingestion only attaches tags to resource nodes that already exist in the graph.
+    resourcegroupstaggingapi.sync(neo4j_session, boto3_session, regions, sync_tag, common_job_parameters)
+
 
 def _sync_multiple_accounts(neo4j_session, accounts, sync_tag, common_job_parameters):
     logger.debug("Syncing AWS accounts: %s", ', '.join(accounts.values()))

diff --git a/cartography/intel/aws/resourcegroupstaggingapi.py b/cartography/intel/aws/resourcegroupstaggingapi.py
@@ -0,0 +1,119 @@
+import logging
+from string import Template
+
+from cartography.util import run_cleanup_job
+
+logger = logging.getLogger(__name__)
+
+
+def get_short_id_from_ec2_arn(arn):
+    """
+    Extract the short-form resource ID from an EC2 ARN, i.e. everything after the last '/'.
+    For example, "arn:aws:ec2:us-east-1:test_account:instance/i-1337" yields 'i-1337'.
+    An ARN that contains no '/' is returned unchanged.
+    :param arn: The EC2 resource's full ARN
+    :return: The short-form resource ID
+    """
+    _, _, short_id = arn.rpartition('/')
+    return short_id
+
+
+def get_bucket_name_from_arn(bucket_arn):
+    """
+    Extract the bucket name from an S3 bucket ARN, i.e. everything after the last ':'.
+    For example, "arn:aws:s3:::bucket_name" yields 'bucket_name'.
+    A string that contains no ':' is returned unchanged.
+    :param bucket_arn: The S3 bucket's full ARN
+    :return: The S3 bucket's name
+    """
+    return bucket_arn.rsplit(':', 1)[-1]
+
+
+# We maintain a mapping from AWS resource types to their associated labels and unique identifiers.
+# The keys are the resource type strings passed to the resourcegroupstaggingapi as ResourceTypeFilters.
+# label: the node label used in cartography for this resource type
+# property: the field of this node that uniquely identifies this resource type
+# id_func: [optional] - EC2 resources and S3 buckets in cartography currently use non-ARNs as their primary
+# identifiers, so we need to supply a function to translate the ARN returned by the resourcegroupstaggingapi to the
+# form that cartography uses. When id_func is absent, the full ARN is used as the identifier.
+# TODO - we should make EC2 and S3 assets query-able by their full ARN so that we don't need this workaround.
+TAG_RESOURCE_TYPE_MAPPINGS = {
+    'ec2:instance': {'label': 'EC2Instance', 'property': 'id', 'id_func': get_short_id_from_ec2_arn},
+    'ec2:network-interface': {'label': 'NetworkInterface', 'property': 'id', 'id_func': get_short_id_from_ec2_arn},
+    'ec2:security-group': {'label': 'EC2SecurityGroup', 'property': 'id', 'id_func': get_short_id_from_ec2_arn},
+    'ec2:subnet': {'label': 'EC2Subnet', 'property': 'subnetid', 'id_func': get_short_id_from_ec2_arn},
+    'ec2:vpc': {'label': 'AWSVpc', 'property': 'id', 'id_func': get_short_id_from_ec2_arn},
+    'es:domain': {'label': 'ESDomain', 'property': 'id'},
+    'rds:db': {'label': 'RDSInstance', 'property': 'id'},
+    'rds:subgrp': {'label': 'DBSubnetGroup', 'property': 'id'},
+    # Buckets are the only objects in the S3 service: https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-arn-format.html
+    's3': {'label': 'S3Bucket', 'property': 'id', 'id_func': get_bucket_name_from_arn},
+}
+
+
+def get_tags(boto3_session, resource_types, region):
+    """
+    Fetch every resource-to-tags mapping for the given resource types in one region.
+    :param boto3_session: The boto3 session used to create the resourcegroupstaggingapi client
+    :param resource_types: List of resource type strings passed as ResourceTypeFilters, so that only tags for
+    resources that Cartography supports are ingested. This is just a starting list; there may be others supported
+    by this API.
+    :param region: The AWS region to query
+    :return: A flat list of the 'ResourceTagMappingList' entries from all result pages
+    """
+    tagging_client = boto3_session.client('resourcegroupstaggingapi', region_name=region)
+    pages = tagging_client.get_paginator('get_resources').paginate(ResourceTypeFilters=resource_types)
+    return [mapping for page in pages for mapping in page['ResourceTagMappingList']]
+
+
+def load_tags(neo4j_session, tag_data, resource_type, region, aws_update_tag):
+    """
+    Write AWSTag nodes to the graph and attach them to their already-ingested resource nodes.
+    One query is run per (resource, tag) pair. The query MATCHes the resource node, so tags for resources that are
+    not present in the graph are not written.
+    :param neo4j_session: The Neo4j session used to run the ingestion query
+    :param tag_data: List of resource tag mappings; each entry must carry a 'resource_id' field (as added by
+    transform_tags) and a 'Tags' list of {'Key': ..., 'Value': ...} dicts
+    :param resource_type: A key of TAG_RESOURCE_TYPE_MAPPINGS selecting the node label and identifier property
+    :param region: The AWS region the tags were retrieved from
+    :param aws_update_tag: The update tag of the current sync run, written to `lastupdated`
+    :raises KeyError: if resource_type is not a key of TAG_RESOURCE_TYPE_MAPPINGS
+    """
+    # `firstseen` is only set when the node/relationship is created so that it is not reset on every sync.
+    INGEST_TAG_TEMPLATE = Template("""
+    MATCH (resource:$resource_label{$property:{ResourceId}})
+    MERGE(aws_tag:AWSTag:Tag{id:{TagId}})
+    ON CREATE SET aws_tag.firstseen = timestamp()
+    SET aws_tag.lastupdated = {UpdateTag},
+        aws_tag.key = {TagKey},
+        aws_tag.value =  {TagValue},
+        aws_tag.region = {Region}
+    MERGE (resource)-[r:TAGGED]->(aws_tag)
+    ON CREATE SET r.firstseen = timestamp()
+    SET r.lastupdated = {UpdateTag}
+    """)
+    # The label and property are fixed for a given resource type, so build the query text once, not once per tag.
+    # safe_substitute leaves the `{Param}` Neo4j query parameters untouched.
+    type_mapping = TAG_RESOURCE_TYPE_MAPPINGS[resource_type]
+    ingest_tag_query = INGEST_TAG_TEMPLATE.safe_substitute(
+        resource_label=type_mapping['label'],
+        property=type_mapping['property'],
+    )
+    for tag_mapping in tag_data:
+        for tag in tag_mapping['Tags']:
+            neo4j_session.run(
+                ingest_tag_query,
+                ResourceId=tag_mapping['resource_id'],
+                TagId=f'{tag["Key"]}:{tag["Value"]}',
+                UpdateTag=aws_update_tag,
+                TagKey=tag['Key'],
+                TagValue=tag['Value'],
+                Region=region,
+            )
+
+
+def transform_tags(tag_data, resource_type):
+    """
+    Add a 'resource_id' field to each resource tag mapping, in place.
+    :param tag_data: List of resource tag mappings; each entry is mutated to carry 'resource_id'
+    :param resource_type: A key of TAG_RESOURCE_TYPE_MAPPINGS describing the type of all resources in tag_data
+    """
+    for resource_entry in tag_data:
+        resource_entry['resource_id'] = compute_resource_id(resource_entry, resource_type)
+
+
+def compute_resource_id(tag_mapping, resource_type):
+    """
+    Return the identifier that cartography uses for the resource described by a tag mapping.
+    :param tag_mapping: A resource tag mapping containing a 'ResourceARN' field
+    :param resource_type: A key of TAG_RESOURCE_TYPE_MAPPINGS
+    :return: The result of the resource type's 'id_func' applied to the ARN if one is configured, else the ARN itself
+    """
+    arn = tag_mapping['ResourceARN']
+    type_mapping = TAG_RESOURCE_TYPE_MAPPINGS[resource_type]
+    try:
+        arn_to_id = type_mapping['id_func']
+    except KeyError:
+        # No translation function configured: this resource type is identified by its full ARN.
+        return arn
+    return arn_to_id(arn)
+
+
+def cleanup(neo4j_session, common_job_parameters):
+    """
+    Run the 'aws_import_tags_cleanup.json' cleanup job.
+    :param neo4j_session: The Neo4j session passed through to the cleanup job
+    :param common_job_parameters: Parameters passed through to the cleanup job
+    """
+    run_cleanup_job('aws_import_tags_cleanup.json', neo4j_session, common_job_parameters)
+
+
+def sync(neo4j_session, boto3_session, regions, aws_update_tag, common_job_parameters):
+    """
+    Ingest AWS tags for every supported resource type in every given region, then run the tag cleanup job.
+    :param neo4j_session: The Neo4j session to write to
+    :param boto3_session: The boto3 session used to query the resourcegroupstaggingapi
+    :param regions: The AWS regions to sync
+    :param aws_update_tag: The update tag of the current sync run
+    :param common_job_parameters: Parameters passed through to the cleanup job
+    """
+    for region in regions:
+        logger.info("Syncing AWS tags for region '%s'.", region)
+        # One API query per resource type, so each result set maps onto a single node label.
+        for resource_type in TAG_RESOURCE_TYPE_MAPPINGS:
+            tagged_resources = get_tags(boto3_session, [resource_type], region)
+            transform_tags(tagged_resources, resource_type)
+            load_tags(neo4j_session, tagged_resources, resource_type, region, aws_update_tag)
+    cleanup(neo4j_session, common_job_parameters)