Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script to sample a single location #70

Merged
merged 7 commits into from
Mar 6, 2024
Merged
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions buildstockbatch/sample_one_county.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""Runs the residental quota sampler for a single county+PUMA.

Usage:
python3 sample_one_county.py --help

python3 sample_one_county.py G1900030 G19001800 100,200 path/to/resstock path/to/output_dir

- Generates two files where every building has county=G1900030 and PUMA=G19001800:
path/to/output_dir/buildstock_G1900030_G19001800_100.csv with 100 samples
path/to/output_dir/buildstock_G1900030_G19001800_200.csv with 200 samples

Methodology:
This modifies the conditional probability distributions from the standard ResStock national project

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain this a bit further? if we were to use the same number of samples as there is in the national resstock (i.e., county, puma in the 550k sample), that does not change the probability distributions, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated this - let me know if it's still unclear.

to filter the sample to a single county+PUMA.

To do this, we modify two files:
- ASHRAE IECC Climate Zone 2004.tsv
- Make 100% of the samples fall into the climate zone of the selected location.
- County and PUMA.tsv
- Make 100% of samples (within the chosen climate zone) fall into the selected county + PUMA

All other variables are downstream of these (or don't depend on them).

Assumptions:
This logic is only guaranteed to work for the current ResStock national project. Other changes
to the dependencies between the variables can break it!

In particular, this code assumes:
- ASHRAE climate zone has no dependencies
- County and PUMA depends only on the ASHRAE climate zone
- Each County+PUMA fall entirely in one climate zone
"""
import argparse
import csv
import os
import shutil
import tempfile

from buildstockbatch.utils import ContainerRuntime
from sampler import residential_quota


class SampleOnly:
CONTAINER_RUNTIME = ContainerRuntime.DOCKER

def __init__(self, buildstock_dir, output_dir):
# Sampler uses this to find the sampling scripts
self.buildstock_dir = os.path.abspath(buildstock_dir)

# ResStock national project. Could use a different project, but `County and PUMA.tsv` and
# `ASHRAE IECC Climate Zone 2004.tsv` must exist in the expected format.
self.project_dir = os.path.join(self.buildstock_dir, "project_national")

# Directory containing the conditional probability distributions we plan to modify
self.housing_characteristics_dir = os.path.join(self.project_dir, "housing_characteristics")
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)

@property
def docker_image(self):
return "nrel/openstudio:{}".format(self.os_version)

@property
def os_version(self):
nweires marked this conversation as resolved.
Show resolved Hide resolved
return "3.6.1"

@property
def project_filename(self):
"""Sampler expects this property to exist, but it can be None."""
return None

def get_climate_zone(self, county, PUMA):
"""Given a county and PUMA, find the climate zone that contains them."""
nweires marked this conversation as resolved.
Show resolved Hide resolved
with open(os.path.join(self.housing_characteristics_dir, "County and PUMA.tsv")) as f:
reader = csv.reader(f, delimiter="\t")
headers = next(reader)
# Index of the column with the county and PUMA we're looking for.
try:
location_col = headers.index(f"Option={county}, {PUMA}")
except ValueError as e:
raise ValueError(f"Could not find 'Option={county}, {PUMA}' column in 'County and PUMA.tsv'") from e

zone = None
for row in reader:
# Skip comments
if row[0].strip()[0] == "#":
continue

# Find the zone with a non-zero chance of producing this county + PUMA
if row[location_col] != "0":
if zone:
raise ValueError(f"Found multiple climate zones for {county}, {PUMA}")
zone = row[0]

if not zone:
raise ValueError(f"No climate zone found for {county}, {PUMA}")
return zone

def run_sampler(self, county, PUMA, n_samples):
"""
nweires marked this conversation as resolved.
Show resolved Hide resolved
Args:
county: GISJOIN ID of county
PUMA: GISJOIN ID of PUMA
n_samples: Number of building samples to produce.
"""

climate_zone = self.get_climate_zone(county, PUMA)
# Create a new copy of the probability distribution TSV files, so we can change them without
# affecting the originals.
with tempfile.TemporaryDirectory(prefix="sampling_", dir=self.buildstock_dir) as tmpdir:
temp_housing_characteristics_dir = os.path.join(tmpdir, "housing_characteristics")
shutil.copytree(self.housing_characteristics_dir, temp_housing_characteristics_dir)

# Update climate zone TSV
zone_filename = "ASHRAE IECC Climate Zone 2004.tsv"
nweires marked this conversation as resolved.
Show resolved Hide resolved
zone_tsv = os.path.join(self.housing_characteristics_dir, zone_filename)
new_zone_tsv = os.path.join(temp_housing_characteristics_dir, zone_filename)
with open(zone_tsv) as old_f:
reader = csv.reader(old_f, delimiter="\t")
with open(new_zone_tsv, "w") as new_f:
writer = csv.writer(new_f, delimiter="\t")
headers = next(reader)
writer.writerow(headers)

# This file has a single row of probabilities, which we replace with 0s and a single 1.
zone_header = f"Option={climate_zone}"
writer.writerow(["1" if header == zone_header else "0" for header in headers])

# Update county + PUMA TSV
county_filename = "County and PUMA.tsv"
county_tsv = os.path.join(self.housing_characteristics_dir, county_filename)
new_county_tsv = os.path.join(temp_housing_characteristics_dir, county_filename)
with open(county_tsv) as old_f:
reader = csv.reader(old_f, delimiter="\t")
with open(new_county_tsv, "w") as new_f:
writer = csv.writer(new_f, delimiter="\t")
headers = next(reader)
writer.writerow(headers)

# First value in headers lists the climate zone dependency -
# just use the others, which list the County+PUMA options.
assert headers[0] == "Dependency=ASHRAE IECC Climate Zone 2004"
headers = headers[1:]
for row in reader:
# Skip comments
if row[0].strip()[0] == "#":
continue

elif row[0] == climate_zone:
# Replace probabilities with 1 for our selected location and 0s everywhere else.
county_header = f"Option={county}, {PUMA}"
writer.writerow(
[row[0]] + ["1" if headers[i] == county_header else "0" for i, v in enumerate(row[1:])]
)

else:
# Leave other climate zones unchanged - they won't be used anyway.
writer.writerow(row)

self.cfg = {"project_directory": os.path.basename(tmpdir)}
self.project_dir = tmpdir

# Note: Must create sampler after all instances vars exist, because it makes a copy of this object.
sampler = residential_quota.ResidentialQuotaSampler(self, n_samples)
sampler.run_sampling()

# Copy results from temp dir to output dir
shutil.copy(
os.path.join(temp_housing_characteristics_dir, "buildstock.csv"),
os.path.join(self.output_dir, f"buildstock_{county}_{PUMA}_{n_samples}.csv"),
)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("county", help="County GISJOIN ID - https://www.nhgis.org/geographic-crosswalks#geog-ids")
parser.add_argument("PUMA", help="PUMA GISJOIN ID")
parser.add_argument("n_samples", help="Comma-separated list of samples sizes to generate")
parser.add_argument("buildstock_dir", help="Path to the ResStock directory (expected to contain project_national)")
parser.add_argument(
"output_dir",
default=".",
nargs="?",
help="Optional path where output should be written. Defaults to the current directory.",
)
args = parser.parse_args()

assert (
len(args.county) == 8 and args.county[0] == "G"
), "County should be 8 chars and start with G (e.g. 'G0100010')"
assert len(args.PUMA) == 9 and args.PUMA[0] == "G", "PUMA should be 9 chars and start with G (e.g. 'G01002100')"

sample_sizes = [int(i) for i in args.n_samples.split(",")]
s = SampleOnly(args.buildstock_dir, args.output_dir)
for i in sample_sizes:
print(f"Creating {i} samples...")
s.run_sampler(args.county, args.PUMA, i)


if __name__ == "__main__":
main()
Loading