diff --git a/Makefile b/Makefile index a648af31..d909153e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: .PHONY=clean clean: - rm -f tmd/storage/output/* + rm -f tmd/storage/output/tmd* tmd/storage/output/tmd.csv.gz: \ setup.py \ diff --git a/setup.py b/setup.py index 1a379460..25589a53 100644 --- a/setup.py +++ b/setup.py @@ -2,14 +2,13 @@ setup( name="tmd", - version="0.2.0", + version="0.3.0", packages=find_packages(), python_requires=">=3.10,<3.13", install_requires=[ "policyengine_us==1.55.0", "tables", # required by policyengine_us - "marshmallow<3.22", # to work around paramtools bug - "taxcalc>=4.2.1", # requires paramtools + "taxcalc>=4.2.2", "scikit-learn", "torch", "tensorboard", diff --git a/tests/test_area_make.py b/tests/test_area_make.py index 44a45641..cca0abd3 100644 --- a/tests/test_area_make.py +++ b/tests/test_area_make.py @@ -1,5 +1,5 @@ """ -Tests of tmd/areas/create_area_weights.py script. +Tests of tmd/areas/make_all.py script. """ import sys @@ -12,10 +12,9 @@ @pytest.mark.skip def test_area_make(): """ - Make area weights for faux bb area using the faux bb area targets. + Compare areas/weights/bb.log file with areas/weights/bb.log-expect file. """ - make_all_areas(only_list=["bb"]) - # compare area/weights/bb.log file with area/weights/bb.log-expect file + make_all_areas(make_only_list=["bb"]) wpath = AREAS_FOLDER / "weights" with open(wpath / "bb.log", "r", encoding="utf-8") as afile: act = afile.readlines() @@ -25,5 +24,12 @@ def test_area_make(): context_diff(act, exp, fromfile="ACTUAL", tofile="EXPECT", n=0) ) if len(diffs) > 0: + sys.stdout.write(">>>>> FULL FILE:\n") + sys.stdout.write("------------------------------------------------\n") + sys.stdout.writelines(act) + sys.stdout.write("------------------------------------------------\n") + sys.stdout.write(">>>>> DIFFS FILE:\n") + sys.stdout.write("------------------------------------------------\n") sys.stdout.writelines(diffs) - raise ValueError("ACT vs EXP differences for area/weights/bb.log") + sys.stdout.write("------------------------------------------------\n") + raise ValueError("ACT vs EXP differences for areas/weights/bb.log") diff --git a/tmd/areas/make_all.py b/tmd/areas/make_all.py index a2e43e3d..21cb51d6 100644 --- a/tmd/areas/make_all.py +++ b/tmd/areas/make_all.py @@ -34,7 +34,7 @@ def time_of_newest_other_dependency(): # --- High-level logic of the script -def make_all_areas(only_list=None): +def make_all_areas(make_only_list=None): """ Call create_area_weights.py for each out-of-date or non-existent weights file for which there is a targets file. @@ -58,7 +58,7 @@ def make_all_areas(only_list=None): tpaths = sorted(list(tfolder.glob("*_targets.csv"))) for tpath in tpaths: area = tpath.name.split("_")[0] - if only_list and area not in only_list: + if make_only_list and area not in make_only_list: continue # skip this area wpath = AREAS_FOLDER / "weights" / f"{area}_tmd_weights.csv.gz" if wpath.exists(): diff --git a/tmd/areas/weights/bb.log-expect b/tmd/areas/weights/bb.log-expect index 16ea2a56..d33d7b5f 100644 --- a/tmd/areas/weights/bb.log-expect +++ b/tmd/areas/weights/bb.log-expect @@ -32,8 +32,8 @@ target_matrix sparsity ratio = 0.597 OPTIMIZE WEIGHT RATIOS IN A REGULARIZATION LOOP where REGULARIZATION DELTA starts at 1.000000e-09 and where target_matrix.shape= (225256, 16) - ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 14.9 ->>> final delta loop exectime= 14.9 secs iterations=168 success=True + ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 24.3 +>>> final delta loop exectime= 24.3 secs iterations=168 success=True >>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH >>> L-BFGS-B optimized objective function value: 1.261425278e-04 AREA-OPTIMIZED_TARGET_MISSES= 0 diff --git a/tmd/areas/weights/bb.log-expect-github b/tmd/areas/weights/bb.log-expect-github new file mode 100644 index 00000000..1e20449a --- /dev/null +++ b/tmd/areas/weights/bb.log-expect-github @@ -0,0 +1,65 @@ +CREATING WEIGHTS FILE FOR AREA bb ... +INITIAL WEIGHTS STATISTICS: +weights_scale= 9.871864e-02 + s006 wght_us +count 225256.000000 225256.000000 +mean 816.957848 80.648965 +std 1140.733386 112.611644 +min 0.110000 0.010859 +25% 23.590000 2.328773 +50% 389.970000 38.497307 +75% 1282.730000 126.629357 +max 15801.890000 1559.941035 +USING bb_targets.csv FILE CONTAINING 16 TARGETS +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.400000, < 0.800000: 1 1 6.25% 6.25% +>= 0.800000, < 0.900000: 1 2 6.25% 12.50% +>= 0.900000, < 0.990000: 0 2 0.00% 12.50% +>= 0.990000, < 0.999500: 0 2 0.00% 12.50% +>= 0.999500, < 1.000500: 1 3 6.25% 18.75% +>= 1.000500, < 1.010000: 0 3 0.00% 18.75% +>= 1.010000, < 1.100000: 0 3 0.00% 18.75% +>= 1.100000, < 1.200000: 0 3 0.00% 18.75% +>= 1.200000, < 1.600000: 3 6 18.75% 37.50% +>= 1.600000, < 2.000000: 0 6 0.00% 37.50% +>= 2.000000, < 3.000000: 3 9 18.75% 56.25% +>= 3.000000, < 4.000000: 2 11 12.50% 68.75% +>= 4.000000, < 5.000000: 3 14 18.75% 87.50% +>= 5.000000, < inf: 2 16 12.50% 100.00% +US_PROPORTIONALLY_SCALED_TARGET_RMSE= 3.033037311e+00 +target_matrix sparsity ratio = 0.597 +OPTIMIZE WEIGHT RATIOS IN A REGULARIZATION LOOP + where REGULARIZATION DELTA starts at 1.000000e-09 + and where target_matrix.shape= (225256, 16) + ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 141.3 +>>> final delta loop exectime= 141.3 secs iterations=148 success=True +>>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH +>>> L-BFGS-B optimized objective function value: 1.267485693e-04 +AREA-OPTIMIZED_TARGET_MISSES= 0 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.999500, < 1.000500: 16 16 100.00% 100.00% +AREA-OPTIMIZED_TARGET_RMSE= 8.050660463e-05 +DISTRIBUTION OF AREA/US WEIGHT RATIO (n=225256): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.000000, < 0.000001: 195 195 0.09% 0.09% +>= 0.000001, < 0.100000: 51665 51860 22.94% 23.02% +>= 0.100000, < 0.200000: 7918 59778 3.52% 26.54% +>= 0.200000, < 0.500000: 18162 77940 8.06% 34.60% +>= 0.500000, < 0.800000: 28977 106917 12.86% 47.46% +>= 0.800000, < 0.850000: 9350 116267 4.15% 51.62% +>= 0.850000, < 0.900000: 12610 128877 5.60% 57.21% +>= 0.900000, < 0.950000: 14978 143855 6.65% 63.86% +>= 0.950000, < 1.000000: 18899 162754 8.39% 72.25% +>= 1.000000, < 1.050000: 11204 173958 4.97% 77.23% +>= 1.050000, < 1.100000: 5570 179528 2.47% 79.70% +>= 1.100000, < 1.150000: 4483 184011 1.99% 81.69% +>= 1.150000, < 1.200000: 4235 188246 1.88% 83.57% +>= 1.200000, < 2.000000: 28187 216433 12.51% 96.08% +>= 2.000000, < 5.000000: 8344 224777 3.70% 99.79% +>= 5.000000, < 10.000000: 426 225203 0.19% 99.98% +>= 10.000000, < 100.000000: 53 225256 0.02% 100.00% +SUM OF SQUARED AREA/US WEIGHT RATIO DEVIATIONS= 1.266449e+05 diff --git a/tmd/areas/weights/bb.log-expect-mrh b/tmd/areas/weights/bb.log-expect-mrh new file mode 100644 index 00000000..d33d7b5f --- /dev/null +++ b/tmd/areas/weights/bb.log-expect-mrh @@ -0,0 +1,65 @@ +CREATING WEIGHTS FILE FOR AREA bb ... +INITIAL WEIGHTS STATISTICS: +weights_scale= 9.874809e-02 + s006 wght_us +count 225256.000000 225256.000000 +mean 816.774828 80.654957 +std 1140.652664 112.637275 +min 0.110000 0.010862 +25% 23.570000 2.327493 +50% 389.695000 38.481638 +75% 1282.230000 126.617767 +max 15801.890000 1560.406500 +USING bb_targets.csv FILE CONTAINING 16 TARGETS +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.400000, < 0.800000: 1 1 6.25% 6.25% +>= 0.800000, < 0.900000: 1 2 6.25% 12.50% +>= 0.900000, < 0.990000: 0 2 0.00% 12.50% +>= 0.990000, < 0.999500: 0 2 0.00% 12.50% +>= 0.999500, < 1.000500: 1 3 6.25% 18.75% +>= 1.000500, < 1.010000: 0 3 0.00% 18.75% +>= 1.010000, < 1.100000: 0 3 0.00% 18.75% +>= 1.100000, < 1.200000: 0 3 0.00% 18.75% +>= 1.200000, < 1.600000: 3 6 18.75% 37.50% +>= 1.600000, < 2.000000: 0 6 0.00% 37.50% +>= 2.000000, < 3.000000: 3 9 18.75% 56.25% +>= 3.000000, < 4.000000: 2 11 12.50% 68.75% +>= 4.000000, < 5.000000: 3 14 18.75% 87.50% +>= 5.000000, < inf: 2 16 12.50% 100.00% +US_PROPORTIONALLY_SCALED_TARGET_RMSE= 3.031008582e+00 +target_matrix sparsity ratio = 0.597 +OPTIMIZE WEIGHT RATIOS IN A REGULARIZATION LOOP + where REGULARIZATION DELTA starts at 1.000000e-09 + and where target_matrix.shape= (225256, 16) + ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 24.3 +>>> final delta loop exectime= 24.3 secs iterations=168 success=True +>>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH +>>> L-BFGS-B optimized objective function value: 1.261425278e-04 +AREA-OPTIMIZED_TARGET_MISSES= 0 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.999500, < 1.000500: 16 16 100.00% 100.00% +AREA-OPTIMIZED_TARGET_RMSE= 9.594020637e-05 +DISTRIBUTION OF AREA/US WEIGHT RATIO (n=225256): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.000000, < 0.000001: 633 633 0.28% 0.28% +>= 0.000001, < 0.100000: 50851 51484 22.57% 22.86% +>= 0.100000, < 0.200000: 8125 59609 3.61% 26.46% +>= 0.200000, < 0.500000: 17983 77592 7.98% 34.45% +>= 0.500000, < 0.800000: 28654 106246 12.72% 47.17% +>= 0.800000, < 0.850000: 9340 115586 4.15% 51.31% +>= 0.850000, < 0.900000: 12669 128255 5.62% 56.94% +>= 0.900000, < 0.950000: 15103 143358 6.70% 63.64% +>= 0.950000, < 1.000000: 19013 162371 8.44% 72.08% +>= 1.000000, < 1.050000: 11368 173739 5.05% 77.13% +>= 1.050000, < 1.100000: 5612 179351 2.49% 79.62% +>= 1.100000, < 1.150000: 4492 183843 1.99% 81.62% +>= 1.150000, < 1.200000: 4289 188132 1.90% 83.52% +>= 1.200000, < 2.000000: 28332 216464 12.58% 96.10% +>= 2.000000, < 5.000000: 8312 224776 3.69% 99.79% +>= 5.000000, < 10.000000: 428 225204 0.19% 99.98% +>= 10.000000, < 100.000000: 52 225256 0.02% 100.00% +SUM OF SQUARED AREA/US WEIGHT RATIO DEVIATIONS= 1.259953e+05 diff --git a/tmd/storage/output/README.md b/tmd/storage/output/README.md index 1b48cd7c..29a470b3 100644 --- a/tmd/storage/output/README.md +++ b/tmd/storage/output/README.md @@ -1,3 +1,6 @@ # Output files -Flat files suitable for input to Tax-Calculator. +Three national files suitable for input to Tax-Calculator: +- tmd.csv.gz (after gunzip) +- tmd_weights.csv.gz +- tmd_growfactors.csv