Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated Parser Generation and Serialization #333

Open
wants to merge 18 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 23 additions & 46 deletions buildingmotif/api/serializers/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import warnings
from functools import lru_cache
from typing import Any, Literal, Tuple, Type, Union, get_type_hints
from inspect import Parameter, signature
from typing import Any, Type, Union, get_type_hints

from rdflib import URIRef
from typing_extensions import TypedDict
Expand Down Expand Up @@ -96,34 +97,7 @@ def _get_token_by_name(token_name: str) -> Type[Token]:
raise NameError(f'Token of type "{token_name}" does not exist')


@lru_cache
def _get_parser_args_info(
parser: Type[Parser],
) -> Tuple[Union[str, Literal[None]], Union[str, Literal[None]]]:
"""Get information about the args in a parser's constructor.
This has been moved to its own function to allow for speed improvements by
caching results.

:param parser: parser to inspect
:type parser: Type[Parser]
:return: variable length positional arguments name, variable length keyword arguments name
:rtype: tuple[str, str]"""
flags = parser.__init__.__code__.co_flags
varargs_flag = (flags & 4) != 0
varkeyargs_flag = (flags & 8) != 0
var_names = parser.__init__.__code__.co_varnames[1:]
varargs_name = None
varkeyargs_name = None
if varkeyargs_flag:
varkeyargs_name = var_names[-1]
if varargs_flag:
varargs_name = var_names[-1]
if varkeyargs_flag:
varargs_name = var_names[-2]
return (varargs_name, varkeyargs_name)


def _construct_class(cls: Type[Parser], args: dict) -> Parser:
def _construct_class(cls: Type[Parser], args_dict: dict) -> Parser:
"""Construct class from type and arguments

:param cls: type of class to construct
Expand All @@ -132,21 +106,24 @@ def _construct_class(cls: Type[Parser], args: dict) -> Parser:
:type args: dict
:return: Instance of class
:rtype: Any"""
varargs_name, varkeyargs_name = _get_parser_args_info(cls)
if varkeyargs_name:
varkeyargs_value = args[varkeyargs_name]
del args[varkeyargs_name]
args = {**args, **varkeyargs_value}
if varargs_name:
if varargs_name in args:
varargs_value: list = args[varargs_name]
if not isinstance(varargs_value, list):
raise TypeError(
"Serialized variadic arguments are not encoded as a list"
)
del args[varargs_name]
return cls(*varargs_value, **args)
return cls(**args)
parameters = signature(cls.__init__).parameters
args = []
kwargs = {}
for name, param in parameters.items():
if name not in args_dict:
continue
kind = param.kind
value = args_dict[name]
if kind == Parameter.POSITIONAL_ONLY:
args.append(value)
elif kind in [Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY]:
kwargs[name] = value
elif kind == Parameter.VAR_POSITIONAL:
args = [*args, *value]
elif kind == Parameter.VAR_KEYWORD:
kwargs.update(value)

return cls(*args, **kwargs)


def deserialize(parser_dict: Union[ParserDict, dict]) -> Parser:
Expand Down Expand Up @@ -186,7 +163,7 @@ def _deserialize_token(token_dict: dict) -> Union[Token, Type[Token]]:
:rtype: Token"""
token = _get_token_by_name(token_dict["token"])
if "value" in token_dict:
value_type = _get_token_value_type(token)
value_type = _get_token_value_type(token) # type: ignore
return token(value_type(token_dict["value"]))
return token

Expand Down Expand Up @@ -247,4 +224,4 @@ def _token_like(item: dict) -> bool:
return False
if "token" not in item:
return False
return True
return True
209 changes: 209 additions & 0 deletions buildingmotif/label_parsing/SerializedParserMetrics.py
dllliu marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import importlib.util
import os
import re

from buildingmotif.label_parsing.build_parser import (
generate_parsers_for_clusters,
generate_parsers_for_points,
)
from buildingmotif.label_parsing.combinators import (
COMMON_EQUIP_ABBREVIATIONS_BRICK,
COMMON_POINT_ABBREVIATIONS,
)
from buildingmotif.label_parsing.tools import abbreviationsTool, codeLinter


class SerializedParserMetrics:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me know if you had something else in mind, but I'm wondering why this class is building the parser. I think this metrics class is better suited as an output of another method which builds the parsers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if I am misinterpreting, but the generate_parsers_for_points() and generate_parsers_for_clusters() methods in __init__ are building the parsers, while the class just populates the relevant instance variables with the appropriate information.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but these are called from this class. I think the entrypoint should be something like a ParserBuilder class which calls those generation methods, and emits the Metrics class. We are not constructing the metrics. We are constructing parsers! The metrics are a side-effect of this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gtfierro The ParserBuilder class now emits a ParserMetrics class which takes care of serialization and gathering parsing related metrics. The ParserBuilder class still keeps track of the clustering and distance metrics, as that is directly related to parser generation. The notebook and tests have also been updated, let me know what you think.

"""
Combines parsers into a compact class with detailed metrics and other information.
Allows for easier serialization.

Attributes:
parsers(List[str]): list of all parsers
serializers_list (List[Dict]): list of all serialized parsers
clusters(List[List[str]]): list of all clusters (clusters can have only one element)
distance_metrics(Dict): statistics for distance matrix with similarity ratio as distance metric (mean, median, std, min, max, range)
clustering_metrics(Dict): statistics for clustering (number of clusters, noise points, and silhouette score)
flagged_abbreviations(List): LLM-flagged abbreviations
list_of_dicts(List[Dict]): list of dictionaries with abbreviations matched to brick classes. defaults to COMMON_EQUIP_ABBREVIATIONS_BRICK, COMMON_POINT_ABBREVIATIONS
parsed_count(int): total parsed across all clusters
unparsed_count(int): total unparsed across all clusters
total_count(int): total in all clusters

combined_clusters:List[dict]:
for each cluster, each Dict has
-parser(Dict): serialized parser
-source_code(str): parser code
-tokens(List): emitted tokens from running parser on cluster
-parsed_labels(List): building point labels in which parser did not contain an error
-unparsed(List): building point labels in which parser contained an error
-parser_metrics(Dict):
-parsed_count(int): count of parsed for that cluster
-unparsed_count(int): count of unparsed for that cluster
-total_count(int): count of total for that cluster
"""

def __init__(
    self,
    filename: str,
    col_name: str,
    num_tries: int = 3,
    list_of_dicts=None,
):
    """
    Initializes a new SerializedParserMetrics object.

    Args:
        filename (str): file path to csv file
        col_name (str): relevant column where data is stored
        num_tries (Optional, int): max number of times for LLM to try to generate LLM_Token_Predictions object. Defaults to 3.
        list_of_dicts (Optional, List): List of dictionaries where abbreviation is matched to brick class.
            Defaults to [COMMON_EQUIP_ABBREVIATIONS_BRICK, COMMON_POINT_ABBREVIATIONS].
    """
    # Resolve the default here rather than in the signature so the same
    # mutable list is not shared (and potentially mutated) across calls.
    if list_of_dicts is None:
        list_of_dicts = [COMMON_EQUIP_ABBREVIATIONS_BRICK, COMMON_POINT_ABBREVIATIONS]
    filename = os.path.abspath(filename)

    try:
        (
            self.parsers,
            self.clusters,
            self.distance_metrics,
            self.clustering_metrics,
            self.flagged_abbreviations,
        ) = generate_parsers_for_clusters(
            filename, col_name, num_tries, list_of_dicts
        )
    except ValueError:
        # Not enough points to cluster: generate a parser per point and
        # leave the cluster-level statistics empty.
        (
            self.parsers,
            self.clusters,
            self.flagged_abbreviations,
        ) = generate_parsers_for_points(filename, col_name, num_tries, list_of_dicts)
        self.distance_metrics, self.clustering_metrics = {}, {}

    self.list_of_dicts = list_of_dicts
    self.serializers_list = []
    self.parsed_count = 0
    self.unparsed_count = 0
    self.total_count = 0
    self.combined_clusters = []

    # Each generated parser is written out as a temporary module, imported
    # dynamically, run against its cluster, serialized, and the module file
    # is removed again — even if running the parser raises.
    for parser, cluster in zip(self.parsers, self.clusters):
        clustered_info = {}
        # Assigned before the try so the finally clause can always see it.
        temp_filename = os.path.join(os.getcwd(), "temp_parser.py")
        try:
            with open(
                temp_filename, mode="w"
            ) as temp_file:  # using tempfile library writes file in /appdata directory, dependencies difficult to manage
                # matches the parser variable (e.g. parser_lencluster_11_417)
                parser_var = re.match(r"([^\s]+)", parser)[0]
                temp_file.write(
                    """
from buildingmotif.label_parsing.combinators import *
from buildingmotif.label_parsing.tools import abbreviationsTool
from buildingmotif.label_parsing.parser import parser_on_list
from buildingmotif.api.serializers import parser as serializerTool
import rdflib
"""
                )
                temp_file.write(
                    f"""
COMBINED_ABBREVIATIONS = abbreviationsTool.make_combined_abbreviations({list_of_dicts})
"""
                )
                temp_file.write(
                    f"""
{parser}"""
                )
                temp_file.write(
                    f"""
def get_serialization():
    return serializerTool.serialize({parser_var})"""
                )
                temp_file.write(
                    f"""
cluster = {cluster}"""
                )
                temp_file.write(
                    f"""
def run_parser():
    parsed, parsed_elements, unparsed, right, wrong = parser_on_list({parser_var}, cluster)
    return parsed, parsed_elements, unparsed, right, wrong"""
                )

            # Load the module from the temporary file dynamically. This is
            # done after the `with` block so the file is closed and flushed
            # before it is read back by the import machinery.
            spec = importlib.util.spec_from_file_location("generated", temp_filename)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            # Call the function defined in the module to get parsed results,
            # unparsed elements, and total parsed/unparsed
            parsed_arr, parsed_elements, unparsed_arr, right, wrong = (
                module.run_parser()
            )
            serialized = module.get_serialization()
            self.parsed_count += right
            self.unparsed_count += wrong
            self.total_count += right + wrong
            clustered_info["parser"] = serialized
            clustered_info["source_code"] = parser
            clustered_info["parsed_labels"] = parsed_elements
            clustered_info["tokens"] = parsed_arr
            clustered_info["unparsed_labels"] = unparsed_arr
            clustered_info["parser_metrics"] = {
                "parsed_count": right,
                "unparsed_count": wrong,
                "total_count": right + wrong,
            }
            self.combined_clusters.append(clustered_info)
            self.serializers_list.append(serialized)

        finally:
            if os.path.exists(temp_filename):
                os.remove(temp_filename)  # remove file when complete

def write_to_directory(self, directory: str):
    """
    Writes each parser and cluster to a file along with necessary imports.
    Saves in specified directory.

    Parameters:
        directory(str): each file containing a parser and its cluster will be saved to this directory

    Returns:
        None
    """

    # Create the target directory (and parents) on first use.
    if not os.path.exists(directory):
        os.makedirs(directory)

    # One <parser_variable>.py file per (parser, cluster) pair.
    for parser, cluster in zip(self.parsers, self.clusters):
        pattern = re.compile(r"([^\s]+)")
        # First whitespace-free token of the parser source is its variable
        # name (e.g. parser_lencluster_11_417); used as the file name.
        parser_var = pattern.match(parser)[0]
        filename = parser_var + ".py"
        with open(os.path.join(directory, filename), "w") as file:
            # Imports required by the generated parser module.
            file.write(
                """
from buildingmotif.label_parsing.combinators import *
from buildingmotif.label_parsing.parser import parser_on_list
from buildingmotif.label_parsing.tools import abbreviationsTool
import rdflib
"""
            )
            # Abbreviation table combined from this instance's dictionaries.
            file.write(
                f"""
COMBINED_ABBREVIATIONS = abbreviationsTool.make_combined_abbreviations({abbreviationsTool.make_combined_dict(self.list_of_dicts)})
"""
            )
            # The parser source code itself.
            file.write(
                f"""
{parser}"""
            )
            # The cluster of labels this parser was generated for.
            file.write(
                f"""
cluster = {cluster}"""
            )
        # Lint/format the emitted file after it has been written and closed.
        codeLinter._run(os.path.join(directory, filename))
Loading
Loading