Use YData's MultiTableSynthesizer to generate multi-table synthetic data from multiple RDBMS tables.

Multi-table synthesis is the way to synthesize data from multiple tables of a database, keeping the relational schema between them in mind.

Quickstart example:

import os

from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
# After training a MultiTableSynthesizer, we request a sample.
# In this case, we don't return the Dataset for the sample; it will be saved in the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# Initialize a multi-table synthesizer with the connector to write to.
# As long as the synthesizer does not call `fit`, it exists only locally.
# write_connector can be a UID or a Connector instance.
synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5
synth.sample(frac=1.5)

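Training is synchronous: `fit` only returns once the synthesizer is ready (or raises an error if training fails). If you still want to inspect the instance afterwards, a minimal sketch using the `status` property exposed by the synthesizer, reusing the `synth` object from the quickstart above, could look like this:

# Optional: print the current status of the synthesizer instance
print(synth.status)
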
Example of overriding the write connector when sampling:

import os

from ydata.sdk.connectors import Connector
from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
# After training a MultiTableSynthesizer, we request a sample.
# In this case, we don't return the Dataset for the sample; it will be saved in the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# For demonstration purposes, we will use a Connector instance, but you can just send the UID
write_connector = Connector.get('<CONNECTOR_UID>')

# Initialize a multi-table synthesizer with the connector to write to.
# As long as the synthesizer does not call `fit`, it exists only locally.
# write_connector can be a UID or a Connector instance.
synth = MultiTableSynthesizer(write_connector=write_connector)

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5.
# In this case we pass a Connector instance, but you can use the <CONNECTOR_UID> directly;
# you don't need to fetch the connector upfront.
synth.sample(frac=1.5, write_connector=write_connector)
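
As the comments above note, `write_connector` also accepts the connector UID directly, so the same request could be written without fetching the `Connector` upfront (sketch, using the same placeholder UID):

# Equivalent call passing the connector UID instead of a Connector instance
synth.sample(frac=1.5, write_connector='<CONNECTOR_UID>')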