From 50c1c277e69bd3c2c4730203bc9c2734a1864326 Mon Sep 17 00:00:00 2001 From: Azory YData Bot Date: Tue, 16 Jan 2024 17:05:04 +0000 Subject: [PATCH] Deployed 6785ad8 to 0.7 with MkDocs 1.5.1 and mike 1.1.2 --- 0.7/assets/_mkdocstrings.css | 91 +- .../synthesizer_multitable/index.html | 1260 ++++++++++++ 0.7/objects.inv | Bin 1105 -> 1029 bytes .../reference/api/common/client/index.html | 598 +++--- 0.7/sdk/reference/api/common/types/index.html | 2 + .../api/connectors/connector/index.html | 25 +- .../api/datasources/datasource/index.html | 274 +-- .../api/datasources/metadata/index.html | 3 +- .../api/synthesizers/base/index.html | 976 ++++----- .../api/synthesizers/multitable/index.html | 1755 +++++++++++++++++ .../api/synthesizers/regular/index.html | 11 +- .../api/synthesizers/timeseries/index.html | 11 +- 0.7/search/search_index.json | 2 +- 0.7/sitemap.xml.gz | Bin 127 -> 127 bytes .../synthesizer_multitable/index.html | 16 + .../api/synthesizers/multitable/index.html | 16 + 16 files changed, 3967 insertions(+), 1073 deletions(-) create mode 100644 0.7/examples/synthesizer_multitable/index.html create mode 100644 0.7/sdk/reference/api/synthesizers/multitable/index.html create mode 100644 latest/examples/synthesizer_multitable/index.html create mode 100644 latest/sdk/reference/api/synthesizers/multitable/index.html diff --git a/0.7/assets/_mkdocstrings.css b/0.7/assets/_mkdocstrings.css index 049a254b..4b7d98b8 100644 --- a/0.7/assets/_mkdocstrings.css +++ b/0.7/assets/_mkdocstrings.css @@ -26,39 +26,84 @@ float: right; } -/* Keep headings consistent. */ -h1.doc-heading, -h2.doc-heading, -h3.doc-heading, -h4.doc-heading, -h5.doc-heading, -h6.doc-heading { - font-weight: 400; - line-height: 1.5; - color: inherit; - text-transform: none; +/* Symbols in Navigation and ToC. 
*/ +:root, +[data-md-color-scheme="default"] { + --doc-symbol-attribute-fg-color: #953800; + --doc-symbol-function-fg-color: #8250df; + --doc-symbol-method-fg-color: #8250df; + --doc-symbol-class-fg-color: #0550ae; + --doc-symbol-module-fg-color: #5cad0f; + + --doc-symbol-attribute-bg-color: #9538001a; + --doc-symbol-function-bg-color: #8250df1a; + --doc-symbol-method-bg-color: #8250df1a; + --doc-symbol-class-bg-color: #0550ae1a; + --doc-symbol-module-bg-color: #5cad0f1a; +} + +[data-md-color-scheme="slate"] { + --doc-symbol-attribute-fg-color: #ffa657; + --doc-symbol-function-fg-color: #d2a8ff; + --doc-symbol-method-fg-color: #d2a8ff; + --doc-symbol-class-fg-color: #79c0ff; + --doc-symbol-module-fg-color: #baff79; + + --doc-symbol-attribute-bg-color: #ffa6571a; + --doc-symbol-function-bg-color: #d2a8ff1a; + --doc-symbol-method-bg-color: #d2a8ff1a; + --doc-symbol-class-bg-color: #79c0ff1a; + --doc-symbol-module-bg-color: #baff791a; +} + +code.doc-symbol { + border-radius: .1rem; + font-size: .85em; + padding: 0 .3em; + font-weight: bold; +} + +code.doc-symbol-attribute { + color: var(--doc-symbol-attribute-fg-color); + background-color: var(--doc-symbol-attribute-bg-color); +} + +code.doc-symbol-attribute::after { + content: "attr"; +} + +code.doc-symbol-function { + color: var(--doc-symbol-function-fg-color); + background-color: var(--doc-symbol-function-bg-color); +} + +code.doc-symbol-function::after { + content: "func"; } -h1.doc-heading { - font-size: 1.6rem; +code.doc-symbol-method { + color: var(--doc-symbol-method-fg-color); + background-color: var(--doc-symbol-method-bg-color); } -h2.doc-heading { - font-size: 1.2rem; +code.doc-symbol-method::after { + content: "meth"; } -h3.doc-heading { - font-size: 1.15rem; +code.doc-symbol-class { + color: var(--doc-symbol-class-fg-color); + background-color: var(--doc-symbol-class-bg-color); } -h4.doc-heading { - font-size: 1.10rem; +code.doc-symbol-class::after { + content: "class"; } -h5.doc-heading { - font-size: 1.05rem; +code.doc-symbol-module { + color: var(--doc-symbol-module-fg-color); + background-color: var(--doc-symbol-module-bg-color); } -h6.doc-heading { - font-size: 1rem; +code.doc-symbol-module::after { + content: "mod"; } \ No newline at end of file diff --git a/0.7/examples/synthesizer_multitable/index.html b/0.7/examples/synthesizer_multitable/index.html new file mode 100644 index 00000000..6000f424 --- /dev/null +++ b/0.7/examples/synthesizer_multitable/index.html @@ -0,0 +1,1260 @@ + + + + + + + + + + + + + + + + + + Synthesize Multi Table - YData Fabric + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Synthesize Multi Table

+

Use YData's MultiTableSynthesizer to generate multi-table synthetic data from multiple RDBMS tables.

+

Multi-table synthesis generates data from multiple tables of a database while keeping their relational structure in mind.

+

Quickstart example:

+
import os
+
+from ydata.sdk.datasources import DataSource
+from ydata.sdk.synthesizers import MultiTableSynthesizer
+
+# Do not forget to add your token as an environment variable
+os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined
+
+# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
+# After training a Multi Table Synthesizer, we request a sample.
+# In this case, the sample is not returned as a Dataset; it is written to the database
+# that the connector refers to.
+
+X = DataSource.get('<DATASOURCE_UID>')
+
+# Initialize a multi-table synthesizer with the connector to write to
+# Until `fit` is called, the synthesizer exists only locally
+# write_connector can be a UID or a Connector instance
+synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')
+
+# The synthesizer training is requested
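+# (the synthesizer instance is only created in the backend when `fit` is called, and the call waits until training finishes)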
+synth.fit(X)
+
+# We request a synthetic dataset with a fraction of 1.5
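+# (i.e., each synthesized table should contain roughly 1.5x the rows of the original, assuming `frac` scales the original table sizes)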
+synth.sample(frac=1.5)
+
+

Example of overriding the write connector when sampling:

+
import os
+
+from ydata.sdk.connectors import Connector
+from ydata.sdk.datasources import DataSource
+from ydata.sdk.synthesizers import MultiTableSynthesizer
+
+# Do not forget to add your token as an environment variable
+os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined
+
+# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
+# After training a Multi Table Synthesizer, we request a sample.
+# In this case, the sample is not returned as a Dataset; it is written to the database
+# that the connector refers to.
+
+X = DataSource.get('<DATASOURCE_UID>')
+
+# For demonstration purposes, we will use a connector instance, but you can just send the UID
+
+write_connector = Connector.get('<CONNECTOR_UID>')
+
+# Initialize a multi-table synthesizer with the connector to write to
+# Until `fit` is called, the synthesizer exists only locally
+# write_connector can be a UID or a Connector instance
+synth = MultiTableSynthesizer(write_connector=write_connector)
+
+# The synthesizer training is requested
+synth.fit(X)
+
+# We request a synthetic dataset with a fraction of 1.5
+# In this case, we use a Connector instance.
+# You can also pass the <CONNECTOR_UID> directly; you don't need to get the connector upfront.
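+# The write_connector passed to `sample` overrides the one given when the synthesizer was initialized.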
+synth.sample(frac=1.5, write_connector=write_connector)
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/0.7/objects.inv b/0.7/objects.inv index ba0f5672d65a3669e9ca46b14b1c36fb2da0b7af..8fecb4dbc2f8143138e50c3efec8f0ba96f3bd77 100644 GIT binary patch delta 915 zcmV;E18n@!2!#lcfPb0Ga@sHyhWCAn%#>}-B;9pqz)3tcxnO99Zal(>RIM+JC1-{{ zeSN{Uj4ex8(o8pr0O|jJAC8WMblL*~jQkJw5sZ9j_;DDTYmQS9Dsv4fROHr`*(xjL}hcxmDiX+vK z1RxB(L!5Y+0DqEVoh%HA_a4MMpz(>`A<-&cgbDfH0R>PW=on+AkC7Yq>ujx6C8kPB zP=IM&HoVhS#eRty*v+t>>Up(x6X#90lSS500?szx-;cDQTB zx=CsC;OopKuQK%+mIII@-iknB}b{pT3yCst7a*+S18L^qYIu=XSuM9v39XkYOR!(G1o1% zN)0yh41be-8zhx`pB{a~Paq&7KLMmkVn1o5F#*DAIK}xvoF;y@r(R@jw`dQqiqsgY zO=kzK+O%l^-vfVB%0@N8DO)ik>Kc_*BooQ&t0 zDUZiGxhx-_vm%S%U+WalrCx~W5~m6$vd*bu2`zSFu+{V+}OItuw%XYq53_>FT!$De^{x-e)=bKIFJKIiB;m_6kYd!mE7C!XsXvn+&SJWAf&b{SQ`bSv;}$$dUj6 delta 991 zcmV<510ei`2+;_TfPb1@Z<{a}h41$(DlNAoT6ecQl8_XYrYR(?vKvH+(|IEhVKXWF z_2VOe310@kmUh#+2|nlHjeTt5UJed*l;j6-G4frcxWVZZ_{t{4e;oH>pg+jFJ72!L zo?>AOSF^z$WEjejpy*f*=1Up`&hywBNn z{~d!eQw?=J*B181)L+&4-d^ep&Cs{|gwCNf1cd7_8;bp)uO6hLRt-ksU2jqvU9O+Q zC0(vit`{>>WuE@(9%osPY1V^vuS!)F+bvt1zxF#%cvThKevhzsu%j@YL&wN>DGq#S zlhDBg*~cKX34e8HL}YpqMC}hRcy}b;(FdYZ#nUoj*FIto$ul|!1oSo%aJLw(g{r_* zi7566tJ`i~yA}jDP2uY!BV_syeKoh_i9hY7*4|n_zby^*5wtw^DL5dHnUZ!A2W`_%aNbu{1&D5^DD{hD!%JT>y?8Ns-BW&Yg0^^f@e-K2W8}F$XaobAU-K!$ghiS8; zgH&H0u*K)*P zRYPB>Tlnha%VqLfx=^--k9Y{)D32tn1r{_|VK5r79(M4IrAHrT9(ZM=9-8;uq1h9| zeK6T~wzYs2ug_I+kbgrILxC{T!;o&{8w@BH9)@!h_+T_w-C;2G6#|T>lpcoEw5otX zv47%2v8rnkAW|kg45wlB0wZFy+n}0v-@v=OMibtLE+{*mCod>3uE#Gvv-X4Y&JU?_ zOiS?z5ftM3QOB5*@2*Xjux;}(xjd`eBX1}t6MaOU6I$hK1O9m;%sRK8tY6=6GHC@G zN78w~&-|gx;tRPrwsHNew0`4=*!s6vlvtOvxV6G{6qbLs6`$T@jFEE0{m}vGN1W4{ z8K`P{?|iw6_Fq!coX@Bz@NHxa - __build_url() + __build_url
  • - __raise_for_status() + __raise_for_status
  • - __set_global() + __set_global
  • - get() + get
  • - get_static_file() + get_static_file
  • - post() + post
  • @@ -1220,6 +1220,7 @@

    Get client

    +
    @@ -1696,7 +1697,13 @@

    Get client

    202 203 204 -205
    @typechecked
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    @typechecked
     class Client(metaclass=SingletonClient):
         """Main Client class used to abstract the connection to the backend.
     
    @@ -1723,149 +1730,155 @@ 

    Get client

    if set_as_global: self.__set_global() - def post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None, - project: Project | None = None, files: Optional[Dict] = None, raise_for_status: bool = True) -> Response: - """POST request to the backend. - - Args: - endpoint (str): POST endpoint - data (Optional[dict]): (optional) multipart form data - json (Optional[dict]): (optional) json data - files (Optional[dict]): (optional) files to be sent - raise_for_status (bool): raise an exception on error - - Returns: - Response object - """ - url_data = self.__build_url( - endpoint, data=data, json=json, files=files, project=project) - response = self._http_client.post(**url_data) - - if response.status_code != Client.codes.OK and raise_for_status: - self.__raise_for_status(response) - - return response + def post( + self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None, + project: Optional[Project] = None, files: Optional[Dict] = None, raise_for_status: bool = True + ) -> Response: + """POST request to the backend. + + Args: + endpoint (str): POST endpoint + data (Optional[dict]): (optional) multipart form data + json (Optional[dict]): (optional) json data + files (Optional[dict]): (optional) files to be sent + raise_for_status (bool): raise an exception on error + + Returns: + Response object + """ + url_data = self.__build_url( + endpoint, data=data, json=json, files=files, project=project) + response = self._http_client.post(**url_data) + + if response.status_code != Client.codes.OK and raise_for_status: + self.__raise_for_status(response) - def get(self, endpoint: str, params: Optional[Dict] = None, - project: Project | None = None, cookies: Optional[Dict] = None, raise_for_status: bool = True) -> Response: - """GET request to the backend. - - Args: - endpoint (str): GET endpoint - cookies (Optional[dict]): (optional) cookies data - raise_for_status (bool): raise an exception on error - - Returns: - Response object - """ - url_data = self.__build_url(endpoint, params=params, - cookies=cookies, project=project) - response = self._http_client.get(**url_data) - - if response.status_code != Client.codes.OK and raise_for_status: - self.__raise_for_status(response) - - return response - - def get_static_file(self, endpoint: str, project: Project | None = None, raise_for_status: bool = True) -> Response: - """Retrieve a static file from the backend. - - Args: - endpoint (str): GET endpoint - raise_for_status (bool): raise an exception on error - - Returns: - Response object - """ - url_data = self.__build_url(endpoint, project=project) - url_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}' - response = self._http_client.get(**url_data) - - if response.status_code != Client.codes.OK and raise_for_status: - self.__raise_for_status(response) - - return response - - def _handshake(self): - """Client handshake. - - It is used to determine is the client can connect with its - current authorization token. 
- """ - response = self.get('/profiles', params={}, raise_for_status=False) - if response.status_code == Client.codes.FOUND: - parser = LinkExtractor() - parser.feed(response.text) - raise ClientHandshakeError(auth_link=parser.link) - - def _get_default_project(self, token: str): - response = self.get('/profiles/me', params={}, cookies={'access_token': token}) - data: Dict = response.json() - return data['myWorkspace'] - - def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None, - json: Optional[Dict] = None, project: Project | None = None, files: Optional[Dict] = None, - cookies: Optional[Dict] = None) -> Dict: - """Build a request for the backend. - - Args: - endpoint (str): backend endpoint - params (Optional[dict]): URL parameters - data (Optional[Project]): (optional) multipart form data - json (Optional[dict]): (optional) json data - files (Optional[dict]): (optional) files to be sent - cookies (Optional[dict]): (optional) cookies data - - Returns: - dictionary containing the information to perform a request - """ - _params = params if params is not None else { - 'ns': project or self._default_project - } - - url_data = { - 'url': f'{self._scheme}://{self._base_url}/api{endpoint}', - 'headers': self._headers, - 'params': _params, + return response + + def get( + self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None, + cookies: Optional[Dict] = None, raise_for_status: bool = True + ) -> Response: + """GET request to the backend. + + Args: + endpoint (str): GET endpoint + cookies (Optional[dict]): (optional) cookies data + raise_for_status (bool): raise an exception on error + + Returns: + Response object + """ + url_data = self.__build_url(endpoint, params=params, + cookies=cookies, project=project) + response = self._http_client.get(**url_data) + + if response.status_code != Client.codes.OK and raise_for_status: + self.__raise_for_status(response) + + return response + + def get_static_file( + self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True + ) -> Response: + """Retrieve a static file from the backend. + + Args: + endpoint (str): GET endpoint + raise_for_status (bool): raise an exception on error + + Returns: + Response object + """ + url_data = self.__build_url(endpoint, project=project) + url_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}' + response = self._http_client.get(**url_data) + + if response.status_code != Client.codes.OK and raise_for_status: + self.__raise_for_status(response) + + return response + + def _handshake(self): + """Client handshake. + + It is used to determine is the client can connect with its + current authorization token. + """ + response = self.get('/profiles', params={}, raise_for_status=False) + if response.status_code == Client.codes.FOUND: + parser = LinkExtractor() + parser.feed(response.text) + raise ClientHandshakeError(auth_link=parser.link) + + def _get_default_project(self, token: str): + response = self.get('/profiles/me', params={}, cookies={'access_token': token}) + data: Dict = response.json() + return data['myWorkspace'] + + def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None, + json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None, + cookies: Optional[Dict] = None) -> Dict: + """Build a request for the backend. 
+ + Args: + endpoint (str): backend endpoint + params (Optional[dict]): URL parameters + data (Optional[Project]): (optional) multipart form data + json (Optional[dict]): (optional) json data + files (Optional[dict]): (optional) files to be sent + cookies (Optional[dict]): (optional) cookies data + + Returns: + dictionary containing the information to perform a request + """ + _params = params if params is not None else { + 'ns': project or self._default_project } - if data is not None: - url_data['data'] = data - - if json is not None: - url_data['json'] = json + url_data = { + 'url': f'{self._scheme}://{self._base_url}/api{endpoint}', + 'headers': self._headers, + 'params': _params, + } - if files is not None: - url_data['files'] = files + if data is not None: + url_data['data'] = data - if cookies is not None: - url_data['cookies'] = cookies + if json is not None: + url_data['json'] = json - return url_data - - def __set_global(self) -> None: - """Sets a client instance as global.""" - # If the client is stateful, close it gracefully! - Client.GLOBAL_CLIENT = self - - def __raise_for_status(self, response: Response) -> None: - """Raise an exception if the response is not OK. - - When an exception is raised, we try to convert it to a ResponseError which is - a wrapper around a backend error. This usually gives enough context and provides - nice error message. - - If it cannot be converted to ResponseError, it is re-raised. + if files is not None: + url_data['files'] = files + + if cookies is not None: + url_data['cookies'] = cookies + + return url_data + + def __set_global(self) -> None: + """Sets a client instance as global.""" + # If the client is stateful, close it gracefully! + Client.GLOBAL_CLIENT = self + + def __raise_for_status(self, response: Response) -> None: + """Raise an exception if the response is not OK. - Args: - response (Response): response to analyze - """ - try: - response.raise_for_status() - except HTTPStatusError as e: - with suppress(Exception): - e = ResponseError(**response.json()) - raise e + When an exception is raised, we try to convert it to a ResponseError which is + a wrapper around a backend error. This usually gives enough context and provides + nice error message. + + If it cannot be converted to ResponseError, it is re-raised. + + Args: + response (Response): response to analyze + """ + try: + response.raise_for_status() + except HTTPStatusError as e: + with suppress(Exception): + e = ResponseError(**response.json()) + raise e
    @@ -1886,7 +1899,6 @@

    Get client

    -

    __build_url(endpoint, params=None, data=None, json=None, project=None, files=None, cookies=None) @@ -2023,13 +2035,7 @@

    Source code in ydata/sdk/common/client/client.py -
    143
    -144
    -145
    -146
    -147
    -148
    -149
    +            
    149
     150
     151
     152
    @@ -2061,45 +2067,51 @@ 

    178 179 180 -181

    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
    -                json: Optional[Dict] = None, project: Project | None = None, files: Optional[Dict] = None,
    -                cookies: Optional[Dict] = None) -> Dict:
    -    """Build a request for the backend.
    -
    -    Args:
    -        endpoint (str): backend endpoint
    -        params (Optional[dict]): URL parameters
    -        data (Optional[Project]): (optional) multipart form data
    -        json (Optional[dict]): (optional) json data
    -        files (Optional[dict]): (optional) files to be sent
    -        cookies (Optional[dict]): (optional) cookies data
    -
    -    Returns:
    -        dictionary containing the information to perform a request
    -    """
    -    _params = params if params is not None else {
    -        'ns': project or self._default_project
    -    }
    -
    -    url_data = {
    -        'url': f'{self._scheme}://{self._base_url}/api{endpoint}',
    -        'headers': self._headers,
    -        'params': _params,
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
    +                json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,
    +                cookies: Optional[Dict] = None) -> Dict:
    +    """Build a request for the backend.
    +
    +    Args:
    +        endpoint (str): backend endpoint
    +        params (Optional[dict]): URL parameters
    +        data (Optional[Project]): (optional) multipart form data
    +        json (Optional[dict]): (optional) json data
    +        files (Optional[dict]): (optional) files to be sent
    +        cookies (Optional[dict]): (optional) cookies data
    +
    +    Returns:
    +        dictionary containing the information to perform a request
    +    """
    +    _params = params if params is not None else {
    +        'ns': project or self._default_project
         }
     
    -    if data is not None:
    -        url_data['data'] = data
    -
    -    if json is not None:
    -        url_data['json'] = json
    +    url_data = {
    +        'url': f'{self._scheme}://{self._base_url}/api{endpoint}',
    +        'headers': self._headers,
    +        'params': _params,
    +    }
     
    -    if files is not None:
    -        url_data['files'] = files
    +    if data is not None:
    +        url_data['data'] = data
     
    -    if cookies is not None:
    -        url_data['cookies'] = cookies
    +    if json is not None:
    +        url_data['json'] = json
     
    -    return url_data
    +    if files is not None:
    +        url_data['files'] = files
    +
    +    if cookies is not None:
    +        url_data['cookies'] = cookies
    +
    +    return url_data
     
    @@ -2111,7 +2123,6 @@

    __raise_for_status(response) @@ -2158,13 +2169,7 @@

    Source code in ydata/sdk/common/client/client.py -
    188
    -189
    -190
    -191
    -192
    -193
    -194
    +            
    194
     195
     196
     197
    @@ -2175,24 +2180,30 @@ 

    202 203 204 -205

    def __raise_for_status(self, response: Response) -> None:
    -    """Raise an exception if the response is not OK.
    -
    -    When an exception is raised, we try to convert it to a ResponseError which is
    -    a wrapper around a backend error. This usually gives enough context and provides
    -    nice error message.
    -
    -    If it cannot be converted to ResponseError, it is re-raised.
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    def __raise_for_status(self, response: Response) -> None:
    +    """Raise an exception if the response is not OK.
     
    -    Args:
    -        response (Response): response to analyze
    -    """
    -    try:
    -        response.raise_for_status()
    -    except HTTPStatusError as e:
    -        with suppress(Exception):
    -            e = ResponseError(**response.json())
    -        raise e
    +    When an exception is raised, we try to convert it to a ResponseError which is
    +    a wrapper around a backend error. This usually gives enough context and provides
    +    nice error message.
    +
    +    If it cannot be converted to ResponseError, it is re-raised.
    +
    +    Args:
    +        response (Response): response to analyze
    +    """
    +    try:
    +        response.raise_for_status()
    +    except HTTPStatusError as e:
    +        with suppress(Exception):
    +            e = ResponseError(**response.json())
    +        raise e
     
    @@ -2204,7 +2215,6 @@

    __set_global() @@ -2217,13 +2227,13 @@

    Source code in ydata/sdk/common/client/client.py -
    - + @@ -1221,6 +1221,7 @@

    Metadata

    + diff --git a/0.7/sdk/reference/api/synthesizers/base/index.html b/0.7/sdk/reference/api/synthesizers/base/index.html index fb788e96..278749f3 100644 --- a/0.7/sdk/reference/api/synthesizers/base/index.html +++ b/0.7/sdk/reference/api/synthesizers/base/index.html @@ -1097,21 +1097,21 @@
  • - fit() + fit
  • - list() + list
  • - sample() + sample
  • @@ -1261,7 +1261,7 @@

    Main synthesizer class.

    -

    This class cannot be directly instanciated because of the specificities between RegularSynthesizer and TimeSeriesSynthesizer sample methods.

    +

This class cannot be directly instantiated because of the differences between the RegularSynthesizer, TimeSeriesSynthesizer, and MultiTableSynthesizer sample methods.
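Since the class is abstract, a concrete subclass is used in practice. A minimal sketch of the fit/sample/status flow listed below, assuming a tabular pandas DataFrame as training data and that RegularSynthesizer.sample accepts an n_samples argument:

import os

import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

os.environ["YDATA_TOKEN"] = '<TOKEN>'

X = pd.read_csv('data.csv')  # hypothetical local training data

synth = RegularSynthesizer()
synth.fit(X, datatype='tabular')        # datatype is required when X is a pandas DataFrame
sample = synth.sample(n_samples=1000)   # assumed signature for the regular synthesizer
print(synth.status)                     # current status of the synthesizer instance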

    Methods

    • fit: train a synthesizer instance.
    • @@ -1305,7 +1305,9 @@

      Methods

      Source code in ydata/sdk/synthesizers/synthesizer.py -
    def __set_global(self) -> None:
    -    """Sets a client instance as global."""
    -    # If the client is stateful, close it gracefully!
    -    Client.GLOBAL_CLIENT = self
    +            
    def __set_global(self) -> None:
    +    """Sets a client instance as global."""
    +    # If the client is stateful, close it gracefully!
    +    Client.GLOBAL_CLIENT = self
     
    @@ -2235,7 +2245,6 @@

    get(endpoint, params=None, project=None, cookies=None, raise_for_status=True) @@ -2330,9 +2339,7 @@

    Source code in ydata/sdk/common/client/client.py -
    - + @@ -1368,7 +1368,7 @@ - + @@ -1552,7 +1552,7 @@ returnself._model.uid@property -deftype(self)->str: +deftype(self)->ConnectorType:returnself._model.type@staticmethod @@ -1680,7 +1680,6 @@ -

    create(connector_type, credentials, name=None, client=None) @@ -1839,7 +1838,6 @@

    get(uid, client=None) @@ -1968,7 +1966,6 @@

    -

    list(client=None) @@ -2078,6 +2075,7 @@

    +

    ConnectorType

    @@ -2106,7 +2104,6 @@

    -

    AWS_S3 = 'aws-s3' @@ -2129,7 +2126,6 @@

    -

    AZURE_BLOB = 'azure-blob' @@ -2152,7 +2148,6 @@

    -

    AZURE_SQL = 'azure-sql' @@ -2175,7 +2170,6 @@

    -

    BIGQUERY = 'google-bigquery' @@ -2198,7 +2192,6 @@

    -

    FILE = 'file' @@ -2221,7 +2214,6 @@

    -

    GCS = 'gcs' @@ -2244,7 +2236,6 @@

    -

    MYSQL = 'mysql' @@ -2267,7 +2258,6 @@

    -

    SNOWFLAKE = 'snowflake' @@ -2294,6 +2284,7 @@
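For illustration, a hedged sketch of creating a connector of one of these types and then a datasource on top of it, using the create signatures documented on these pages (the import paths for ConnectorType and DataSourceType, and the credentials keys, are assumptions):

from ydata.sdk.connectors import Connector, ConnectorType
from ydata.sdk.datasources import DataSource, DataSourceType

# Placeholder credentials; the required keys depend on the chosen ConnectorType
credentials = {"access_key_id": "<ACCESS_KEY_ID>", "secret_access_key": "<SECRET_ACCESS_KEY>"}

connector = Connector.create(
    connector_type=ConnectorType.AWS_S3, credentials=credentials, name='my-s3')

# Connector-specific options can be passed as extra keyword arguments (**config)
datasource = DataSource.create(connector, datatype=DataSourceType.TABULAR, name='my-datasource')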

    + diff --git a/0.7/sdk/reference/api/datasources/datasource/index.html b/0.7/sdk/reference/api/datasources/datasource/index.html index 448aad3d..4c69a33c 100644 --- a/0.7/sdk/reference/api/datasources/datasource/index.html +++ b/0.7/sdk/reference/api/datasources/datasource/index.html @@ -1000,21 +1000,21 @@
  • - create() + create
  • - get() + get
  • - list() + list
  • @@ -1032,55 +1032,6 @@ Status - - -
  • - - AVAILABLE - - -
  • - -
  • - - DELETED - - -
  • - -
  • - - FAILED - - -
  • - -
  • - - PREPARING - - -
  • - -
  • - - UNAVAILABLE - - -
  • - -
  • - - UNKNOWN - - -
  • - -
  • - - VALIDATING - -
  • @@ -1410,7 +1361,7 @@

    - + @@ -1421,7 +1372,7 @@ - + @@ -1432,7 +1383,7 @@ - + @@ -1443,7 +1394,7 @@ - + @@ -1622,18 +1573,7 @@ 182183184 -185 -186 -187 -188 -189 -190 -191 -192 -193 -194 -195 -196
     86
    - 87
    - 88
    +            
     88
      89
      90
      91
    @@ -2349,26 +2356,32 @@ 

    102 103 104 -105

    def get(self, endpoint: str, params: Optional[Dict] = None,
    -        project: Project | None = None, cookies: Optional[Dict] = None, raise_for_status: bool = True) -> Response:
    -    """GET request to the backend.
    -
    -    Args:
    -        endpoint (str): GET endpoint
    -        cookies (Optional[dict]): (optional) cookies data
    -        raise_for_status (bool): raise an exception on error
    -
    -    Returns:
    -        Response object
    -    """
    -    url_data = self.__build_url(endpoint, params=params,
    -                                cookies=cookies, project=project)
    -    response = self._http_client.get(**url_data)
    -
    -    if response.status_code != Client.codes.OK and raise_for_status:
    -        self.__raise_for_status(response)
    -
    -    return response
    +105
    +106
    +107
    +108
    +109
    def get(
    +    self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,
    +    cookies: Optional[Dict] = None, raise_for_status: bool = True
    +) -> Response:
    +    """GET request to the backend.
    +
    +    Args:
    +        endpoint (str): GET endpoint
    +        cookies (Optional[dict]): (optional) cookies data
    +        raise_for_status (bool): raise an exception on error
    +
    +    Returns:
    +        Response object
    +    """
    +    url_data = self.__build_url(endpoint, params=params,
    +                                cookies=cookies, project=project)
    +    response = self._http_client.get(**url_data)
    +
    +    if response.status_code != Client.codes.OK and raise_for_status:
    +        self.__raise_for_status(response)
    +
    +    return response
     
    @@ -2380,7 +2393,6 @@

    -

    get_static_file(endpoint, project=None, raise_for_status=True) @@ -2461,11 +2473,7 @@

    Source code in ydata/sdk/common/client/client.py -
    107
    -108
    -109
    -110
    -111
    +            
    111
     112
     113
     114
    @@ -2478,24 +2486,32 @@ 

    121 122 123 -124

    def get_static_file(self, endpoint: str, project: Project | None = None, raise_for_status: bool = True) -> Response:
    -    """Retrieve a static file from the backend.
    -
    -    Args:
    -        endpoint (str): GET endpoint
    -        raise_for_status (bool): raise an exception on error
    -
    -    Returns:
    -        Response object
    -    """
    -    url_data = self.__build_url(endpoint, project=project)
    -    url_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'
    -    response = self._http_client.get(**url_data)
    -
    -    if response.status_code != Client.codes.OK and raise_for_status:
    -        self.__raise_for_status(response)
    -
    -    return response
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    def get_static_file(
    +    self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True
    +) -> Response:
    +    """Retrieve a static file from the backend.
    +
    +    Args:
    +        endpoint (str): GET endpoint
    +        raise_for_status (bool): raise an exception on error
    +
    +    Returns:
    +        Response object
    +    """
    +    url_data = self.__build_url(endpoint, project=project)
    +    url_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'
    +    response = self._http_client.get(**url_data)
    +
    +    if response.status_code != Client.codes.OK and raise_for_status:
    +        self.__raise_for_status(response)
    +
    +    return response
     
    @@ -2507,7 +2523,6 @@

    post(endpoint, data=None, json=None, project=None, files=None, raise_for_status=True) @@ -2651,28 +2666,32 @@

    81 82 83 -84

    def post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,
    -         project: Project | None = None, files: Optional[Dict] = None, raise_for_status: bool = True) -> Response:
    -    """POST request to the backend.
    -
    -    Args:
    -        endpoint (str): POST endpoint
    -        data (Optional[dict]): (optional) multipart form data
    -        json (Optional[dict]): (optional) json data
    -        files (Optional[dict]): (optional) files to be sent
    -        raise_for_status (bool): raise an exception on error
    -
    -    Returns:
    -        Response object
    -    """
    -    url_data = self.__build_url(
    -        endpoint, data=data, json=json, files=files, project=project)
    -    response = self._http_client.post(**url_data)
    -
    -    if response.status_code != Client.codes.OK and raise_for_status:
    -        self.__raise_for_status(response)
    -
    -    return response
    +84
    +85
    +86
    def post(
    +    self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,
    +    project: Optional[Project] = None, files: Optional[Dict] = None, raise_for_status: bool = True
    +) -> Response:
    +    """POST request to the backend.
    +
    +    Args:
    +        endpoint (str): POST endpoint
    +        data (Optional[dict]): (optional) multipart form data
    +        json (Optional[dict]): (optional) json data
    +        files (Optional[dict]): (optional) files to be sent
    +        raise_for_status (bool): raise an exception on error
    +
    +    Returns:
    +        Response object
    +    """
    +    url_data = self.__build_url(
    +        endpoint, data=data, json=json, files=files, project=project)
    +    response = self._http_client.post(**url_data)
    +
    +    if response.status_code != Client.codes.OK and raise_for_status:
    +        self.__raise_for_status(response)
    +
    +    return response
     
    @@ -2685,6 +2704,7 @@

    + diff --git a/0.7/sdk/reference/api/common/types/index.html b/0.7/sdk/reference/api/common/types/index.html index e130036b..da860399 100644 --- a/0.7/sdk/reference/api/common/types/index.html +++ b/0.7/sdk/reference/api/common/types/index.html @@ -1170,6 +1170,7 @@

    Types

    +
    @@ -1180,6 +1181,7 @@

    Types

    +
    diff --git a/0.7/sdk/reference/api/connectors/connector/index.html b/0.7/sdk/reference/api/connectors/connector/index.html index 4a7b1a1b..38cd28c6 100644 --- a/0.7/sdk/reference/api/connectors/connector/index.html +++ b/0.7/sdk/reference/api/connectors/connector/index.html @@ -946,21 +946,21 @@
  • - create() + create
  • - get() + get
  • - list() + list
  • @@ -1357,7 +1357,7 @@
    uiduid UID
    typetype ConnectorType
    uiduid UID
    datatypedatatype DataSourceType
    statusstatus Status
    metadatametadata Metadata
    class DataSource(ModelFactoryMixin):
    +185
    class DataSource(ModelFactoryMixin):
         """A [`DataSource`][ydata.sdk.datasources.DataSource] represents a dataset
         to be used by a Synthesizer as training data.
     
    @@ -1790,25 +1730,14 @@
             return datasource
     
         @staticmethod
    -    def _resolve_api_status(api_status: Dict) -> Status:
    -        status = Status(api_status.get('state', Status.UNKNOWN.name))
    -        validation = ValidationState(api_status.get('validation', {}).get(
    -            'state', ValidationState.UNKNOWN.name))
    -        if validation == ValidationState.FAILED:
    -            status = Status.FAILED
    -        return status
    -
    -    @staticmethod
    -    def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:
    -        data['datatype'] = data.pop('dataType')
    -        data['state'] = data['status']
    -        data['status'] = DataSource._resolve_api_status(data['status'])
    -        data = filter_dict(datasource_type, data)
    -        model = datasource_type(**data)
    -        return model
    -
    -    def __repr__(self):
    -        return self._model.__repr__()
    +    def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:
    +        data['datatype'] = data.pop('dataType', None)
    +        data = filter_dict(datasource_type, data)
    +        model = datasource_type(**data)
    +        return model
    +
    +    def __repr__(self):
    +        return self._model.__repr__()
     
    @@ -1829,7 +1758,6 @@ -

    create(connector, datatype=DataSourceType.TABULAR, name=None, wait_for_metadata=True, client=None, **config) @@ -2013,7 +1941,6 @@

    get(uid, client=None) @@ -2146,7 +2073,6 @@

    list(client=None) @@ -2277,6 +2203,7 @@

    Status

    @@ -2287,10 +2214,8 @@

    - Bases: StringEnum

    + Bases: BaseModel

    - -

    Represent the status of a DataSource.

    @@ -2303,174 +2228,14 @@

    -

    - AVAILABLE = 'available' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource is available and ready to be used.

    -
    - -
    - - - - -

    - DELETED = 'deleted' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource is to be deleted or has been deleted.

    -
    - -
    - - - - -

    - FAILED = 'failed' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource preparation or validation has failed.

    -
    - -
    - -
    - - - - -

    - PREPARING = 'preparing' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource is being prepared.

    -
    - -
    - -
    - - - - -

    - UNAVAILABLE = 'unavailable' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource is unavailable at the moment.

    -
    - -
    - -
    - - - - -

    - UNKNOWN = 'unknown' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource status could not be retrieved.

    -
    - -
    - -
    - - - - -

    - VALIDATING = 'validating' - - - class-attribute - instance-attribute - - -

    - - -
    - -

    The DataSource is being validated.

    -
    - -
    - - - - - - - -

    DataSourceType

    @@ -2500,7 +2265,6 @@

    -

    TABULAR = 'tabular' @@ -2523,7 +2287,6 @@

    -

    TIMESERIES = 'timeseries' @@ -2550,6 +2313,7 @@

    + diff --git a/0.7/sdk/reference/api/datasources/metadata/index.html b/0.7/sdk/reference/api/datasources/metadata/index.html index 53e141be..2b9e84c9 100644 --- a/0.7/sdk/reference/api/datasources/metadata/index.html +++ b/0.7/sdk/reference/api/datasources/metadata/index.html @@ -1189,7 +1189,7 @@

    Metadata

    columnscolumns List[Column]
     35
    +              
     33
    + 34
    + 35
      36
      37
      38
    @@ -1660,362 +1662,396 @@ 

    Methods

    387 388 389 -390
    @typechecked
    -class BaseSynthesizer(ABC, ModelFactoryMixin):
    -    """Main synthesizer class.
    +390
    +391
    +392
    +393
    +394
    +395
    +396
    +397
    +398
    +399
    +400
    +401
    +402
    +403
    +404
    +405
    +406
    @typechecked
    +class BaseSynthesizer(ABC, ModelFactoryMixin):
    +    """Main synthesizer class.
    +
    +    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer], [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] or [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer] `sample` methods.
     
    -    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer] and [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] `sample` methods.
    -
    -    Methods
    -    -------
    -    - `fit`: train a synthesizer instance.
    -    - `sample`: request synthetic data.
    -    - `status`: current status of the synthesizer instance.
    -
    -    Note:
    -            The synthesizer instance is created in the backend only when the `fit` method is called.
    -
    -    Arguments:
    -        client (Client): (optional) Client to connect to the backend
    -    """
    -
    -    def __init__(self, uid: UID | None = None, name: str | None = None, project: Project | None = None, client: Client | None = None):
    +    Methods
    +    -------
    +    - `fit`: train a synthesizer instance.
    +    - `sample`: request synthetic data.
    +    - `status`: current status of the synthesizer instance.
    +
    +    Note:
    +            The synthesizer instance is created in the backend only when the `fit` method is called.
    +
    +    Arguments:
    +        client (Client): (optional) Client to connect to the backend
    +    """
    +
    +    def __init__(
    +            self, uid: Optional[UID] = None, name: Optional[str] = None,
    +            project: Optional[Project] = None, client: Optional[Client] = None):
             self._init_common(client=client)
    -        self._model = mSynthesizer(uid=uid, name=name or str(
    -            uuid4())) if uid or project else None
    -        self.__project = project
    -
    -    @init_client
    -    def _init_common(self, client: Optional[Client] = None):
    -        self._client = client
    -        self._logger = create_logger(__name__, level=LOG_LEVEL)
    -
    -    def fit(self, X: Union[DataSource, pdDataFrame],
    -            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    -            datatype: Optional[Union[DataSourceType, str]] = None,
    -            sortbykey: Optional[Union[str, List[str]]] = None,
    -            entities: Optional[Union[str, List[str]]] = None,
    -            generate_cols: Optional[List[str]] = None,
    -            exclude_cols: Optional[List[str]] = None,
    -            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    -            target: Optional[str] = None,
    -            anonymize: Optional[dict] = None,
    -            condition_on: Optional[List[str]] = None) -> None:
    -        """Fit the synthesizer.
    -
    -        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    -        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.
    -
    -        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].
    -
    -        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.
    -        The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.
    -
    -        Arguments:
    -            X (Union[DataSource, pandas.DataFrame]): Training dataset
    -            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
    -            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]
    -            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets
    -            entities (Union[str, List[str]]): (optional) columns representing entities ID
    -            generate_cols (List[str]): (optional) columns that should be synthesized
    -            exclude_cols (List[str]): (optional) columns that should not be synthesized
    -            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
    -            target (Optional[str]): (optional) Target for the dataset
    -            name (Optional[str]): (optional) Synthesizer instance name
    -            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy
    -            condition_on: (Optional[List[str]]): (optional) list of features to condition upon
    -        """
    -        if self._is_initialized():
    -            raise AlreadyFittedError()
    -
    -        _datatype = DataSourceType(datatype) if isinstance(
    -            X, pdDataFrame) else DataSourceType(X.datatype)
    -
    -        dataset_attrs = self._init_datasource_attributes(
    -            sortbykey, entities, generate_cols, exclude_cols, dtypes)
    -        self._validate_datasource_attributes(X, dataset_attrs, _datatype, target)
    -
    -        # If the training data is a pandas dataframe, we first need to create a data source and then the instance
    -        if isinstance(X, pdDataFrame):
    -            if X.empty:
    -                raise EmptyDataError("The DataFrame is empty")
    -            _X = LocalDataSource(source=X, datatype=_datatype, client=self._client)
    -        else:
    -            if datatype != _datatype:
    -                warn("When the training data is a DataSource, the argument `datatype` is ignored.",
    -                     DataSourceTypeWarning)
    -            _X = X
    -
    -        if _X.status != dsStatus.AVAILABLE:
    -            raise DataSourceNotAvailableError(
    -                f"The datasource '{_X.uid}' is not available (status = {_X.status.value})")
    -
    -        if isinstance(dataset_attrs, dict):
    -            dataset_attrs = DataSourceAttrs(**dataset_attrs)
    -
    -        self._fit_from_datasource(
    -            X=_X, dataset_attrs=dataset_attrs, target=target,
    -            anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)
    -
    -    @staticmethod
    -    def _init_datasource_attributes(
    -            sortbykey: Optional[Union[str, List[str]]],
    -            entities: Optional[Union[str, List[str]]],
    -            generate_cols: Optional[List[str]],
    -            exclude_cols: Optional[List[str]],
    -            dtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:
    -        dataset_attrs = {
    -            'sortbykey': sortbykey if sortbykey is not None else [],
    -            'entities': entities if entities is not None else [],
    -            'generate_cols': generate_cols if generate_cols is not None else [],
    -            'exclude_cols': exclude_cols if exclude_cols is not None else [],
    -            'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}
    -        }
    -        return DataSourceAttrs(**dataset_attrs)
    -
    -    @staticmethod
    -    def _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):
    -        columns = []
    -        if isinstance(X, pdDataFrame):
    -            columns = X.columns
    -            if datatype is None:
    -                raise DataTypeMissingError(
    -                    "Argument `datatype` is mandatory for pandas.DataFrame training data")
    -            datatype = DataSourceType(datatype)
    -        else:
    -            columns = [c.name for c in X.metadata.columns]
    -
    -        if target is not None and target not in columns:
    -            raise DataSourceAttrsError(
    -                "Invalid target: column '{target}' does not exist")
    -
    -        if datatype == DataSourceType.TIMESERIES:
    -            if not dataset_attrs.sortbykey:
    -                raise DataSourceAttrsError(
    -                    "The argument `sortbykey` is mandatory for timeseries datasource.")
    -
    -        invalid_fields = {}
    -        for field, v in dataset_attrs.dict().items():
    -            field_columns = v if field != 'dtypes' else v.keys()
    -            not_in_cols = [c for c in field_columns if c not in columns]
    -            if len(not_in_cols) > 0:
    -                invalid_fields[field] = not_in_cols
    -
    -        if len(invalid_fields) > 0:
    -            error_msgs = ["\t- Field '{}': columns {} do not exist".format(
    -                f, ', '.join(v)) for f, v in invalid_fields.items()]
    -            raise DataSourceAttrsError(
    -                "The dataset attributes are invalid:\n {}".format('\n'.join(error_msgs)))
    -
    -    @staticmethod
    -    def _metadata_to_payload(
    -        datatype: DataSourceType, ds_metadata: Metadata,
    -        dataset_attrs: Optional[DataSourceAttrs] = None, target: str | None = None
    -    ) -> dict:
    -        """Transform a the metadata and dataset attributes into a valid
    -        payload.
    -
    -        Arguments:
    -            datatype (DataSourceType): datasource type
    -            ds_metadata (Metadata): datasource metadata object
    -            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes
    -            target (Optional[str]): (optional) target column name
    -
    -        Returns:
    -            metadata payload dictionary
    -        """
    -
    -        columns = [
    -            {
    -                'name': c.name,
    -                'generation': True and c.name not in dataset_attrs.exclude_cols,
    -                'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,
    -                'varType': c.vartype,
    -            }
    -            for c in ds_metadata.columns]
    -
    -        metadata = {
    -            'columns': columns,
    -            'target': target
    -        }
    -
    -        if dataset_attrs is not None:
    -            if datatype == DataSourceType.TIMESERIES:
    -                metadata['sortBy'] = [c for c in dataset_attrs.sortbykey]
    -                metadata['entity'] = [c for c in dataset_attrs.entities]
    -
    -        return metadata
    -
    -    def _fit_from_datasource(
    -        self,
    -        X: DataSource,
    -        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    -        dataset_attrs: Optional[DataSourceAttrs] = None,
    -        target: Optional[str] = None,
    -        anonymize: Optional[dict] = None,
    -        condition_on: Optional[List[str]] = None
    -    ) -> None:
    -        metadata = self._metadata_to_payload(
    -            DataSourceType(X.datatype), X.metadata, dataset_attrs, target)
    -        payload = {
    -            'name': self._model.name,
    -            'dataSourceUID': X.uid,
    -            'metadata': metadata,
    -            'extraData': {},
    -            'privacyLevel': privacy_level.value
    -        }
    -        if anonymize is not None:
    -            payload["extraData"]["anonymize"] = anonymize
    -        if condition_on is not None:
    -            payload["extraData"]["condition_on"] = condition_on
    -
    -        response = self._client.post(
    -            '/synthesizer/', json=payload, project=self.__project)
    -        data: list = response.json()
    -        self._model, _ = self._model_from_api(X.datatype, data)
    -        while self.status not in [Status.READY, Status.FAILED]:
    -            self._logger.info('Training the synthesizer...')
    -            sleep(BACKOFF)
    -
    -        if self.status == Status.FAILED:
    -            raise FittingError('Could not train the synthesizer')
    -
    -    @staticmethod
    -    def _model_from_api(datatype: str, data: Dict) -> Tuple[mSynthesizer, Type["BaseSynthesizer"]]:
    -        from ydata.sdk.synthesizers._models.synthesizer_map import TYPE_TO_CLASS
    -        synth_cls = TYPE_TO_CLASS.get(SynthesizerType(datatype).value)
    -        data['status'] = synth_cls._resolve_api_status(data['status'])
    -        data = filter_dict(mSynthesizer, data)
    -        return mSynthesizer(**data), synth_cls
    +        self._model = mSynthesizer(uid=uid, name=name or str(uuid4()))
    +        self._project = project
    +
    +    @init_client
    +    def _init_common(self, client: Optional[Client] = None):
    +        self._client = client
    +        self._logger = create_logger(__name__, level=LOG_LEVEL)
    +
    +    def fit(self, X: Union[DataSource, pdDataFrame],
    +            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    +            datatype: Optional[Union[DataSourceType, str]] = None,
    +            sortbykey: Optional[Union[str, List[str]]] = None,
    +            entities: Optional[Union[str, List[str]]] = None,
    +            generate_cols: Optional[List[str]] = None,
    +            exclude_cols: Optional[List[str]] = None,
    +            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    +            target: Optional[str] = None,
    +            anonymize: Optional[dict] = None,
    +            condition_on: Optional[List[str]] = None) -> None:
    +        """Fit the synthesizer.
    +
    +        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    +        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.
    +
    +        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].
    +
    +        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.
    +        The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.
    +
    +        Arguments:
    +            X (Union[DataSource, pandas.DataFrame]): Training dataset
    +            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
    +            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]
    +            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets
    +            entities (Union[str, List[str]]): (optional) columns representing entities ID
    +            generate_cols (List[str]): (optional) columns that should be synthesized
    +            exclude_cols (List[str]): (optional) columns that should not be synthesized
    +            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
    +            target (Optional[str]): (optional) Target for the dataset
    +            name (Optional[str]): (optional) Synthesizer instance name
    +            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy
    +            condition_on: (Optional[List[str]]): (optional) list of features to condition upon
    +        """
    +        if self._is_initialized():
    +            raise AlreadyFittedError()
    +
    +        _datatype = DataSourceType(datatype) if isinstance(
    +            X, pdDataFrame) else DataSourceType(X.datatype)
    +
    +        dataset_attrs = self._init_datasource_attributes(
    +            sortbykey, entities, generate_cols, exclude_cols, dtypes)
    +        self._validate_datasource_attributes(X, dataset_attrs, _datatype, target)
    +
    +        # If the training data is a pandas dataframe, we first need to create a data source and then the instance
    +        if isinstance(X, pdDataFrame):
    +            if X.empty:
    +                raise EmptyDataError("The DataFrame is empty")
    +            _X = LocalDataSource(source=X, datatype=_datatype, client=self._client)
    +        else:
    +            if datatype != _datatype:
    +                warn("When the training data is a DataSource, the argument `datatype` is ignored.",
    +                     DataSourceTypeWarning)
    +            _X = X
    +
    +        if _X.status != dsStatus.AVAILABLE:
    +            raise DataSourceNotAvailableError(
    +                f"The datasource '{_X.uid}' is not available (status = {_X.status.value})")
    +
    +        if isinstance(dataset_attrs, dict):
    +            dataset_attrs = DataSourceAttrs(**dataset_attrs)
    +
    +        self._fit_from_datasource(
    +            X=_X, dataset_attrs=dataset_attrs, target=target,
    +            anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)
    +
    +    @staticmethod
    +    def _init_datasource_attributes(
    +            sortbykey: Optional[Union[str, List[str]]],
    +            entities: Optional[Union[str, List[str]]],
    +            generate_cols: Optional[List[str]],
    +            exclude_cols: Optional[List[str]],
    +            dtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:
    +        dataset_attrs = {
    +            'sortbykey': sortbykey if sortbykey is not None else [],
    +            'entities': entities if entities is not None else [],
    +            'generate_cols': generate_cols if generate_cols is not None else [],
    +            'exclude_cols': exclude_cols if exclude_cols is not None else [],
    +            'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}
    +        }
    +        return DataSourceAttrs(**dataset_attrs)
    +
    +    @staticmethod
    +    def _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):
    +        columns = []
    +        if isinstance(X, pdDataFrame):
    +            columns = X.columns
    +            if datatype is None:
    +                raise DataTypeMissingError(
    +                    "Argument `datatype` is mandatory for pandas.DataFrame training data")
    +            datatype = DataSourceType(datatype)
    +        else:
    +            columns = [c.name for c in X.metadata.columns]
    +
    +        if target is not None and target not in columns:
    +            raise DataSourceAttrsError(
    +                f"Invalid target: column '{target}' does not exist")
    +
    +        if datatype == DataSourceType.TIMESERIES:
    +            if not dataset_attrs.sortbykey:
    +                raise DataSourceAttrsError(
    +                    "The argument `sortbykey` is mandatory for timeseries datasource.")
    +
    +        invalid_fields = {}
    +        for field, v in dataset_attrs.dict().items():
    +            field_columns = v if field != 'dtypes' else v.keys()
    +            not_in_cols = [c for c in field_columns if c not in columns]
    +            if len(not_in_cols) > 0:
    +                invalid_fields[field] = not_in_cols
    +
    +        if len(invalid_fields) > 0:
    +            error_msgs = ["\t- Field '{}': columns {} do not exist".format(
    +                f, ', '.join(v)) for f, v in invalid_fields.items()]
    +            raise DataSourceAttrsError(
    +                "The dataset attributes are invalid:\n {}".format('\n'.join(error_msgs)))
    +
    +    @staticmethod
    +    def _metadata_to_payload(
    +        datatype: DataSourceType, ds_metadata: Metadata,
    +        dataset_attrs: Optional[DataSourceAttrs] = None, target: Optional[str] = None
    +    ) -> dict:
    +        """Transform the metadata and dataset attributes into a valid
    +        payload.
    +
    +        Arguments:
    +            datatype (DataSourceType): datasource type
    +            ds_metadata (Metadata): datasource metadata object
    +            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes
    +            target (Optional[str]): (optional) target column name
    +
    +        Returns:
    +            metadata payload dictionary
    +        """
    +
    +        columns = [
    +            {
    +                'name': c.name,
    +                'generation': True and c.name not in dataset_attrs.exclude_cols,
    +                'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,
    +                'varType': c.vartype,
    +            }
    +            for c in ds_metadata.columns]
    +
    +        metadata = {
    +            'columns': columns,
    +            'target': target
    +        }
    +
    +        if dataset_attrs is not None:
    +            if datatype == DataSourceType.TIMESERIES:
    +                metadata['sortBy'] = [c for c in dataset_attrs.sortbykey]
    +                metadata['entity'] = [c for c in dataset_attrs.entities]
    +
    +        return metadata
    +
    +    def _fit_from_datasource(
    +        self,
    +        X: DataSource,
    +        privacy_level: Optional[PrivacyLevel] = None,
    +        dataset_attrs: Optional[DataSourceAttrs] = None,
    +        target: Optional[str] = None,
    +        anonymize: Optional[dict] = None,
    +        condition_on: Optional[List[str]] = None
    +    ) -> None:
    +        payload = self._create_payload()
    +
    +        payload['dataSourceUID'] = X.uid
    +
    +        if privacy_level:
    +            payload['privacy_level'] = privacy_level.value
    +
    +        if X.metadata is not None and X.datatype is not None:
    +            payload['metadata'] = self._metadata_to_payload(
    +                DataSourceType(X.datatype), X.metadata, dataset_attrs, target)
    +
    +        if anonymize is not None:
    +            payload["extraData"]["anonymize"] = anonymize
    +        if condition_on is not None:
    +            payload["extraData"]["condition_on"] = condition_on
    +
    +        response = self._client.post(
    +            '/synthesizer/', json=payload, project=self._project)
    +        data = response.json()
    +        self._model = mSynthesizer(**data)
    +        while self._check_fitting_not_finished(self.status):
    +            self._logger.info('Training the synthesizer...')
    +            sleep(BACKOFF)
    +
    +    def _create_payload(self) -> dict:
    +        payload = {
    +            'extraData': {}
    +        }
    +
    +        if self._model and self._model.name:
    +            payload['name'] = self._model.name
    +
    +        return payload
     
    -    @abstractmethod
    -    def sample(self) -> pdDataFrame:
    -        """Abstract method to sample from a synthesizer."""
    -
    -    def _sample(self, payload: Dict) -> pdDataFrame:
    -        """Sample from a synthesizer.
    -
    -        Arguments:
    -            payload (dict): payload configuring the sample request
    -
    -        Returns:
    -            pandas `DataFrame`
    -        """
    -        response = self._client.post(
    -            f"/synthesizer/{self.uid}/sample", json=payload, project=self.__project)
    +    def _check_fitting_not_finished(self, status: Status) -> bool:
    +        self._logger.debug(f'checking status {status}')
    +
    +        if status.state in [Status.State.READY, Status.State.REPORT]:
    +            return False
    +
    +        self._logger.debug(f'status not ready yet {status.state}')
    +
    +        if status.prepare and PrepareState(status.prepare.state) == PrepareState.FAILED:
    +            raise FittingError('Could not train the synthesizer')
    +
    +        if status.training and TrainingState(status.training.state) == TrainingState.FAILED:
    +            raise FittingError('Could not train the synthesizer')
    +
    +        return True
     
    -        data: Dict = response.json()
    -        sample_uid = data.get('uid')
    -        sample_status = None
    -        while sample_status not in ['finished', 'failed']:
    -            self._logger.info('Sampling from the synthesizer...')
    -            response = self._client.get(
    -                f'/synthesizer/{self.uid}/history', project=self.__project)
    -            history: Dict = response.json()
    -            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)
    -            sample_status = sample_data.get('status', {}).get('state')
    -            sleep(BACKOFF)
    -
    -        response = self._client.get_static_file(
    -            f'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self.__project)
    -        data = StringIO(response.content.decode())
    -        return read_csv(data)
    -
    -    @property
    -    def uid(self) -> UID:
    -        """Get the status of a synthesizer instance.
    -
    -        Returns:
    -            Synthesizer status
    -        """
    -        if not self._is_initialized():
    -            return Status.NOT_INITIALIZED
    -
    -        return self._model.uid
    -
    -    @property
    -    def status(self) -> Status:
    -        """Get the status of a synthesizer instance.
    +    @abstractmethod
    +    def sample(self) -> pdDataFrame:
    +        """Abstract method to sample from a synthesizer."""
    +
    +    def _sample(self, payload: Dict) -> pdDataFrame:
    +        """Sample from a synthesizer.
    +
    +        Arguments:
    +            payload (dict): payload configuring the sample request
    +
    +        Returns:
    +            pandas `DataFrame`
    +        """
    +        response = self._client.post(
    +            f"/synthesizer/{self.uid}/sample", json=payload, project=self._project)
    +
    +        data: Dict = response.json()
    +        sample_uid = data.get('uid')
    +        sample_status = None
    +        while sample_status not in ['finished', 'failed']:
    +            self._logger.info('Sampling from the synthesizer...')
    +            response = self._client.get(
    +                f'/synthesizer/{self.uid}/history', project=self._project)
    +            history: Dict = response.json()
    +            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)
    +            sample_status = sample_data.get('status', {}).get('state')
    +            sleep(BACKOFF)
    +
    +        response = self._client.get_static_file(
    +            f'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self._project)
    +        data = StringIO(response.content.decode())
    +        return read_csv(data)
     
    -        Returns:
    -            Synthesizer status
    -        """
    -        if not self._is_initialized():
    -            return Status.NOT_INITIALIZED
    -
    -        try:
    -            self = self.get(self._model.uid, self._client)
    -            return self._model.status
    -        except Exception:  # noqa: PIE786
    -            return Status.UNKNOWN
    +    @property
    +    def uid(self) -> UID:
    +        """Get the UID of a synthesizer instance.
    +
    +        Returns:
    +            Synthesizer UID
    +        """
    +        if not self._is_initialized():
    +            return Status.State.NOT_INITIALIZED
    +
    +        return self._model.uid
     
    -    def get(self):
    -        assert self._is_initialized() and self._model.uid, InputError(
    -            "Please provide the synthesizer `uid`")
    +    @property
    +    def status(self) -> Status:
    +        """Get the status of a synthesizer instance.
     
    -        response = self._client.get(f'/synthesizer/{self.uid}', project=self.__project)
    -        data = filter_dict(mSynthesizer, response.json())
    -        self._model = mSynthesizer(**data)
    -
    -        return self
    +        Returns:
    +            Synthesizer status
    +        """
    +        if not self._is_initialized():
    +            return Status.not_initialized()
     
    -    @staticmethod
    -    @init_client
    -    def list(client: Optional[Client] = None) -> SynthesizersList:
    -        """List the synthesizer instances.
    -
    -        Arguments:
    -            client (Client): (optional) Client to connect to the backend
    -
    -        Returns:
    -            List of synthesizers
    -        """
    -        def __process_data(data: list) -> list:
    -            to_del = ['metadata', 'report', 'mode']
    -            for e in data:
    -                for k in to_del:
    -                    e.pop(k, None)
    -            return data
    -
    -        response = client.get('/synthesizer')
    -        data: list = response.json()
    -        data = __process_data(data)
    -
    -        return SynthesizersList(data)
    +        try:
    +            self = self.get()
    +            return self._model.status
    +        except Exception:  # noqa: PIE786
    +            return Status.unknown()
    +
    +    def get(self):
    +        assert self._is_initialized() and self._model.uid, InputError(
    +            "Please provide the synthesizer `uid`")
    +
    +        response = self._client.get(f'/synthesizer/{self.uid}', project=self._project)
    +        data = response.json()
    +        self._model = mSynthesizer(**data)
    +
    +        return self
    +
    +    @staticmethod
    +    @init_client
    +    def list(client: Optional[Client] = None) -> SynthesizersList:
    +        """List the synthesizer instances.
    +
    +        Arguments:
    +            client (Client): (optional) Client to connect to the backend
     
    -    def _is_initialized(self) -> bool:
    -        """Determine if a synthesizer is instanciated or not.
    -
    -        Returns:
    -            True if the synthesizer is instanciated
    -        """
    -        return self._model is not None
    -
    -    @staticmethod
    -    def _resolve_api_status(api_status: Dict) -> Status:
    -        """Determine the status of the Synthesizer.
    -
    -        The status of the synthesizer instance is determined by the state of
    -        its different components.
    -
    -        Arguments:
    -            api_status (dict): json from the endpoint GET /synthesizer
    -
    -        Returns:
    -            Synthesizer Status
    -        """
    -        status = Status(api_status.get('state', Status.UNKNOWN.name))
    -        if status == Status.PREPARE:
    -            if PrepareState(api_status.get('prepare', {}).get(
    -                    'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:
    -                return Status.FAILED
    -        elif status == Status.TRAIN:
    -            if TrainingState(api_status.get('training', {}).get(
    -                    'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:
    -                return Status.FAILED
    -        elif status == Status.REPORT:
    -            return Status.READY
    -        return status
    +        Returns:
    +            List of synthesizers
    +        """
    +        def __process_data(data: list) -> list:
    +            to_del = ['metadata', 'report', 'mode']
    +            for e in data:
    +                for k in to_del:
    +                    e.pop(k, None)
    +            return data
    +
    +        response = client.get('/synthesizer')
    +        data: list = response.json()
    +        data = __process_data(data)
    +
    +        return SynthesizersList(data)
    +
    +    def _is_initialized(self) -> bool:
    +        """Determine if a synthesizer is instantiated or not.
    +
    +        Returns:
    +            True if the synthesizer is instantiated
    +        """
    +        return self._model is not None
    +
    +    @staticmethod
    +    def _resolve_api_status(api_status: Dict) -> Status:
    +        """Determine the status of the Synthesizer.
    +
    +        The status of the synthesizer instance is determined by the state of
    +        its different components.
    +
    +        Arguments:
    +            api_status (dict): json from the endpoint GET /synthesizer
    +
    +        Returns:
    +            Synthesizer Status
    +        """
    +        status = Status(api_status.get('state', Status.UNKNOWN.name))
    +        if status == Status.PREPARE:
    +            if PrepareState(api_status.get('prepare', {}).get(
    +                    'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:
    +                return Status.FAILED
    +        elif status == Status.TRAIN:
    +            if TrainingState(api_status.get('training', {}).get(
    +                    'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:
    +                return Status.FAILED
    +        elif status == Status.REPORT:
    +            return Status.READY
    +        return status
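
    For reference, a minimal usage sketch of the workflow the base class above implies (instantiate a concrete synthesizer, fit, then sample). It assumes a RegularSynthesizer, the `census` demo dataset and a `YDATA_TOKEN` environment variable, mirroring the examples later in this document:

    import os
    from ydata.sdk.dataset import get_dataset
    from ydata.sdk.synthesizers import RegularSynthesizer

    os.environ["YDATA_TOKEN"] = '<TOKEN>'    # token used by the backend client

    X = get_dataset('census')                # any pandas DataFrame works here
    synth = RegularSynthesizer()             # nothing is created in the backend yet
    synth.fit(X)                             # creates the datasource and trains the synthesizer
    sample = synth.sample(n_samples=50)      # polls the backend until the sample is ready
    print(sample.shape)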
     
    @@ -2033,7 +2069,6 @@

    Methods

    status: Status @@ -2079,7 +2114,6 @@

    uid: UID @@ -2128,7 +2162,6 @@

    fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None) @@ -2330,7 +2363,8 @@

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @@ -2394,73 +2428,72 @@ 


    def fit(self, X: Union[DataSource, pdDataFrame],
    -        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    -        datatype: Optional[Union[DataSourceType, str]] = None,
    -        sortbykey: Optional[Union[str, List[str]]] = None,
    -        entities: Optional[Union[str, List[str]]] = None,
    -        generate_cols: Optional[List[str]] = None,
    -        exclude_cols: Optional[List[str]] = None,
    -        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    -        target: Optional[str] = None,
    -        anonymize: Optional[dict] = None,
    -        condition_on: Optional[List[str]] = None) -> None:
    -    """Fit the synthesizer.
    -
    -    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    -    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.
    -
    -    The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].
    -
    -    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.
    -    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.
    -
    -    Arguments:
    -        X (Union[DataSource, pandas.DataFrame]): Training dataset
    -        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
    -        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]
    -        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets
    -        entities (Union[str, List[str]]): (optional) columns representing entities ID
    -        generate_cols (List[str]): (optional) columns that should be synthesized
    -        exclude_cols (List[str]): (optional) columns that should not be synthesized
    -        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
    -        target (Optional[str]): (optional) Target for the dataset
    -        name (Optional[str]): (optional) Synthesizer instance name
    -        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy
    -        condition_on: (Optional[List[str]]): (optional) list of features to condition upon
    -    """
    -    if self._is_initialized():
    -        raise AlreadyFittedError()
    -
    -    _datatype = DataSourceType(datatype) if isinstance(
    -        X, pdDataFrame) else DataSourceType(X.datatype)
    -
    -    dataset_attrs = self._init_datasource_attributes(
    -        sortbykey, entities, generate_cols, exclude_cols, dtypes)
    -    self._validate_datasource_attributes(X, dataset_attrs, _datatype, target)
    -
    -    # If the training data is a pandas dataframe, we first need to create a data source and then the instance
    -    if isinstance(X, pdDataFrame):
    -        if X.empty:
    -            raise EmptyDataError("The DataFrame is empty")
    -        _X = LocalDataSource(source=X, datatype=_datatype, client=self._client)
    -    else:
    -        if datatype != _datatype:
    -            warn("When the training data is a DataSource, the argument `datatype` is ignored.",
    -                 DataSourceTypeWarning)
    -        _X = X
    -
    -    if _X.status != dsStatus.AVAILABLE:
    -        raise DataSourceNotAvailableError(
    -            f"The datasource '{_X.uid}' is not available (status = {_X.status.value})")
    -
    -    if isinstance(dataset_attrs, dict):
    -        dataset_attrs = DataSourceAttrs(**dataset_attrs)
    -
    -    self._fit_from_datasource(
    -        X=_X, dataset_attrs=dataset_attrs, target=target,
    -        anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)
    def fit(self, X: Union[DataSource, pdDataFrame],
    +        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    +        datatype: Optional[Union[DataSourceType, str]] = None,
    +        sortbykey: Optional[Union[str, List[str]]] = None,
    +        entities: Optional[Union[str, List[str]]] = None,
    +        generate_cols: Optional[List[str]] = None,
    +        exclude_cols: Optional[List[str]] = None,
    +        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    +        target: Optional[str] = None,
    +        anonymize: Optional[dict] = None,
    +        condition_on: Optional[List[str]] = None) -> None:
    +    """Fit the synthesizer.
    +
    +    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    +    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.
    +
    +    The argument `sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].
    +
    +    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.
    +    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both lists.
    +
    +    Arguments:
    +        X (Union[DataSource, pandas.DataFrame]): Training dataset
    +        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
    +        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]
    +        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets
    +        entities (Union[str, List[str]]): (optional) columns representing entities ID
    +        generate_cols (List[str]): (optional) columns that should be synthesized
    +        exclude_cols (List[str]): (optional) columns that should not be synthesized
    +        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
    +        target (Optional[str]): (optional) Target for the dataset
    +        name (Optional[str]): (optional) Synthesizer instance name
    +        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy
    +        condition_on: (Optional[List[str]]): (optional) list of features to condition upon
    +    """
    +    if self._is_initialized():
    +        raise AlreadyFittedError()
    +
    +    _datatype = DataSourceType(datatype) if isinstance(
    +        X, pdDataFrame) else DataSourceType(X.datatype)
    +
    +    dataset_attrs = self._init_datasource_attributes(
    +        sortbykey, entities, generate_cols, exclude_cols, dtypes)
    +    self._validate_datasource_attributes(X, dataset_attrs, _datatype, target)
    +
    +    # If the training data is a pandas dataframe, we first need to create a data source and then the instance
    +    if isinstance(X, pdDataFrame):
    +        if X.empty:
    +            raise EmptyDataError("The DataFrame is empty")
    +        _X = LocalDataSource(source=X, datatype=_datatype, client=self._client)
    +    else:
    +        if datatype != _datatype:
    +            warn("When the training data is a DataSource, the argument `datatype` is ignored.",
    +                 DataSourceTypeWarning)
    +        _X = X
    +
    +    if _X.status != dsStatus.AVAILABLE:
    +        raise DataSourceNotAvailableError(
    +            f"The datasource '{_X.uid}' is not available (status = {_X.status.value})")
    +
    +    if isinstance(dataset_attrs, dict):
    +        dataset_attrs = DataSourceAttrs(**dataset_attrs)
    +
    +    self._fit_from_datasource(
    +        X=_X, dataset_attrs=dataset_attrs, target=target,
    +        anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)
     
    @@ -2472,7 +2505,6 @@

    list(client=None) @@ -2543,51 +2575,51 @@

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @staticmethod
    -@init_client
    -def list(client: Optional[Client] = None) -> SynthesizersList:
    -    """List the synthesizer instances.
    -
    -    Arguments:
    -        client (Client): (optional) Client to connect to the backend
    -
    -    Returns:
    -        List of synthesizers
    -    """
    -    def __process_data(data: list) -> list:
    -        to_del = ['metadata', 'report', 'mode']
    -        for e in data:
    -            for k in to_del:
    -                e.pop(k, None)
    -        return data
    -
    -    response = client.get('/synthesizer')
    -    data: list = response.json()
    -    data = __process_data(data)
    -
    -    return SynthesizersList(data)
    @staticmethod
    +@init_client
    +def list(client: Optional[Client] = None) -> SynthesizersList:
    +    """List the synthesizer instances.
    +
    +    Arguments:
    +        client (Client): (optional) Client to connect to the backend
    +
    +    Returns:
    +        List of synthesizers
    +    """
    +    def __process_data(data: list) -> list:
    +        to_del = ['metadata', 'report', 'mode']
    +        for e in data:
    +            for k in to_del:
    +                e.pop(k, None)
    +        return data
    +
    +    response = client.get('/synthesizer')
    +    data: list = response.json()
    +    data = __process_data(data)
    +
    +    return SynthesizersList(data)
     
    @@ -2599,7 +2631,6 @@

    sample() @@ -2616,11 +2647,11 @@

    Source code in ydata/sdk/synthesizers/synthesizer.py -
    @abstractmethod
    -def sample(self) -> pdDataFrame:
    -    """Abstract method to sample from a synthesizer."""
    +            
    @abstractmethod
    +def sample(self) -> pdDataFrame:
    +    """Abstract method to sample from a synthesizer."""
     
    @@ -2633,6 +2664,7 @@

    PrivacyLevel

    @@ -2663,7 +2695,6 @@

    BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' @@ -2686,7 +2717,6 @@

    HIGH_FIDELITY = 'HIGH_FIDELITY' @@ -2709,7 +2739,6 @@

    HIGH_PRIVACY = 'HIGH_PRIVACY' @@ -2736,6 +2765,7 @@

    Multitable - YData Fabric

    Multitable

    + +
    + + + + +
    +

    + Bases: BaseSynthesizer

    + + +

    MultiTable synthesizer class.

    +

    Methods

    +
      +
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.

    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    +
    + + +

    Parameters:

    Name             Type       Description                                                                                       Default
    write_connector  UID        Connector of type RDBMS to be used to write the samples                                          required
    name             str        (optional) Name to be used when creating the synthesizer. Calculated internally if not provided  None
    client           Client     (optional) Client to connect to the backend                                                      None
    Source code in ydata/sdk/synthesizers/multitable.py
    class MultiTableSynthesizer(BaseSynthesizer):
    +    """MultiTable synthesizer class.
    +
    +    Methods
    +    -------
    +    - `fit`: train a synthesizer instance.
    +    - `sample`: request synthetic data.
    +    - `status`: current status of the synthesizer instance.
    +
    +    Note:
    +            The synthesizer instance is created in the backend only when the `fit` method is called.
    +
    +    Arguments:
    +        write_connector (UID): Connector of type RDBMS to be used to write the samples
    +        name (str): (optional) Name to be used when creating the synthesizer. Calculated internally if not provided
    +        client (Client): (optional) Client to connect to the backend
    +    """
    +
    +    def __init__(
    +            self, write_connector: Union[Connector, UID], uid: Optional[UID] = None, name: Optional[str] = None,
    +            project: Optional[Project] = None, client: Optional[Client] = None):
    +
    +        super().__init__(uid, name, project, client)
    +
    +        connector = self._check_or_fetch_connector(write_connector)
    +        self.__write_connector = connector.uid
    +
    +    def fit(self, X: DataSource,
    +            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    +            datatype: Optional[Union[DataSourceType, str]] = None,
    +            sortbykey: Optional[Union[str, List[str]]] = None,
    +            entities: Optional[Union[str, List[str]]] = None,
    +            generate_cols: Optional[List[str]] = None,
    +            exclude_cols: Optional[List[str]] = None,
    +            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    +            target: Optional[str] = None,
    +            anonymize: Optional[dict] = None,
    +            condition_on: Optional[List[str]] = None) -> None:
    +        """Fit the synthesizer.
    +
    +        The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    +        Except for X, all other arguments are ignored for now until they are supported.
    +
    +        Arguments:
    +            X (DataSource): DataSource to Train
    +        """
    +
    +        self._fit_from_datasource(X)
    +
    +    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:
    +        """Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]
    +        instance.
    +        The sample is saved in the connector that was provided in the synthesizer initialization
    +        or in the connector passed through the `write_connector` argument of this method, if provided.
    +
    +        Arguments:
    +            frac (int | float): fraction of the sample to be returned
    +        """
    +
    +        assert frac >= 0.1, InputError(
    +            "It is not possible to generate an empty synthetic data schema. Please validate the input provided. ")
    +        assert frac <= 5, InputError(
    +            "It is not possible to generate a database that is 5x bigger than the original dataset. Please validate the input provided.")
    +
    +        payload = {
    +            'fraction': frac,
    +        }
    +
    +        if write_connector is not None:
    +            connector = self._check_or_fetch_connector(write_connector)
    +            payload['writeConnector'] = connector.uid
    +
    +        response = self._client.post(
    +            f"/synthesizer/{self.uid}/sample", json=payload, project=self._project)
    +
    +        data = response.json()
    +        sample_uid = data.get('uid')
    +        sample_status = None
    +        while sample_status not in ['finished', 'failed']:
    +            self._logger.info('Sampling from the synthesizer...')
    +            response = self._client.get(
    +                f'/synthesizer/{self.uid}/history', project=self._project)
    +            history = response.json()
    +            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)
    +            sample_status = sample_data.get('status', {}).get('state')
    +            sleep(BACKOFF)
    +
    +        print(
    +            f"Sample created and saved into connector with ID {self.__write_connector or write_connector}")
    +
    +    def _create_payload(self) -> dict:
    +        payload = super()._create_payload()
    +        payload['writeConnector'] = self.__write_connector
    +
    +        return payload
    +
    +    def _check_or_fetch_connector(self, write_connector: Union[Connector, UID]) -> Connector:
    +        self._logger.debug(f'Write connector is {write_connector}')
    +        if isinstance(write_connector, str):
    +            self._logger.debug(f'Write connector is of type `UID` {write_connector}')
    +            write_connector = Connector.get(write_connector)
    +            self._logger.debug(f'Using fetched connector {write_connector}')
    +
    +        if write_connector.uid is None:
    +            raise InputError("Invalid connector provided as input for write")
    +
    +        if write_connector.type not in [ConnectorType.AZURE_SQL, ConnectorType.MYSQL, ConnectorType.SNOWFLAKE]:
    +            raise ConnectorError(
    +                f"Invalid type `{write_connector.type}` for the provided connector")
    +
    +        return write_connector
    +
    +
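
    Putting the class above together, a minimal usage sketch, assuming an existing multi-table DataSource fetched by UID (DataSource.get is assumed to be available for that) and the UID of an RDBMS connector to write the result to (both UIDs are placeholders):

    from ydata.sdk.datasources import DataSource
    from ydata.sdk.synthesizers import MultiTableSynthesizer

    datasource = DataSource.get('<DATASOURCE_UID>')                    # existing multi-table datasource
    synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')   # must be an RDBMS connector (e.g. MySQL)
    synth.fit(datasource)   # only X is used; the remaining fit arguments are currently ignored
    synth.sample(frac=1)    # writes the generated database to the write connector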

    + fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None) + +

    + + +
    + +

    Fit the synthesizer.

    +

    The synthesizer accepts as training dataset a YData DataSource. Except for X, all other arguments are ignored for now until they are supported.

    + + + +

    Parameters:

    Name   Type        Description          Default
    X      DataSource  DataSource to Train  required
    Source code in ydata/sdk/synthesizers/multitable.py
    def fit(self, X: DataSource,
    +        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
    +        datatype: Optional[Union[DataSourceType, str]] = None,
    +        sortbykey: Optional[Union[str, List[str]]] = None,
    +        entities: Optional[Union[str, List[str]]] = None,
    +        generate_cols: Optional[List[str]] = None,
    +        exclude_cols: Optional[List[str]] = None,
    +        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
    +        target: Optional[str] = None,
    +        anonymize: Optional[dict] = None,
    +        condition_on: Optional[List[str]] = None) -> None:
    +    """Fit the synthesizer.
    +
    +    The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    +    Except for X, all other arguments are ignored for now until they are supported.
    +
    +    Arguments:
    +        X (DataSource): DataSource to Train
    +    """
    +
    +    self._fit_from_datasource(X)
    +
    +
    +
    + +
    + + +
    + + + +

    + sample(frac=1, write_connector=None) + +

    + + +
    + +

    Sample from a MultiTableSynthesizer instance. The sample is saved in the connector that was provided in the synthesizer initialization or in the connector passed through the write_connector argument, if one is given.

    + + + +

    Parameters:

    Name   Type         Description                            Default
    frac   int | float  fraction of the sample to be returned  1
    Source code in ydata/sdk/synthesizers/multitable.py
    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:
    +    """Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]
    +    instance.
    +    The sample is saved in the connector that was provided in the synthesizer initialization
    +    or in the connector passed through the `write_connector` argument of this method, if provided.
    +
    +    Arguments:
    +        frac (int | float): fraction of the sample to be returned
    +    """
    +
    +    assert frac >= 0.1, InputError(
    +        "It is not possible to generate an empty synthetic data schema. Please validate the input provided. ")
    +    assert frac <= 5, InputError(
    +        "It is not possible to generate a database that is 5x bigger than the original dataset. Please validate the input provided.")
    +
    +    payload = {
    +        'fraction': frac,
    +    }
    +
    +    if write_connector is not None:
    +        connector = self._check_or_fetch_connector(write_connector)
    +        payload['writeConnector'] = connector.uid
    +
    +    response = self._client.post(
    +        f"/synthesizer/{self.uid}/sample", json=payload, project=self._project)
    +
    +    data = response.json()
    +    sample_uid = data.get('uid')
    +    sample_status = None
    +    while sample_status not in ['finished', 'failed']:
    +        self._logger.info('Sampling from the synthesizer...')
    +        response = self._client.get(
    +            f'/synthesizer/{self.uid}/history', project=self._project)
    +        history = response.json()
    +        sample_data = next((s for s in history if s.get('uid') == sample_uid), None)
    +        sample_status = sample_data.get('status', {}).get('state')
    +        sleep(BACKOFF)
    +
    +    print(
    +        f"Sample created and saved into connector with ID {self.__write_connector or write_connector}")
    +
    +
    +
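
    Continuing the MultiTableSynthesizer sketch above, the asserts in this method mean `frac` must stay between 0.1 and 5, and a different RDBMS connector UID (placeholder below) can be supplied per call:

    synth.sample(frac=1.5)                                              # 1.5x the size of the original database
    synth.sample(frac=0.5, write_connector='<OTHER_CONNECTOR_UID>')     # override the write connector for this sample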
\ No newline at end of file
diff --git a/0.7/sdk/reference/api/synthesizers/regular/index.html b/0.7/sdk/reference/api/synthesizers/regular/index.html
index 9609ade7..35b5bbf9 100644
--- a/0.7/sdk/reference/api/synthesizers/regular/index.html
+++ b/0.7/sdk/reference/api/synthesizers/regular/index.html
@@ -1084,14 +1084,14 @@
  • - fit() + fit
  • - sample() + sample
  • @@ -1363,7 +1363,6 @@ -

    fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None) @@ -1601,7 +1600,6 @@

    sample(n_samples=1, condition_on=None) @@ -1734,6 +1732,7 @@

    PrivacyLevel

    @@ -1764,7 +1763,6 @@

    BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' @@ -1787,7 +1785,6 @@

    HIGH_FIDELITY = 'HIGH_FIDELITY' @@ -1810,7 +1807,6 @@

    HIGH_PRIVACY = 'HIGH_PRIVACY' @@ -1837,6 +1833,7 @@

    - fit() + fit
  • - sample() + sample
  • @@ -1371,7 +1371,6 @@ -

    fit(X, sortbykey, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None) @@ -1627,7 +1626,6 @@

    sample(n_entities, condition_on=None) @@ -1765,6 +1763,7 @@

    PrivacyLevel

    @@ -1795,7 +1794,6 @@

    BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' @@ -1818,7 +1816,6 @@

    HIGH_FIDELITY = 'HIGH_FIDELITY' @@ -1841,7 +1838,6 @@

    HIGH_PRIVACY = 'HIGH_PRIVACY' @@ -1868,6 +1864,7 @@

    YData Fabric is a Data-Centric AI development platform that accelerates AI development by helping data practitioners achieve production-quality data.

    Much like for software engineering the quality of code is a must for the success of software development, Fabric accounts for the data quality requirements for data-driven applications. It introduces standards, processes, and acceleration to empower data science, analytics, and data engineering teams.

    "},{"location":"#try-fabric","title":"Try Fabric","text":"
    • Get started with Fabric Community
    "},{"location":"#why-adopt-ydata-fabric","title":"Why adopt YData Fabric?","text":"

    With Fabric, you can standardize the understanding of your data, quickly identify data quality issues, streamline and version your data preparation workflows and finally leverage synthetic data for privacy-compliance or as a tool to boost ML performance. Fabric is a development environment that supports a faster and easier process of preparing data for AI development. Data practitioners are using Fabric to:

    • Establish a centralized and collaborative repository for data projects.
    • Create and share comprehensive documentation of data, encompassing data schema, structure, and personally identifiable information (PII).
    • Prevent data quality issues with standardized data quality profiling, providing visual understanding and warnings on potential issues.
    • Accelerate data preparation with customizable recipes.
    • Improve machine learning performance with optimal data preparation through solutions such as synthetic data.
    • Shorten access to data with privacy-compliant synthetic data generation.
    • Build and streamline data preparation workflows effortlessly through a user-friendly drag-and-drop interface.
    • Efficiently manage business rules, conduct comparisons, and implement version control for data workflows using pipelines.
    "},{"location":"#key-features","title":"\ud83d\udcdd Key features","text":""},{"location":"#data-catalog","title":"Data Catalog","text":"

    Fabric Data Catalog provides a centralized perspective on datasets on a per-project basis, optimizing data management through seamless integration with the organization's existing data architectures via scalable connectors (e.g., MySQL, Google Cloud Storage, AWS S3). It standardizes data quality profiling, streamlining the processes of efficient data cleaning and preparation, while also automating the identification of Personally Identifiable Information (PII) to facilitate compliance with privacy regulations.

    Explore how a Data Catalog helps through a centralized repository of your datasets, schema validation, and automated data profiling.

    "},{"location":"#labs","title":"Labs","text":"

    Fabric's Labs environments provide collaborative, scalable, and secure workspaces layered on a flexible infrastructure, enabling users to seamlessly switch between CPUs and GPUs based on their computational needs. Labs are familiar environments that empower data developers with powerful IDEs (Jupyter Notebooks, Visual Code or H2O flow) and a seamless experience with the tools they already love combined with YData's cutting-edge SDK for data preparation.

    Learn how to use the Labs to generate synthetic data in a familiar Python interface.

    "},{"location":"#synthetic-data","title":"Synthetic data","text":"

    Synthetic data, enabled by YData Fabric, provides data developers with user-friendly interfaces (UI and code) for generating artificial datasets, offering a versatile solution across formats like tabular, time-series and multi-table datasets. The generated synthetic data holds the same value as the original and aligns intricately with specific business rules, contributing to machine learning model enhancement, mitigation of privacy concerns and more robust data development. Fabric offers synthetic data that is easy to adapt and configure, and allows customization of the privacy-utility trade-off.

    Learn how to create high-quality synthetic data within a user-friendly UI using Fabric\u2019s data synthesis flow.

    "},{"location":"#pipelines","title":"Pipelines","text":"

    Fabric Pipelines streamlines data preparation workflows by automating, orchestrating, and optimizing data pipelines, providing benefits such as flexibility, scalability, monitoring, and reproducibility for efficient and reliable data processing. The intuitive drag-and-drop interface, leveraging Jupyter notebooks or Python scripts, expedites the pipeline setup process, providing data developers with a quick and user-friendly experience.

    Explore how you can leverage Fabric Pipelines to build versionable and reproducible data preparation workflows for ML development.

    "},{"location":"#tutorials","title":"Tutorials","text":"

    To understand how to best apply Fabric to your use cases, start by exploring the following tutorials:

    • Handling Imbalanced Data for Improved Fraud Detection: Learn how to implement high-performant fraud detection models by incorporating synthetic data to balance your datasets.

    • Prediction with Quality Inspection: Learn how to develop data preparation workflows with automated data quality checks and Pipelines.

    • Generating Synthetic Data for Financial Transactions: Learn how to use synthetic data generation to replicate your existing relational databases while ensuring referential integrity.

    You can find additional examples and use cases at YData Academy GitHub Repository.

    "},{"location":"#support","title":"\ud83d\ude4b Support","text":"

    Facing an issue? We\u2019re committed to providing all the support you need to ensure a smooth experience using Fabric:

    • Create a support ticket: our team will help you move forward!
    • Contact a Fabric specialist: for personalized guidance or full access to the platform
    "},{"location":"examples/synthesize_tabular_data/","title":"Synthesize tabular data","text":"

    Use YData's RegularSynthesizer to generate tabular synthetic data

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('census')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset\nsynth.fit(X)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample.shape)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"examples/synthesize_timeseries_data/","title":"Synthesize time-series data","text":"

    Use YData's TimeSeriesSynthesizer to generate time-series synthetic data

    Tabular data is the most common type of data we encounter in data problems.

    When thinking about tabular data, we assume independence between different records, but this does not happen in reality. Suppose we check events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood. In that case, we might end up with datasets where measures and records evolve and are related through time. This type of data is known to be sequential or time-series data.

    Thus, sequential or time-series data refers to any data containing elements ordered into sequences in a structured format. Dissecting any time-series dataset, we see differences in variables' behavior that need to be understood for an effective generation of synthetic data. Typically any time-series dataset is composed of the following:

    • Variables that define the order of time (these can be simple with one variable or composed)
    • Time-variant variables
    • Variables that refer to entities (single or multiple entities)
    • Variables that are attributes (those that don't depend on time but rather on the entity)

    Below find an example:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import TimeSeriesSynthesizer\n# Do not forget to add your token as env variable\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\nX = get_dataset('occupancy')\n# We initialize a time series synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = TimeSeriesSynthesizer()\n# We train the synthesizer on our dataset\n# sortbykey -> variable that define the time order for the sequence\nsynth.fit(X, sortbykey='date')\n# By default it is requested a synthetic sample with the same length as the original data\n# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected\nsample = synth.sample(n_entities=1)\n
    "},{"location":"examples/synthesize_with_anonymization/","title":"Anonymization","text":"

    YData Synthesizers offers a way to anonymize sensitive information such that the original values are not present in the synthetic data but replaced by fake values.

    Does the model retain the original values?

    No! The anonymization is performed before the model training such that it never sees the original values.

    The anonymization is performed by specifying which columns need to be anonymized and how to perform the anonymization. The anonymization rules are defined as a dictionary with the following format:

    {column_name: anonymization_rule}

    While there are some predefined anonymization rules such as name, email, company, it is also possible to create a rule using a regular expression. The anonymization rules have to be passed to a synthesizer in its fit method using the parameter anonymize.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing anonymization and privacy controls are complementary.

    The example below demonstrates how to anonymize the column Name by fake names and the column Ticket by a regular expression:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('titanic')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We define anonymization rules, which is a dictionary with format:\n# {column_name: anonymization_rule, ...}\n# while here are some predefined anonymization rules like: name, email, company\n# it is also possible to create a rule using a regular expression\nrules = {\n\"Name\": \"name\",\n\"Ticket\": \"[A-Z]{2}-[A-Z]{4}\"\n}\n# We train the synthesizer on our dataset\nsynth.fit(\nX,\nname=\"titanic_synthesizer\",\nanonymize=rules\n)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample[[\"Name\", \"Ticket\"]].head(3))\nif __name__ == \"__main__\":\nmain()\n

    "},{"location":"examples/synthesize_with_conditional_sampling/","title":"Conditional sampling","text":"

    YData Synthesizers support conditional sampling. The fit method has an optional parameter named condition_on, which receives a list of features to condition upon. Furthermore, the sample method receives the conditions to be applied through another optional parameter also named condition_on. For now, two types of conditions are supported:

    • Condition upon a categorical (or string) feature. The parameters are the name of the feature and a list of values (i.e., categories) to be considered. Each category also has its percentage of representativeness. For example, if we want to condition upon two categories, we need to define the percentage of rows each of these categories will have on the synthetic dataset. Naturally, the sum of such percentages needs to be 1. The default percentage is also 1 since it is the required value for a single category.
    • Condition upon a numerical feature. The parameters are the name of the feature and the minimum and maximum of the range to be considered. This feature will present a uniform distribution on the synthetic dataset, limited by the specified range.

    The example below demonstrates how to train and sample from a synthesizer using conditional sampling:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined.\ndef main():\n\"\"\"In this example, we demonstrate how to train and\n    sample from a synthesizer using conditional sampling.\"\"\"\nX = get_dataset('census')\n# We initialize a regular synthesizer.\n# As long as the synthesizer does not call `fit`, it exists only locally.\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset setting\n# the features to condition upon.\nsynth.fit(\nX,\nname=\"census_synthesizer\",\ncondition_on=[\"sex\", \"native-country\", \"age\"]\n)\n# We request a synthetic dataset with specific condition rules.\nsample = synth.sample(\nn_samples=500,\ncondition_on={\n\"sex\": {\n\"categories\": [\"Female\"]\n},\n\"native-country\": {\n\"categories\": [(\"United-States\", 0.6),\n(\"Mexico\", 0.4)]\n},\n\"age\": {\n\"minimum\": 55,\n\"maximum\": 60\n}\n}\n)\nprint(sample)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"examples/synthesize_with_privacy_control/","title":"Privacy control","text":"

    YData Synthesizers offers 3 different levels of privacy:

    1. high privacy: the model is optimized for privacy purposes,
    2. high fidelity (default): the model is optimized for high fidelity,
    3. balanced: tradeoff between privacy and fidelity.

    The default privacy level is high fidelity. The privacy level can be changed by the user at the moment a synthesizer is trained, using the parameter privacy_level. The parameter expects a PrivacyLevel value.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing anonymization and privacy controls are complementary.

    The example below demonstrates how to train a synthesizer configured for high privacy:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer\n    with a high-privacy setting from a pandas DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('titanic')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset setting the privacy level to high\nsynth.fit(\nX,\nname=\"titanic_synthesizer\",\nprivacy_level=PrivacyLevel.HIGH_PRIVACY\n)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"get-started/","title":"Get started with Fabric","text":"

    This getting started guide is here to help you if you are not yet familiar with YData Fabric or if you just want to learn more about data quality, data preparation workflows and how you can start leveraging synthetic data. See also: YData Fabric Community.

    "},{"location":"get-started/#create-your-first-data-with-the-data-catalog","title":"\ud83d\udcda Create your first Data with the Data Catalog","text":""},{"location":"get-started/#create-your-first-synthetic-data-generator","title":"\u2699\ufe0f Create your first Synthetic Data generator","text":""},{"location":"get-started/#create-your-first-lab","title":"\ud83e\uddea Create your first Lab","text":""},{"location":"get-started/#create-your-first-data-pipeline","title":"\ud83c\udf00 Create your first data Pipeline","text":""},{"location":"get-started/create_lab/","title":"How to create your first Lab environment","text":"

    Labs are code environments for a more flexible development of data-driven solutions while leveraging Fabric capabilities combined with already loved tools such as scikit-learn, numpy and pandas. To create your first Lab, you can use the \u201cCreate Lab\u201d from Fabric\u2019s home, or you can access it from the Labs module by selecting it on the left side menu, and clicking the \u201cCreate Lab\u201d button.

    Next, a menu with different IDEs will be shown. As a quickstart, select Jupyter Lab. As Labs are development environments, you will also be asked which language you would prefer your environment to support: R or Python. Select Python.

    Select IDE Select language

    Bundles are environments with pre-installed packages. Select YData bundle, so we can leverage some other Fabric features such as Data Profiling, Synthetic Data and Pipelines.

    As a last step, you will be asked to configure the infrastructure resources for this new environment as well as giving it a Display Name. We will keep the defaults, but you have flexibility to select GPU acceleration or whether you need more computational resources for your developments.

    Finally, your Lab will be created and added to the \"Labs\" list, as per the image below. The status of the Lab will be \ud83d\udfe1 while it is being prepared; this process takes a few minutes, as the infrastructure is being allocated to your development environment. As soon as the status changes to \ud83d\udfe2, you can open your Lab by clicking the button as shown below:

    Create a new notebook in the JupyterLab and give it a name. You are now ready to start your developments!

    Create a new notebook Notebook created

    Congrats! \ud83d\ude80 You have now successfully created your first Lab, a code environment, so you can benefit from the most advanced Fabric features as well as compose complex data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_pipeline/","title":"How to create your first Pipeline","text":"

    Check this quickstart video on how to create your first Pipeline.

    The best way to get started with Pipelines is to use the interactive Pipeline editor available in the Labs with Jupyter Lab set as IDE. If you don't have a Lab yet, or you don't know how to create one, check our quickstart guide on how to create your first lab.

    Open an already existing lab.

    A Pipeline comprises one or more nodes that are connected (or not!) with each other to define execution dependencies. Each pipeline node is, and should be, implemented as a component that manages a single task, such as reading the data, profiling the data, training a model, or even publishing a model to production environments.

    In this tutorial we will build a simple and generic pipeline that uses a Dataset from Fabric's Data Catalog and profiles it to check its quality. The notebook templates are already available; to use them, you need to access the \"Academy\" folder as per the image below.

    Make sure to copy all the files in the folder \"3 - Pipelines/quickstart\" to the root folder of your lab, as per the image below.

    Now that we have our notebooks, we need to make a small change in the notebook \"1. Read dataset\". Go back to your Data Catalog, and from one of the datasets in your Catalog list, select the three vertical dots and click \"Explore in Labs\" as shown in the image below.

    The following screen will be shown. Click copy.

    Now that we have copied the code, let's get back to our \"1. Read data.ipynb\" notebook and replace the first code cell with the new code. This will allow us to use a dataset from the Data Catalog in our pipeline.

    Placeholder code Replaced with code snippet

    With our notebooks ready, we can now configure our Pipeline. For this quickstart we will be leveraging an already existing pipeline - double-click the file my_first_pipeline.pipeline. You should see a pipeline as depicted in the images below. To create a new Pipeline, you can open the lab launcher tab and select \"Pipeline Editor\".

    Open Pipeline My first pipeline

    Before running the pipeline, we need to check each component/step's properties and configurations. Right-click each one of the steps, select \"Open Properties\", and a menu will appear on your right side. Make sure that you have \"YData - CPU\" selected as the Runtime Image, as shown below.

    Open properties Runtime image

    We are now ready to create and run our first pipeline. In the top left corner of the pipeline editor, the run button will be available for you to click.

    Accept the default values shown in the run dialog and start the run.

    If the following message is shown, it means that you have created a run of your first pipeline.

    Now that you have created your first pipeline, you can select the Pipeline from Fabric's left side menu.

    Your most recent pipeline will be listed, as shown in the image below.

    To check the run of your pipeline, jump into the \"Run\" tab. You will be able to see your first pipeline running!

    By clicking on top of the record you will be able to see the progress of the run step-by-step, and visualize the outputs of each and every step by clicking on each step and selecting the Visualizations tab.

    Congrats! \ud83d\ude80 You have now successfully created your first Pipeline in a code environment, so you can benefit from Fabric's orchestration engine to create scalable, versionable and comparable data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_syntheticdata_generator/","title":"How to create your first Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Synthetic Data generator.

    To generate your first synthetic data, you need to have a Dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your first dataset created, you are now able to start creating your Synthetic Data generator. You can either select \"Synthetic Data\" from your left side menu, or you can select \"Create Synthetic Data\" in your project Home, as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the columns you'd like to include in the synthesis process, validating their Variable and Data Types.

    Data types are relevant for synthetic data quality

    Data Types are important to revisit and align with the objectives for the synthetic data, as they can highly impact the quality of the generated data. For example, let's say we have a column that is a \"Name\": while in some situations it would make sense to consider it a String, in the light of a dataset where \"Name\" refers to the name of a purchased product, it might be more beneficial to set it as a Category.

    Finally, the last step of the process is the Synthetic Data specific configuration. For this particular case we only need to define a Display Name, and we can finish the process by clicking the \"Save\" button as per the image below.

    Your Synthetic Data generator is now training and is listed under \"Synthetic Data\". While the model is being trained, the status will be \ud83d\udfe1; as soon as the training is completed successfully, it will transition to \ud83d\udfe2 as per the image below.

    Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even download a PDF report with a comprehensive overview of your Synthetic Data Quality Metrics. Next, you can generate synthetic data samples by accessing the Generation tab or click on \"Go to Generation\".

    In this section, you are able to generate as many synthetic samples as you want. For that, you need to define the number of rows to generate and click \"Generate\", as depicted in the image below.

    A new line in your \"Sample History\" will be shown, and as soon as the sample generation is completed you will be able to \"Compare\" your synthetic data with the original data, add it as a Dataset with \"Add to Data Catalog\" and, last but not least, download it as a file with \"Download csv\".

    Congrats! \ud83d\ude80 You have now successfully created your first Synthetic Data generator with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/fabric_community/","title":"Get started with Fabric Community","text":"

    Fabric Community is a SaaS version that allows you to explore all the functionalities of Fabric first-hand: free, forever, for everyone. You\u2019ll be able to validate your data quality with automated profiling, unlock data sharing and improve your ML models with synthetic data, and increase your productivity with seamless integration:

    • Build 1 personal project;
    • Create your first Data Catalog and benefit from automated data profiling;
    • Train and generate synthetic data with up to 2 models, for datasets with up to 50 columns and 100K rows;
    • Optimize synthetic data quality for your use cases with an evaluation PDF report;
    • Create 1 development environment (Labs) and integrate it with your familiar ML packages and workflows.
    "},{"location":"get-started/fabric_community/#register","title":"Register","text":"

    To register for Fabric Community:

    • Access the Fabric Community Try Now and create your YData account by submitting the form
    • Check your email for your login credentials
    • Log in to fabric.ydata.ai and enjoy!

    Once you login, you'll access the Home page and get started with your data preparation!

    "},{"location":"get-started/upload_csv/","title":"How to create your first Dataset from a CSV file","text":"

    Check this quickstart video on how to create your first Dataset from a CSV file.

    To create your first dataset in the Data Catalog, you can start by clicking on \"Add Dataset\" from the Home section, or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that, the modal below will be shown. You will need to select a connector. To upload a CSV file, select \u201cUpload CSV\u201d.

    Once you've selected the \u201cUpload CSV\u201d connector, a new screen will appear, enabling you to upload your file and designate a name for your connector. This file upload connector will subsequently empower you to create one or more datasets from the same file at a later stage.

    Loading area Upload csv file

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Separator: This is an important parameter to make sure that we can parse your CSV correctly. The default value is \u201c,\u201d.
    • Data Type: Whether your dataset contains tabular or time-series (i.e., containing temporal dependency) data.

    Your created Connector (\u201cCensus File\u201d) and Dataset (\u201cCensus\u201d) will be added to the Data Catalog. As soon as the status is green, you can navigate your Dataset. Click Open Dataset as per the image below.

    Within the Dataset details, you can gain valuable insights through our automated data quality profiling. This includes comprehensive metadata and an overview of your data, encompassing details like row count, identification of duplicates, and insights into the overall quality of your dataset.

    Or perhaps you want to further explore, through visualization, the profile of your data with both univariate and multivariate analysis.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Dataset in Fabric\u2019s Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"sdk/","title":"Overview","text":"

    YData SDK for improved data quality everywhere!

    ydata-sdk is here! Create a YData account so you can start using it today!

    Create account

    "},{"location":"sdk/#overview","title":"Overview","text":"

    The YData SDK is an ecosystem of methods that allows users, through a Python interface, to adopt a Data-Centric approach towards AI development. The solution includes a set of integrated components for data ingestion, standardized data quality evaluation and data improvement, such as synthetic data generation, allowing an iterative improvement of the datasets used in high-impact business applications.

    Synthetic data can be used as a Machine Learning performance enhancer, to augment data or mitigate the presence of bias in real data. Furthermore, it can be used as a Privacy Enhancing Technology, to enable data-sharing initiatives or even to fuel testing environments.

    Under the YData SDK hood, you can find a set of algorithms and metrics based on statistics and on deep learning techniques that will help you accelerate your data preparation.

    "},{"location":"sdk/#current-functionality","title":"Current functionality","text":"

    YData SDK is currently composed of the following main modules:

    • Datasources

      • YData\u2019s SDK includes several connectors for easy integration with existing data sources. It supports several storage types, like filesystems and RDBMS. Check the list of connectors.
      • SDK\u2019s Datasources run on top of Dask, which allows it to deal with not only small workloads but also larger volumes of data.
    • Synthesizers

      • Simplified interface to train a generative model and learn, in a data-driven manner, the behavior, the patterns and the original data distribution. Optimize your model for privacy or utility use-cases.
      • From a trained synthesizer, you can generate as many synthetic samples as needed and parametrise the number of records to be generated.
      • Anonymization and privacy-preserving capabilities ensure that synthetic datasets do not contain Personal Identifiable Information (PII) and can safely be shared!
      • Conditional sampling can be used to restrict the domain and values of specific features in the sampled data.
    • Synthetic data quality report Coming soon

      • An extensive synthetic data quality report that measures 3 dimensions: privacy, utility and fidelity of the generated data. The report can be downloaded in PDF format for ease of sharing and compliance purposes or as a JSON to enable the integration in data flows.
    • Profiling Coming soon

      • A set of metrics and algorithms summarizes dataset quality in three main dimensions: warnings, univariate analysis and a multivariate perspective.
    "},{"location":"sdk/#supported-data-formats","title":"Supported data formats","text":"TabularTime-SeriesTransactionalRelational databases

    The RegularSynthesizer is perfect to synthesize high-dimensional data that is time-independent, with high quality results.

    Know more

    The TimeSeriesSynthesizer is perfect to synthesize both regularly and irregularly spaced time-series, from smart sensors to stock data.

    Know more

    The TimeSeriesSynthesizer supports transactional data, known to have highly irregular time intervals between records and directional relations between entities.

    Coming soon

    Know more

    The MultiTableSynthesizer is perfect to learn how to replicate the data within a relational database schema.

    Coming soon

    Know more

    "},{"location":"sdk/installation/","title":"Installation","text":"

    YData SDK is generally available through both PyPI and Conda, allowing an easy installation process. This experience allows combining YData SDK with other packages such as Pandas, Numpy or Scikit-Learn.

    YData SDK is available to the public through a token-based authentication system. If you don\u2019t have one yet, you can get your free license key during the installation process. You can check what features are available in the free version here.

    "},{"location":"sdk/installation/#installing-the-package","title":"Installing the package","text":"

    YData SDK supports Python versions greater than 3.8 and can be installed on Windows, Linux or macOS operating systems.

    Prior to the package installation, it is recommended to create a virtual or conda environment:

    pyenv
    pyenv virtualenv 3.10 ydatasdk\n

    And install ydata-sdk:

    pypi
    pip install ydata-sdk\n
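
    To quickly confirm that the installation succeeded, you can try importing the package in a Python session. This is only a minimal sanity check and assumes ydata-sdk was installed in the active environment:

        # Minimal sanity check after installing ydata-sdk
        from ydata.sdk.synthesizers import RegularSynthesizer  # noqa: F401
        print('ydata-sdk is installed and importable')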
    "},{"location":"sdk/installation/#authentication","title":"Authentication","text":"

    Once you've installed the ydata-sdk package, you will need a token to run its functionalities. YData SDK uses a token-based authentication system. To get access to your token, you need to create a YData account.

    YData SDK offers a free-trial and an enterprise version. To access your free-trial token, you need to create a YData account.

    The token will be available here, after login:

    With your account token copied, you can set a new environment variable YDATA_TOKEN at the beginning of your development session.

        import os\nos.environ['YDATA_TOKEN'] = '{add-your-token}'\n
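
    If you set the token outside of your script (for example in your shell profile), you may want to confirm it is visible before calling the SDK. A small sketch using only the standard library:

        import os
        # Fail early with a clear message if the token was not set
        assert os.environ.get('YDATA_TOKEN'), 'YDATA_TOKEN is not set'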

    Once you have set your token, you are good to go to start exploring the incredible world of data-centric AI and smart synthetic data generation!

    Check out our quickstart guide!

    "},{"location":"sdk/quickstart/","title":"Quickstart","text":"

    YData SDK allows you, through an easy and familiar interface, to adopt a Data-Centric AI approach for the development of Machine Learning solutions. YData SDK features were designed to support structured data, including tabular data, time-series and transactional data.

    "},{"location":"sdk/quickstart/#read-data","title":"Read data","text":"

    To start leveraging the package features, you should consume your data either through the Connectors or a pandas.DataFrame. The list of available connectors can be found here [add a link].

    From pandas dataframe | From a connector
        # Example for a Google Cloud Storage Connector\n# Imports (paths as referenced in this SDK's API reference)\nfrom ydata.sdk.connectors import Connector\nfrom ydata.sdk.datasources import DataSource\ncredentials = \"{insert-credentials-file-path}\"\n# We create a new connector for Google Cloud Storage\nconnector = Connector(connector_type='gcs', credentials=credentials)\n# Create a Datasource from the connector\n# Note that a connector can be re-used for several datasources\nX = DataSource(connector=connector, path='gs://<my_bucket>.csv')\n
        import pandas as pd\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Load a small dataset\nX = pd.read_csv('{insert-file-path.csv}')\n# Init a synthesizer\nsynth = RegularSynthesizer()\n# Train the synthesizer with the pandas Dataframe as input\n# The data is then sent to the cluster for processing\nsynth.fit(X)\n

    The synthesis process returns a pandas.DataFrame object. Note that if you are using the ydata-sdk free version, all of your data is sent to a remote cluster on YData's infrastructure.

    "},{"location":"sdk/quickstart/#data-synthesis-flow","title":"Data synthesis flow","text":"

    The process of data synthesis can be described in the following steps:

    stateDiagram-v2\n  state read_data\n  read_data --> init_synth\n  init_synth --> train_synth\n  train_synth --> generate_samples\n  generate_samples --> [*]

    The code snippet below shows how easy it can be to start generating new synthetic data. The package includes a set of example datasets for a quickstart.

        from ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Read the example data\nX = get_dataset('census')\n# Init a synthesizer\nsynth = RegularSynthesizer()\n# Fit the synthesizer to the input data\nsynth.fit(X)\n# Sample new synthetic data. The request below asks for 1000 new synthetic rows\nsynth.sample(n_samples=1000)\n

    Do I need to prepare my data before synthesis?

    The SDK ensures that the original behaviour is replicated. For that reason, there is no need to preprocess outlier observations or missing data.

    By default, all the missing data is replicated as NaN.
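
    As a small illustration of this behaviour, a DataFrame containing missing values can be passed to the synthesizer as-is. The sketch below assumes ydata-sdk and pandas are installed and that YDATA_TOKEN is already set; the column names and values are made up for the example:

        import numpy as np
        import pandas as pd
        from ydata.sdk.synthesizers import RegularSynthesizer

        # Toy data with missing values: no imputation or outlier handling is required
        X = pd.DataFrame({
            'age': [25, 32, np.nan, 41],
            'city': ['Porto', 'Lisbon', 'Porto', None],
        })
        synth = RegularSynthesizer()
        synth.fit(X)
        # Missing values are replicated as NaN in the generated sample
        print(synth.sample(n_samples=10))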

    "},{"location":"sdk/modules/connectors/","title":"Connectors","text":"

    YData SDK allows users to consume data assets from remote storages through Connectors. YData Connectors support different types of storages, from filesystems to RDBMS'.

    Below is the list of available connectors:

    • AWS S3 (Remote object storage) | Supported file types: CSV, Parquet | https://aws.amazon.com/s3/
    • Google Cloud Storage (Remote object storage) | Supported file types: CSV, Parquet | https://cloud.google.com/storage
    • Azure Blob Storage (Remote object storage) | Supported file types: CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/blobs/
    • File Upload (Local) | Supported file types: CSV | Maximum file size is 220MB. Bigger files should be uploaded and read from remote object storages
    • MySQL (RDBMS) | https://www.mysql.com/ | Supports reading whole schemas or specifying a query
    • Azure SQL Server (RDBMS) | https://azure.microsoft.com/en-us/services/sql-database/campaign/ | Supports reading whole schemas or specifying a query
    • PostgreSQL (RDBMS) | https://www.postgresql.org/ | Supports reading whole schemas or specifying a query
    • Snowflake (RDBMS) | https://docs.snowflake.com/en/sql-reference-commands | Supports reading whole schemas or specifying a query
    • Google BigQuery (Data warehouse) | https://cloud.google.com/bigquery
    • Azure Data Lake (Data lake) | Supported file types: CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/data-lake-storage/

    More details can be found in the Connectors API Reference Docs.

    "},{"location":"sdk/modules/synthetic_data/","title":"Synthetic data generation","text":""},{"location":"sdk/modules/synthetic_data/#data-formats","title":"Data formats","text":""},{"location":"sdk/modules/synthetic_data/#tabular-data","title":"Tabular data","text":""},{"location":"sdk/modules/synthetic_data/#time-series-data","title":"Time-series data","text":""},{"location":"sdk/modules/synthetic_data/#transactions-data","title":"Transactions data","text":""},{"location":"sdk/modules/synthetic_data/#best-practices","title":"Best practices","text":""},{"location":"sdk/reference/api/common/client/","title":"Get client","text":"

    Deduce how to initialize or retrieve the client.

    This is meant to be a zero-configuration experience for the user.

    Create and set a client globally
    from ydata.sdk.client import get_client\nget_client(set_as_global=True)\n

    Parameters:

    Name Type Description Default client_or_creds Optional[Union[Client, dict, str, Path]]

    Client to forward or credentials for initialization

    None set_as_global bool

    If True, set client as global

    False wait_for_auth bool

    If True, wait for the user to authenticate

    True

    Returns:

    Type Description Client

    Client instance

    Source code in ydata/sdk/common/client/utils.py
    def get_client(client_or_creds: Optional[Union[Client, Dict, str, Path]] = None, set_as_global: bool = False, wait_for_auth: bool = True) -> Client:\n\"\"\"Deduce how to initialize or retrieve the client.\n    This is meant to be a zero configuration for the user.\n    Example: Create and set a client globally\n            ```py\n            from ydata.sdk.client import get_client\n            get_client(set_as_global=True)\n            ```\n    Args:\n        client_or_creds (Optional[Union[Client, dict, str, Path]]): Client to forward or credentials for initialization\n        set_as_global (bool): If `True`, set client as global\n        wait_for_auth (bool): If `True`, wait for the user to authenticate\n    Returns:\n        Client instance\n    \"\"\"\nclient = None\nglobal WAITING_FOR_CLIENT\ntry:\n# If a client instance is set globally, return it\nif not set_as_global and Client.GLOBAL_CLIENT is not None:\nreturn Client.GLOBAL_CLIENT\n# Client exists, forward it\nif isinstance(client_or_creds, Client):\nreturn client_or_creds\n# Explicit credentials\n''' # For the first version, we deactivate explicit credentials via string or file for env var only\n        if isinstance(client_or_creds, (dict, str, Path)):\n            if isinstance(client_or_creds, str):  # noqa: SIM102\n                if Path(client_or_creds).is_file():\n                    client_or_creds = Path(client_or_creds)\n            if isinstance(client_or_creds, Path):\n                client_or_creds = json.loads(client_or_creds.open().read())\n            return Client(credentials=client_or_creds)\n        # Last try with environment variables\n        #if client_or_creds is None:\n        client = _client_from_env(wait_for_auth=wait_for_auth)\n        '''\ncredentials = environ.get(TOKEN_VAR)\nif credentials is not None:\nclient = Client(credentials=credentials)\nexcept ClientHandshakeError as e:\nwait_for_auth = False  # For now deactivate wait_for_auth until the backend is ready\nif wait_for_auth:\nWAITING_FOR_CLIENT = True\nstart = time()\nlogin_message_printed = False\nwhile client is None:\nif not login_message_printed:\nprint(\nf\"The token needs to be refreshed - please validate your token by browsing at the following URL:\\n\\n\\t{e.auth_link}\")\nlogin_message_printed = True\nwith suppress(ClientCreationError):\nsleep(BACKOFF)\nclient = get_client(wait_for_auth=False)\nnow = time()\nif now - start > CLIENT_INIT_TIMEOUT:\nWAITING_FOR_CLIENT = False\nbreak\nif client is None and not WAITING_FOR_CLIENT:\nsys.tracebacklimit = None\nraise ClientCreationError\nreturn client\n

    Main Client class used to abstract the connection to the backend.

    A normal user should not have to instantiate a Client themselves. However, in the future it will be useful for power-users to manage projects and connections.
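
    For those power-user scenarios, a hedged sketch of instantiating a client explicitly is shown below. The import path follows the source module shown in this reference (ydata/sdk/common/client/client.py) and the token value is a placeholder:

        from ydata.sdk.common.client import Client

        # Explicitly create a client from a token and register it as the global client
        client = Client(credentials='<TOKEN>', set_as_global=True)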

    Parameters:

    Name Type Description Default credentials Optional[dict]

    (optional) Credentials to connect

    None project Optional[Project]

    (optional) Project to connect to. If not specified, the client will connect to the default user's project.

    None Source code in ydata/sdk/common/client/client.py
    @typechecked\nclass Client(metaclass=SingletonClient):\n\"\"\"Main Client class used to abstract the connection to the backend.\n    A normal user should not have to instanciate a [`Client`][ydata.sdk.common.client.Client] by itself.\n    However, in the future it will be useful for power-users to manage projects and connections.\n    Args:\n        credentials (Optional[dict]): (optional) Credentials to connect\n        project (Optional[Project]): (optional) Project to connect to. If not specified, the client will connect to the default user's project.\n    \"\"\"\ncodes = codes\ndef __init__(self, credentials: Optional[Union[str, Dict]] = None, project: Optional[Project] = None, set_as_global: bool = False):\nself._base_url = environ.get(\"YDATA_BASE_URL\", DEFAULT_URL)\nself._scheme = 'https'\nself._headers = {'Authorization': credentials}\nself._http_client = httpClient(\nheaders=self._headers, timeout=Timeout(10, read=None))\nself._handshake()\nself._default_project = project or self._get_default_project(credentials)\nif set_as_global:\nself.__set_global()\ndef post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,\nproject: Project | None = None, files: Optional[Dict] = None, raise_for_status: bool = True) -> Response:\n\"\"\"POST request to the backend.\n        Args:\n            endpoint (str): POST endpoint\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(\nendpoint, data=data, json=json, files=files, project=project)\nresponse = self._http_client.post(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef get(self, endpoint: str, params: Optional[Dict] = None,\nproject: Project | None = None, cookies: Optional[Dict] = None, raise_for_status: bool = True) -> Response:\n\"\"\"GET request to the backend.\n        Args:\n            endpoint (str): GET endpoint\n            cookies (Optional[dict]): (optional) cookies data\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(endpoint, params=params,\ncookies=cookies, project=project)\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef get_static_file(self, endpoint: str, project: Project | None = None, raise_for_status: bool = True) -> Response:\n\"\"\"Retrieve a static file from the backend.\n        Args:\n            endpoint (str): GET endpoint\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(endpoint, project=project)\nurl_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef _handshake(self):\n\"\"\"Client handshake.\n        It is used to determine is the client can connect with its\n        current authorization token.\n        \"\"\"\nresponse = self.get('/profiles', params={}, raise_for_status=False)\nif 
response.status_code == Client.codes.FOUND:\nparser = LinkExtractor()\nparser.feed(response.text)\nraise ClientHandshakeError(auth_link=parser.link)\ndef _get_default_project(self, token: str):\nresponse = self.get('/profiles/me', params={}, cookies={'access_token': token})\ndata: Dict = response.json()\nreturn data['myWorkspace']\ndef __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\njson: Optional[Dict] = None, project: Project | None = None, files: Optional[Dict] = None,\ncookies: Optional[Dict] = None) -> Dict:\n\"\"\"Build a request for the backend.\n        Args:\n            endpoint (str): backend endpoint\n            params (Optional[dict]): URL parameters\n            data (Optional[Project]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            cookies (Optional[dict]): (optional) cookies data\n        Returns:\n            dictionary containing the information to perform a request\n        \"\"\"\n_params = params if params is not None else {\n'ns': project or self._default_project\n}\nurl_data = {\n'url': f'{self._scheme}://{self._base_url}/api{endpoint}',\n'headers': self._headers,\n'params': _params,\n}\nif data is not None:\nurl_data['data'] = data\nif json is not None:\nurl_data['json'] = json\nif files is not None:\nurl_data['files'] = files\nif cookies is not None:\nurl_data['cookies'] = cookies\nreturn url_data\ndef __set_global(self) -> None:\n\"\"\"Sets a client instance as global.\"\"\"\n# If the client is stateful, close it gracefully!\nClient.GLOBAL_CLIENT = self\ndef __raise_for_status(self, response: Response) -> None:\n\"\"\"Raise an exception if the response is not OK.\n        When an exception is raised, we try to convert it to a ResponseError which is\n        a wrapper around a backend error. This usually gives enough context and provides\n        nice error message.\n        If it cannot be converted to ResponseError, it is re-raised.\n        Args:\n            response (Response): response to analyze\n        \"\"\"\ntry:\nresponse.raise_for_status()\nexcept HTTPStatusError as e:\nwith suppress(Exception):\ne = ResponseError(**response.json())\nraise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__build_url","title":"__build_url(endpoint, params=None, data=None, json=None, project=None, files=None, cookies=None)","text":"

    Build a request for the backend.

    Parameters:

    Name Type Description Default endpoint str

    backend endpoint

    required params Optional[dict]

    URL parameters

    None data Optional[Project]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None cookies Optional[dict]

    (optional) cookies data

    None

    Returns:

    Type Description Dict

    dictionary containing the information to perform a request

    Source code in ydata/sdk/common/client/client.py
    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\njson: Optional[Dict] = None, project: Project | None = None, files: Optional[Dict] = None,\ncookies: Optional[Dict] = None) -> Dict:\n\"\"\"Build a request for the backend.\n    Args:\n        endpoint (str): backend endpoint\n        params (Optional[dict]): URL parameters\n        data (Optional[Project]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        cookies (Optional[dict]): (optional) cookies data\n    Returns:\n        dictionary containing the information to perform a request\n    \"\"\"\n_params = params if params is not None else {\n'ns': project or self._default_project\n}\nurl_data = {\n'url': f'{self._scheme}://{self._base_url}/api{endpoint}',\n'headers': self._headers,\n'params': _params,\n}\nif data is not None:\nurl_data['data'] = data\nif json is not None:\nurl_data['json'] = json\nif files is not None:\nurl_data['files'] = files\nif cookies is not None:\nurl_data['cookies'] = cookies\nreturn url_data\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__raise_for_status","title":"__raise_for_status(response)","text":"

    Raise an exception if the response is not OK.

    When an exception is raised, we try to convert it to a ResponseError which is a wrapper around a backend error. This usually gives enough context and provides nice error message.

    If it cannot be converted to ResponseError, it is re-raised.

    Parameters:

    Name Type Description Default response Response

    response to analyze

    required Source code in ydata/sdk/common/client/client.py
    def __raise_for_status(self, response: Response) -> None:\n\"\"\"Raise an exception if the response is not OK.\n    When an exception is raised, we try to convert it to a ResponseError which is\n    a wrapper around a backend error. This usually gives enough context and provides\n    nice error message.\n    If it cannot be converted to ResponseError, it is re-raised.\n    Args:\n        response (Response): response to analyze\n    \"\"\"\ntry:\nresponse.raise_for_status()\nexcept HTTPStatusError as e:\nwith suppress(Exception):\ne = ResponseError(**response.json())\nraise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__set_global","title":"__set_global()","text":"

    Sets a client instance as global.

    Source code in ydata/sdk/common/client/client.py
    def __set_global(self) -> None:\n\"\"\"Sets a client instance as global.\"\"\"\n# If the client is stateful, close it gracefully!\nClient.GLOBAL_CLIENT = self\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get","title":"get(endpoint, params=None, project=None, cookies=None, raise_for_status=True)","text":"

    GET request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required cookies Optional[dict]

    (optional) cookies data

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get(self, endpoint: str, params: Optional[Dict] = None,\nproject: Project | None = None, cookies: Optional[Dict] = None, raise_for_status: bool = True) -> Response:\n\"\"\"GET request to the backend.\n    Args:\n        endpoint (str): GET endpoint\n        cookies (Optional[dict]): (optional) cookies data\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(endpoint, params=params,\ncookies=cookies, project=project)\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get_static_file","title":"get_static_file(endpoint, project=None, raise_for_status=True)","text":"

    Retrieve a static file from the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get_static_file(self, endpoint: str, project: Project | None = None, raise_for_status: bool = True) -> Response:\n\"\"\"Retrieve a static file from the backend.\n    Args:\n        endpoint (str): GET endpoint\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(endpoint, project=project)\nurl_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.post","title":"post(endpoint, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    POST request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,\nproject: Project | None = None, files: Optional[Dict] = None, raise_for_status: bool = True) -> Response:\n\"\"\"POST request to the backend.\n    Args:\n        endpoint (str): POST endpoint\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(\nendpoint, data=data, json=json, files=files, project=project)\nresponse = self._http_client.post(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/types/","title":"Types","text":""},{"location":"sdk/reference/api/connectors/connector/","title":"Connector","text":"

    Bases: ModelFactoryMixin

    A Connector allows you to connect to and access data stored in various places. The list of available connectors can be found here.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    None credentials dict

    Connector credentials

    None name Optional[str]

    (optional) Connector name

    None client Client

    (optional) Client to connect to the backend

    None

    Attributes:

    Name Type Description uid UID

    UID of the connector instance (created internally)

    type ConnectorType

    Type of the connector

    Source code in ydata/sdk/connectors/connector.py
    class Connector(ModelFactoryMixin):\n\"\"\"A [`Connector`][ydata.sdk.connectors.Connector] allows to connect and\n    access data stored in various places. The list of available connectors can\n    be found [here][ydata.sdk.connectors.ConnectorType].\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        client (Client): (optional) Client to connect to the backend\n    Attributes:\n        uid (UID): UID fo the connector instance (creating internally)\n        type (ConnectorType): Type of the connector\n    \"\"\"\ndef __init__(self, connector_type: Union[ConnectorType, str] = None, credentials: Optional[Dict] = None,  name: Optional[str] = None, client: Optional[Client] = None):\nself._init_common(client=client)\nself._model: Optional[mConnector] = self._create_model(\nconnector_type, credentials, name, client=client)\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\n@property\ndef uid(self) -> UID:\nreturn self._model.uid\n@property\ndef type(self) -> str:\nreturn self._model.type\n@staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Get an existing connector.\n        Arguments:\n            uid (UID): Connector identifier\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            Connector\n        \"\"\"\nconnectors: ConnectorsList = Connector.list(client=client)\ndata = connectors.get_by_uid(uid)\nmodel = mConnector(**data)\nconnector = ModelFactoryMixin._init_from_model_data(Connector, model)\nreturn connector\n@staticmethod\ndef _init_connector_type(connector_type: Union[ConnectorType, str]) -> ConnectorType:\nif isinstance(connector_type, str):\ntry:\nconnector_type = ConnectorType(connector_type)\nexcept Exception:\nc_list = \", \".join([c.value for c in ConnectorType])\nraise InvalidConnectorError(\nf\"ConnectorType '{connector_type}' does not exist.\\nValid connector types are: {c_list}.\")\nreturn connector_type\n@staticmethod\ndef _init_credentials(connector_type: ConnectorType, credentials: Union[str, Path, Dict, Credentials]) -> Credentials:\n_credentials = None\nif isinstance(credentials, str):\ncredentials = Path(credentials)\nif isinstance(credentials, Path):\ntry:\n_credentials = json_loads(credentials.open().read())\nexcept Exception:\nraise CredentialTypeError(\n'Could not read the credentials. Please, check your path or credentials structure.')\ntry:\nfrom ydata.sdk.connectors._models.connector_map import TYPE_TO_CLASS\ncredential_cls = TYPE_TO_CLASS.get(connector_type.value)\n_credentials = credential_cls(**_credentials)\nexcept Exception:\nraise CredentialTypeError(\n\"Could not create the credentials. 
Verify the path or the structure your credentials.\")\nreturn _credentials\n@staticmethod\ndef create(connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Create a new connector.\n        Arguments:\n            connector_type (Union[ConnectorType, str]): Type of the connector to be created\n            credentials (dict): Connector credentials\n            name (Optional[str]): (optional) Connector name\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            New connector\n        \"\"\"\nmodel = Connector._create_model(\nconnector_type=connector_type, credentials=credentials, name=name, client=client)\nconnector = ModelFactoryMixin._init_from_model_data(\nConnector, model)\nreturn connector\n@classmethod\n@init_client\ndef _create_model(cls, connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> mConnector:\n_name = name if name is not None else str(uuid4())\n_connector_type = Connector._init_connector_type(connector_type)\n_credentials = Connector._init_credentials(_connector_type, credentials)\npayload = {\n\"type\": _connector_type.value,\n\"credentials\": _credentials.dict(by_alias=True),\n\"name\": _name\n}\nresponse = client.post('/connector/', json=payload)\ndata: list = response.json()\nreturn mConnector(**data)\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> ConnectorsList:\n\"\"\"List the connectors instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of connectors\n        \"\"\"\nresponse = client.get('/connector')\ndata: list = response.json()\nreturn ConnectorsList(data)\ndef __repr__(self):\nreturn self._model.__repr__()\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.create","title":"create(connector_type, credentials, name=None, client=None) staticmethod","text":"

    Create a new connector.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    required credentials dict

    Connector credentials

    required name Optional[str]

    (optional) Connector name

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description Connector

    New connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\ndef create(connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Create a new connector.\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        New connector\n    \"\"\"\nmodel = Connector._create_model(\nconnector_type=connector_type, credentials=credentials, name=name, client=client)\nconnector = ModelFactoryMixin._init_from_model_data(\nConnector, model)\nreturn connector\n
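
    A brief usage sketch of create is shown below. It assumes Connector is importable from ydata.sdk.connectors (as the cross-references in this page suggest) and that credentials.json is a local file holding valid AWS credentials:

        from ydata.sdk.connectors import Connector

        # Create an AWS S3 connector from a local credentials file
        connector = Connector.create(
            connector_type='aws-s3',          # see ConnectorType for the accepted values
            credentials='credentials.json',   # path to a JSON credentials file
            name='my-s3-connector',
        )
        print(connector.uid, connector.type)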
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.get","title":"get(uid, client=None) staticmethod","text":"

    Get an existing connector.

    Parameters:

    Name Type Description Default uid UID

    Connector identifier

    required client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description Connector

    Connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Get an existing connector.\n    Arguments:\n        uid (UID): Connector identifier\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        Connector\n    \"\"\"\nconnectors: ConnectorsList = Connector.list(client=client)\ndata = connectors.get_by_uid(uid)\nmodel = mConnector(**data)\nconnector = ModelFactoryMixin._init_from_model_data(Connector, model)\nreturn connector\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.list","title":"list(client=None) staticmethod","text":"

    List the connectors instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description ConnectorsList

    List of connectors

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> ConnectorsList:\n\"\"\"List the connectors instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of connectors\n    \"\"\"\nresponse = client.get('/connector')\ndata: list = response.json()\nreturn ConnectorsList(data)\n
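
    The two helpers above can be combined to retrieve a specific connector by its identifier. A minimal sketch; the UID value is a placeholder:

        from ydata.sdk.connectors import Connector

        # List all connectors available in the project
        print(Connector.list())
        # Retrieve a single connector by its UID (placeholder value)
        connector = Connector.get(uid='<CONNECTOR_UID>')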
    "},{"location":"sdk/reference/api/connectors/connector/#connectortype","title":"ConnectorType","text":"

    Bases: Enum

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AWS_S3","title":"AWS_S3 = 'aws-s3' class-attribute instance-attribute","text":"

    AWS S3 connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_BLOB","title":"AZURE_BLOB = 'azure-blob' class-attribute instance-attribute","text":"

    Azure Blob connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_SQL","title":"AZURE_SQL = 'azure-sql' class-attribute instance-attribute","text":"

    AzureSQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.BIGQUERY","title":"BIGQUERY = 'google-bigquery' class-attribute instance-attribute","text":"

    BigQuery connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.FILE","title":"FILE = 'file' class-attribute instance-attribute","text":"

    File connector (placeholder)

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.GCS","title":"GCS = 'gcs' class-attribute instance-attribute","text":"

    Google Cloud Storage connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.MYSQL","title":"MYSQL = 'mysql' class-attribute instance-attribute","text":"

    MySQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.SNOWFLAKE","title":"SNOWFLAKE = 'snowflake' class-attribute instance-attribute","text":"

    Snowflake connector

    "},{"location":"sdk/reference/api/datasources/datasource/","title":"DataSource","text":"

    Bases: ModelFactoryMixin

    A DataSource represents a dataset to be used by a Synthesizer as training data.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Attributes:

    Name Type Description uid UID

    UID of the datasource instance

    datatype DataSourceType

    Data source type

    status Status

    Status of the datasource

    metadata Metadata

    Metadata associated to the datasource

    Source code in ydata/sdk/datasources/datasource.py
    class DataSource(ModelFactoryMixin):\n\"\"\"A [`DataSource`][ydata.sdk.datasources.DataSource] represents a dataset\n    to be used by a Synthesizer as training data.\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n    Attributes:\n        uid (UID): UID fo the datasource instance\n        datatype (DataSourceType): Data source type\n        status (Status): Status of the datasource\n        metadata (Metadata): Metadata associated to the datasource\n    \"\"\"\ndef __init__(self, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config):\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nself._init_common(client=client)\nself._model: Optional[mDataSource] = self._create_model(\nconnector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, client=self._client)\nif wait_for_metadata:\nself._model = DataSource._wait_for_metadata(self)._model\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\n@property\ndef uid(self) -> UID:\nreturn self._model.uid\n@property\ndef datatype(self) -> DataSourceType:\nreturn self._model.datatype\n@property\ndef status(self) -> Status:\ntry:\nself._model = self.get(self._model.uid, self._client)._model\nreturn self._model.status\nexcept Exception:  # noqa: PIE786\nreturn Status.UNKNOWN\n@property\ndef metadata(self) -> Metadata:\nreturn self._model.metadata\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> DataSourceList:\n\"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n        instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of datasources\n        \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/datasource')\ndata: list = response.json()\ndata = __process_data(data)\nreturn DataSourceList(data)\n@staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"DataSource\":\n\"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            uid (UID): DataSource identifier\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            DataSource\n        \"\"\"\nresponse = client.get(f'/datasource/{uid}')\ndata: list = response.json()\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(\nConnectorType(data['connector']['type']))\nmodel = DataSource._model_from_api(data, datasource_type)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nreturn datasource\n@classmethod\ndef create(cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config) -> \"DataSource\":\n\"\"\"Create a new 
[`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            connector (Connector): Connector from which the datasource is created\n            datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n            name (Optional[str]): (optional) DataSource name\n            wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n            client (Client): (optional) Client to connect to the backend\n            **config: Datasource specific configuration\n        Returns:\n            DataSource\n        \"\"\"\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nreturn cls._create(connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, wait_for_metadata=wait_for_metadata, client=client)\n@classmethod\ndef _create(cls, connector: Connector, datasource_type: Type[mDataSource], datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None) -> \"DataSource\":\nmodel = DataSource._create_model(\nconnector, datasource_type, datatype, config, name, client)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nif wait_for_metadata:\ndatasource._model = DataSource._wait_for_metadata(datasource)._model\nreturn datasource\n@classmethod\n@init_client\ndef _create_model(cls, connector: Connector, datasource_type: Type[mDataSource], datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None, name: Optional[str] = None, client: Optional[Client] = None) -> mDataSource:\n_name = name if name is not None else str(uuid4())\n_config = config if config is not None else {}\npayload = {\n\"name\": _name,\n\"connector\": {\n\"uid\": connector.uid,\n\"type\": connector.type.value\n},\n\"dataType\": datatype.value\n}\nif connector.type != ConnectorType.FILE:\n_config = datasource_type(**config).to_payload()\npayload.update(_config)\nresponse = client.post('/datasource/', json=payload)\ndata: list = response.json()\nreturn DataSource._model_from_api(data, datasource_type)\n@staticmethod\ndef _wait_for_metadata(datasource):\nlogger = create_logger(__name__, level=LOG_LEVEL)\nwhile datasource.status not in [Status.AVAILABLE, Status.FAILED, Status.UNAVAILABLE]:\nlogger.info(f'Calculating metadata [{datasource.status}]')\ndatasource = DataSource.get(uid=datasource.uid, client=datasource._client)\nsleep(BACKOFF)\nreturn datasource\n@staticmethod\ndef _resolve_api_status(api_status: Dict) -> Status:\nstatus = Status(api_status.get('state', Status.UNKNOWN.name))\nvalidation = ValidationState(api_status.get('validation', {}).get(\n'state', ValidationState.UNKNOWN.name))\nif validation == ValidationState.FAILED:\nstatus = Status.FAILED\nreturn status\n@staticmethod\ndef _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:\ndata['datatype'] = data.pop('dataType')\ndata['state'] = data['status']\ndata['status'] = DataSource._resolve_api_status(data['status'])\ndata = filter_dict(datasource_type, data)\nmodel = datasource_type(**data)\nreturn model\ndef __repr__(self):\nreturn self._model.__repr__()\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.create","title":"create(connector, datatype=DataSourceType.TABULAR, name=None, wait_for_metadata=True, client=None, **config) classmethod","text":"

    Create a new DataSource.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @classmethod\ndef create(cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config) -> \"DataSource\":\n\"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n    Returns:\n        DataSource\n    \"\"\"\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nreturn cls._create(connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, wait_for_metadata=wait_for_metadata, client=client)\n
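A minimal usage sketch of `create` (the connector instance and the datasource name are illustrative assumptions, not part of the reference above):

```python
from ydata.sdk.datasources import DataSource, DataSourceType

# `connector` is assumed to be an existing, valid Connector instance
# (for example, one previously created through the SDK or the Fabric UI).
datasource = DataSource.create(
    connector=connector,
    datatype=DataSourceType.TABULAR,   # default; use TIMESERIES for temporal data
    name="my-datasource",              # optional; a random UUID is used when omitted
    wait_for_metadata=True,            # block until the metadata is fully calculated
)
print(datasource.uid, datasource.status)
```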
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.get","title":"get(uid, client=None) staticmethod","text":"

    Get an existing DataSource.

    Parameters:

    Name Type Description Default uid UID

    DataSource identifier

    required client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"DataSource\":\n\"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        uid (UID): DataSource identifier\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        DataSource\n    \"\"\"\nresponse = client.get(f'/datasource/{uid}')\ndata: list = response.json()\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(\nConnectorType(data['connector']['type']))\nmodel = DataSource._model_from_api(data, datasource_type)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nreturn datasource\n
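For example, retrieving a datasource by its identifier (the UID below is a placeholder):

```python
from ydata.sdk.datasources import DataSource

ds = DataSource.get(uid="<DATASOURCE_UID>")   # placeholder identifier
print(ds.datatype, ds.status)
```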
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.list","title":"list(client=None) staticmethod","text":"

    List the DataSource instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSourceList

    List of datasources

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> DataSourceList:\n\"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n    instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of datasources\n    \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/datasource')\ndata: list = response.json()\ndata = __process_data(data)\nreturn DataSourceList(data)\n
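A short sketch of listing datasources (assuming the client is configured through the YDATA_TOKEN environment variable, as in the SDK examples elsewhere in these docs):

```python
import os
from ydata.sdk.datasources import DataSource

os.environ["YDATA_TOKEN"] = "<TOKEN>"   # remove if already defined
print(DataSource.list())
```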
    "},{"location":"sdk/reference/api/datasources/datasource/#status","title":"Status","text":"

    Bases: StringEnum

    Represent the status of a DataSource.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.AVAILABLE","title":"AVAILABLE = 'available' class-attribute instance-attribute","text":"

    The DataSource is available and ready to be used.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.DELETED","title":"DELETED = 'deleted' class-attribute instance-attribute","text":"

    The DataSource is to be deleted or has been deleted.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.FAILED","title":"FAILED = 'failed' class-attribute instance-attribute","text":"

    The DataSource preparation or validation has failed.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.PREPARING","title":"PREPARING = 'preparing' class-attribute instance-attribute","text":"

    The DataSource is being prepared.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.UNAVAILABLE","title":"UNAVAILABLE = 'unavailable' class-attribute instance-attribute","text":"

    The DataSource is unavailable at the moment.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.UNKNOWN","title":"UNKNOWN = 'unknown' class-attribute instance-attribute","text":"

    The DataSource status could not be retrieved.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.Status.VALIDATING","title":"VALIDATING = 'validating' class-attribute instance-attribute","text":"

    The DataSource is being validated.

    "},{"location":"sdk/reference/api/datasources/datasource/#datasourcetype","title":"DataSourceType","text":"

    Bases: StringEnum

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TABULAR","title":"TABULAR = 'tabular' class-attribute instance-attribute","text":"

    The DataSource is tabular (i.e. it does not have a temporal dimension).

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TIMESERIES","title":"TIMESERIES = 'timeseries' class-attribute instance-attribute","text":"

    The DataSource has a temporal dimension.

    "},{"location":"sdk/reference/api/datasources/metadata/","title":"Metadata","text":"

    Bases: BaseModel

The Metadata object contains descriptive information about a DataSource.

    Attributes:

    Name Type Description columns List[Column]

    columns information
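A brief sketch of inspecting a datasource's metadata (only the column `name` attribute is taken from the source shown above; other column attributes may exist):

```python
from ydata.sdk.datasources import DataSource

ds = DataSource.get(uid="<DATASOURCE_UID>")   # placeholder identifier
for column in ds.metadata.columns:
    print(column.name)                        # column name, as used by the SDK internally
```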

    "},{"location":"sdk/reference/api/synthesizers/base/","title":"Synthesizer","text":"

    Bases: ABC, ModelFactoryMixin

    Main synthesizer class.

    This class cannot be instantiated directly because of the differences between the RegularSynthesizer and TimeSeriesSynthesizer sample methods.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    @typechecked\nclass BaseSynthesizer(ABC, ModelFactoryMixin):\n\"\"\"Main synthesizer class.\n    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer] and [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] `sample` methods.\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\ndef __init__(self, uid: UID | None = None, name: str | None = None, project: Project | None = None, client: Client | None = None):\nself._init_common(client=client)\nself._model = mSynthesizer(uid=uid, name=name or str(\nuuid4())) if uid or project else None\nself.__project = project\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n        The argument `exclude_cols` has precedence over `generate_cols`, i.e. 
a column `col` will not be generated if it is in both list.\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target for the dataset\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nif self._is_initialized():\nraise AlreadyFittedError()\n_datatype = DataSourceType(datatype) if isinstance(\nX, pdDataFrame) else DataSourceType(X.datatype)\ndataset_attrs = self._init_datasource_attributes(\nsortbykey, entities, generate_cols, exclude_cols, dtypes)\nself._validate_datasource_attributes(X, dataset_attrs, _datatype, target)\n# If the training data is a pandas dataframe, we first need to create a data source and then the instance\nif isinstance(X, pdDataFrame):\nif X.empty:\nraise EmptyDataError(\"The DataFrame is empty\")\n_X = LocalDataSource(source=X, datatype=_datatype, client=self._client)\nelse:\nif datatype != _datatype:\nwarn(\"When the training data is a DataSource, the argument `datatype` is ignored.\",\nDataSourceTypeWarning)\n_X = X\nif _X.status != dsStatus.AVAILABLE:\nraise DataSourceNotAvailableError(\nf\"The datasource '{_X.uid}' is not available (status = {_X.status.value})\")\nif isinstance(dataset_attrs, dict):\ndataset_attrs = DataSourceAttrs(**dataset_attrs)\nself._fit_from_datasource(\nX=_X, dataset_attrs=dataset_attrs, target=target,\nanonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n@staticmethod\ndef _init_datasource_attributes(\nsortbykey: Optional[Union[str, List[str]]],\nentities: Optional[Union[str, List[str]]],\ngenerate_cols: Optional[List[str]],\nexclude_cols: Optional[List[str]],\ndtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:\ndataset_attrs = {\n'sortbykey': sortbykey if sortbykey is not None else [],\n'entities': entities if entities is not None else [],\n'generate_cols': generate_cols if generate_cols is not None else [],\n'exclude_cols': exclude_cols if exclude_cols is not None else [],\n'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}\n}\nreturn DataSourceAttrs(**dataset_attrs)\n@staticmethod\ndef _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):\ncolumns = []\nif isinstance(X, pdDataFrame):\ncolumns = X.columns\nif datatype is None:\nraise DataTypeMissingError(\n\"Argument `datatype` is mandatory for pandas.DataFrame training data\")\ndatatype = DataSourceType(datatype)\nelse:\ncolumns = [c.name for c in 
X.metadata.columns]\nif target is not None and target not in columns:\nraise DataSourceAttrsError(\n\"Invalid target: column '{target}' does not exist\")\nif datatype == DataSourceType.TIMESERIES:\nif not dataset_attrs.sortbykey:\nraise DataSourceAttrsError(\n\"The argument `sortbykey` is mandatory for timeseries datasource.\")\ninvalid_fields = {}\nfor field, v in dataset_attrs.dict().items():\nfield_columns = v if field != 'dtypes' else v.keys()\nnot_in_cols = [c for c in field_columns if c not in columns]\nif len(not_in_cols) > 0:\ninvalid_fields[field] = not_in_cols\nif len(invalid_fields) > 0:\nerror_msgs = [\"\\t- Field '{}': columns {} do not exist\".format(\nf, ', '.join(v)) for f, v in invalid_fields.items()]\nraise DataSourceAttrsError(\n\"The dataset attributes are invalid:\\n {}\".format('\\n'.join(error_msgs)))\n@staticmethod\ndef _metadata_to_payload(\ndatatype: DataSourceType, ds_metadata: Metadata,\ndataset_attrs: Optional[DataSourceAttrs] = None, target: str | None = None\n) -> dict:\n\"\"\"Transform a the metadata and dataset attributes into a valid\n        payload.\n        Arguments:\n            datatype (DataSourceType): datasource type\n            ds_metadata (Metadata): datasource metadata object\n            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes\n            target (Optional[str]): (optional) target column name\n        Returns:\n            metadata payload dictionary\n        \"\"\"\ncolumns = [\n{\n'name': c.name,\n'generation': True and c.name not in dataset_attrs.exclude_cols,\n'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,\n'varType': c.vartype,\n}\nfor c in ds_metadata.columns]\nmetadata = {\n'columns': columns,\n'target': target\n}\nif dataset_attrs is not None:\nif datatype == DataSourceType.TIMESERIES:\nmetadata['sortBy'] = [c for c in dataset_attrs.sortbykey]\nmetadata['entity'] = [c for c in dataset_attrs.entities]\nreturn metadata\ndef _fit_from_datasource(\nself,\nX: DataSource,\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndataset_attrs: Optional[DataSourceAttrs] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None\n) -> None:\nmetadata = self._metadata_to_payload(\nDataSourceType(X.datatype), X.metadata, dataset_attrs, target)\npayload = {\n'name': self._model.name,\n'dataSourceUID': X.uid,\n'metadata': metadata,\n'extraData': {},\n'privacyLevel': privacy_level.value\n}\nif anonymize is not None:\npayload[\"extraData\"][\"anonymize\"] = anonymize\nif condition_on is not None:\npayload[\"extraData\"][\"condition_on\"] = condition_on\nresponse = self._client.post(\n'/synthesizer/', json=payload, project=self.__project)\ndata: list = response.json()\nself._model, _ = self._model_from_api(X.datatype, data)\nwhile self.status not in [Status.READY, Status.FAILED]:\nself._logger.info('Training the synthesizer...')\nsleep(BACKOFF)\nif self.status == Status.FAILED:\nraise FittingError('Could not train the synthesizer')\n@staticmethod\ndef _model_from_api(datatype: str, data: Dict) -> Tuple[mSynthesizer, Type[\"BaseSynthesizer\"]]:\nfrom ydata.sdk.synthesizers._models.synthesizer_map import TYPE_TO_CLASS\nsynth_cls = TYPE_TO_CLASS.get(SynthesizerType(datatype).value)\ndata['status'] = synth_cls._resolve_api_status(data['status'])\ndata = filter_dict(mSynthesizer, data)\nreturn mSynthesizer(**data), synth_cls\n@abstractmethod\ndef sample(self) -> pdDataFrame:\n\"\"\"Abstract method to 
sample from a synthesizer.\"\"\"\ndef _sample(self, payload: Dict) -> pdDataFrame:\n\"\"\"Sample from a synthesizer.\n        Arguments:\n            payload (dict): payload configuring the sample request\n        Returns:\n            pandas `DataFrame`\n        \"\"\"\nresponse = self._client.post(\nf\"/synthesizer/{self.uid}/sample\", json=payload, project=self.__project)\ndata: Dict = response.json()\nsample_uid = data.get('uid')\nsample_status = None\nwhile sample_status not in ['finished', 'failed']:\nself._logger.info('Sampling from the synthesizer...')\nresponse = self._client.get(\nf'/synthesizer/{self.uid}/history', project=self.__project)\nhistory: Dict = response.json()\nsample_data = next((s for s in history if s.get('uid') == sample_uid), None)\nsample_status = sample_data.get('status', {}).get('state')\nsleep(BACKOFF)\nresponse = self._client.get_static_file(\nf'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self.__project)\ndata = StringIO(response.content.decode())\nreturn read_csv(data)\n@property\ndef uid(self) -> UID:\n\"\"\"Get the status of a synthesizer instance.\n        Returns:\n            Synthesizer status\n        \"\"\"\nif not self._is_initialized():\nreturn Status.NOT_INITIALIZED\nreturn self._model.uid\n@property\ndef status(self) -> Status:\n\"\"\"Get the status of a synthesizer instance.\n        Returns:\n            Synthesizer status\n        \"\"\"\nif not self._is_initialized():\nreturn Status.NOT_INITIALIZED\ntry:\nself = self.get(self._model.uid, self._client)\nreturn self._model.status\nexcept Exception:  # noqa: PIE786\nreturn Status.UNKNOWN\ndef get(self):\nassert self._is_initialized() and self._model.uid, InputError(\n\"Please provide the synthesizer `uid`\")\nresponse = self._client.get(f'/synthesizer/{self.uid}', project=self.__project)\ndata = filter_dict(mSynthesizer, response.json())\nself._model = mSynthesizer(**data)\nreturn self\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n\"\"\"List the synthesizer instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of synthesizers\n        \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata', 'report', 'mode']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/synthesizer')\ndata: list = response.json()\ndata = __process_data(data)\nreturn SynthesizersList(data)\ndef _is_initialized(self) -> bool:\n\"\"\"Determine if a synthesizer is instanciated or not.\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\nreturn self._model is not None\n@staticmethod\ndef _resolve_api_status(api_status: Dict) -> Status:\n\"\"\"Determine the status of the Synthesizer.\n        The status of the synthesizer instance is determined by the state of\n        its different components.\n        Arguments:\n            api_status (dict): json from the endpoint GET /synthesizer\n        Returns:\n            Synthesizer Status\n        \"\"\"\nstatus = Status(api_status.get('state', Status.UNKNOWN.name))\nif status == Status.PREPARE:\nif PrepareState(api_status.get('prepare', {}).get(\n'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:\nreturn Status.FAILED\nelif status == Status.TRAIN:\nif TrainingState(api_status.get('training', {}).get(\n'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:\nreturn Status.FAILED\nelif status == Status.REPORT:\nreturn 
Status.READY\nreturn status\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.status","title":"status: Status property","text":"

    Get the status of a synthesizer instance.

    Returns:

    Type Description Status

    Synthesizer status

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.uid","title":"uid: UID property","text":"

    Get the unique identifier (UID) of a synthesizer instance.

    Returns:

    Type Description UID

    Synthesizer UID

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource. When the training dataset is a pandas DataFrame, the argument datatype is required as it cannot be deduced.

    The argument sortbykey is mandatory for TimeSeries.

    By default, if generate_cols or exclude_cols are not specified, all columns are generated by the synthesizer. The argument exclude_cols has precedence over generate_cols, i.e. a column col will not be generated if it appears in both lists.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY datatype Optional[Union[DataSourceType, str]]

    (optional) Dataset datatype - required if X is a pandas.DataFrame

    None sortbykey Union[str, List[str]]

    (optional) column(s) to use to sort timeseries datasets

    None entities Union[str, List[str]]

    (optional) columns representing entities ID

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target for the dataset

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (Optional[List[str]]): (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    def fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n    The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target for the dataset\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\nif self._is_initialized():\nraise AlreadyFittedError()\n_datatype = DataSourceType(datatype) if isinstance(\nX, pdDataFrame) else DataSourceType(X.datatype)\ndataset_attrs = self._init_datasource_attributes(\nsortbykey, entities, generate_cols, exclude_cols, dtypes)\nself._validate_datasource_attributes(X, dataset_attrs, _datatype, target)\n# If the training data is a pandas dataframe, we first need to create a data source and then the instance\nif isinstance(X, pdDataFrame):\nif X.empty:\nraise EmptyDataError(\"The DataFrame is empty\")\n_X = LocalDataSource(source=X, datatype=_datatype, client=self._client)\nelse:\nif datatype != _datatype:\nwarn(\"When the training data is a DataSource, the argument `datatype` is ignored.\",\nDataSourceTypeWarning)\n_X = X\nif _X.status != dsStatus.AVAILABLE:\nraise DataSourceNotAvailableError(\nf\"The datasource '{_X.uid}' is not available (status = {_X.status.value})\")\nif isinstance(dataset_attrs, dict):\ndataset_attrs = DataSourceAttrs(**dataset_attrs)\nself._fit_from_datasource(\nX=_X, dataset_attrs=dataset_attrs, target=target,\nanonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n
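A hedged sketch of fitting on a pandas DataFrame (the DataFrame contents are illustrative; RegularSynthesizer is used because this base class is abstract):

```python
import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

# Illustrative training data; replace with your own dataset.
X = pd.DataFrame({
    "age": [25, 32, 47, 51],
    "city": ["Porto", "Lisbon", "Braga", "Faro"],
    "income": [30_000, 45_000, 52_000, 61_000],
})

synth = RegularSynthesizer(name="docs-example")
# `exclude_cols` has precedence over `generate_cols`: excluded columns are never synthesized.
synth.fit(X, exclude_cols=["income"])
```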
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.list","title":"list(client=None) staticmethod","text":"

    List the synthesizer instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description SynthesizersList

    List of synthesizers

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n\"\"\"List the synthesizer instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of synthesizers\n    \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata', 'report', 'mode']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/synthesizer')\ndata: list = response.json()\ndata = __process_data(data)\nreturn SynthesizersList(data)\n
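The same pattern applies from any concrete subclass, for example:

```python
from ydata.sdk.synthesizers import RegularSynthesizer

# `list` is inherited from the base class by every concrete synthesizer.
print(RegularSynthesizer.list())
```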
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.sample","title":"sample() abstractmethod","text":"

    Abstract method to sample from a synthesizer.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @abstractmethod\ndef sample(self) -> pdDataFrame:\n\"\"\"Abstract method to sample from a synthesizer.\"\"\"\n
    "},{"location":"sdk/reference/api/synthesizers/base/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy
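A brief sketch of selecting a privacy level at training time (the census dataset helper is reused from the examples later in these docs):

```python
from ydata.sdk.dataset import get_dataset
from ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer

X = get_dataset("census")
synth = RegularSynthesizer()
synth.fit(X, privacy_level=PrivacyLevel.HIGH_PRIVACY)  # trade some fidelity for stronger privacy
```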

    "},{"location":"sdk/reference/api/synthesizers/regular/","title":"Regular","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/regular.py
    class RegularSynthesizer(BaseSynthesizer):\ndef sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n        instance.\n        Arguments:\n            n_samples (int): number of rows in the sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n        Returns:\n            synthetic data\n        \"\"\"\nif n_samples < 1:\nraise InputError(\"Parameter 'n_samples' must be greater than 0\")\npayload = {\"numberOfRecords\": n_samples}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target column\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\ngenerate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\ntarget=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\ndef __repr__(self):\nif self._model is not None:\nreturn self._model.__repr__()\nelse:\nreturn \"RegularSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY entities Union[str, List[str]]

    (optional) columns representing entities ID

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target column

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (Optional[List[str]]): (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/regular.py
    def fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target column\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\ngenerate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\ntarget=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.sample","title":"sample(n_samples=1, condition_on=None)","text":"

    Sample from a RegularSynthesizer instance.

    Parameters:

    Name Type Description Default n_samples int

    number of rows in the sample

    1 condition_on Optional[dict]

    (Optional[dict]): (optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/regular.py
    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n    instance.\n    Arguments:\n        n_samples (int): number of rows in the sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n    Returns:\n        synthetic data\n    \"\"\"\nif n_samples < 1:\nraise InputError(\"Parameter 'n_samples' must be greater than 0\")\npayload = {\"numberOfRecords\": n_samples}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\n
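A minimal sampling sketch (`synth` is assumed to be a RegularSynthesizer whose `fit` has already completed):

```python
# `synth` is assumed to be an already fitted RegularSynthesizer.
sample = synth.sample(n_samples=100)   # request 100 synthetic rows
print(sample.head())
```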
    "},{"location":"sdk/reference/api/synthesizers/regular/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"sdk/reference/api/synthesizers/timeseries/","title":"TimeSeries","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/timeseries.py
    class TimeSeriesSynthesizer(BaseSynthesizer):\ndef sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n        If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n        A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n        Arguments:\n            n_entities (int): number of entities to sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n        Returns:\n            synthetic data\n        \"\"\"\nif n_entities is not None and n_entities < 1:\nraise InputError(\"Parameter 'n_entities' must be greater than 0\")\npayload = {\"numberOfRecords\": n_entities}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nsortbykey: Optional[Union[str, List[str]]],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Metadata associated to the datasource\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\nentities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\ndtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\ndef __repr__(self):\nif self._model is not None:\nreturn self._model.__repr__()\nelse:\nreturn \"TimeSeriesSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.fit","title":"fit(X, sortbykey, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required sortbykey Union[str, List[str]]

    column(s) to use to sort timeseries datasets

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY entities Union[str, List[str]]

    (optional) columns representing entities ID

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Metadata associated to the datasource

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (Optional[List[str]]): (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/timeseries.py
    def fit(self, X: Union[DataSource, pdDataFrame],\nsortbykey: Optional[Union[str, List[str]]],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Metadata associated to the datasource\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\nentities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\ndtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.sample","title":"sample(n_entities, condition_on=None)","text":"

    Sample from a TimeSeriesSynthesizer instance.

    If the training dataset did not use any entity column, the Synthesizer assumes a single entity. A TimeSeriesSynthesizer always samples the full trajectory of its entities.

    Parameters:

    Name Type Description Default n_entities int

    number of entities to sample

    required condition_on Optional[dict]

    (Optional[dict]): (optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/timeseries.py
    def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n    If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n    A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n    Arguments:\n        n_entities (int): number of entities to sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n    Returns:\n        synthetic data\n    \"\"\"\nif n_entities is not None and n_entities < 1:\nraise InputError(\"Parameter 'n_entities' must be greater than 0\")\npayload = {\"numberOfRecords\": n_entities}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\n
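A minimal sampling sketch (`synth` is assumed to be a TimeSeriesSynthesizer fitted with an entities column):

```python
# `synth` is assumed to be an already fitted TimeSeriesSynthesizer.
sample = synth.sample(n_entities=5)   # full trajectories for 5 synthetic entities
print(sample.shape)
```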
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"support/help-troubleshooting/","title":"Help & Troubleshooting","text":""}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Welcome","text":"

    YData Fabric is a Data-Centric AI development platform that accelerates AI development by helping data practitioners achieve production-quality data.

    Much like code quality is a must for successful software development, Fabric accounts for the data quality requirements of data-driven applications. It introduces standards, processes, and acceleration to empower data science, analytics, and data engineering teams.

    "},{"location":"#try-fabric","title":"Try Fabric","text":"
    • Get started with Fabric Community
    "},{"location":"#why-adopt-ydata-fabric","title":"Why adopt YData Fabric?","text":"

    With Fabric, you can standardize the understanding of your data, quickly identify data quality issues, streamline and version your data preparation workflows, and finally leverage synthetic data for privacy compliance or as a tool to boost ML performance. Fabric is a development environment that supports a faster and easier process of preparing data for AI development. Data practitioners are using Fabric to:

    • Establish a centralized and collaborative repository for data projects.
    • Create and share comprehensive documentation of data, encompassing data schema, structure, and personally identifiable information (PII).
    • Prevent data quality issues with standardized data quality profiling, providing visual understanding and warnings on potential issues.
    • Accelerate data preparation with customizable recipes.
    • Improve machine learning performance with optimal data preparation through solutions such as synthetic data.
    • Shorten access to data with privacy-compliant synthetic data generation.
    • Build and streamline data preparation workflows effortlessly through a user-friendly drag-and-drop interface.
    • Efficiently manage business rules, conduct comparisons, and implement version control for data workflows using pipelines.
    "},{"location":"#key-features","title":"\ud83d\udcdd Key features","text":""},{"location":"#data-catalog","title":"Data Catalog","text":"

    Fabric Data Catalog provides a centralized perspective on datasets on a per-project basis, optimizing data management through seamless integration with the organization's existing data architectures via scalable connectors (e.g., MySQL, Google Cloud Storage, AWS S3). It standardizes data quality profiling, streamlining the processes of efficient data cleaning and preparation, while also automating the identification of Personally Identifiable Information (PII) to facilitate compliance with privacy regulations.

    Explore how a Data Catalog helps through a centralized repository of your datasets, schema validation, and automated data profiling.

    "},{"location":"#labs","title":"Labs","text":"

    Fabric's Labs environments provide collaborative, scalable, and secure workspaces layered on a flexible infrastructure, enabling users to seamlessly switch between CPUs and GPUs based on their computational needs. Labs are familiar environments that empower data developers with powerful IDEs (Jupyter Notebooks, Visual Studio Code, or H2O Flow) and a seamless experience with the tools they already love, combined with YData's cutting-edge SDK for data preparation.

    Learn how to use the Labs to generate synthetic data in a familiar Python interface.

    "},{"location":"#synthetic-data","title":"Synthetic data","text":"

    Synthetic data, enabled by YData Fabric, provides data developers with user-friendly interfaces (UI and code) for generating artificial datasets, offering a versatile solution across formats such as tabular, time-series, and multi-table datasets. The generated synthetic data holds the same value as the original, aligns closely with specific business rules, and contributes to enhancing machine learning models, mitigating privacy concerns, and making data development more robust. Fabric's synthetic data is easy to adapt and configure, allowing customization of the privacy-utility trade-off.

    Learn how to create high-quality synthetic data within a user-friendly UI using Fabric\u2019s data synthesis flow.

    "},{"location":"#pipelines","title":"Pipelines","text":"

    Fabric Pipelines streamlines data preparation workflows by automating, orchestrating, and optimizing data pipelines, providing benefits such as flexibility, scalability, monitoring, and reproducibility for efficient and reliable data processing. The intuitive drag-and-drop interface, leveraging Jupyter notebooks or Python scripts, expedites the pipeline setup process, providing data developers with a quick and user-friendly experience.

    Explore how you can leverage Fabric Pipelines to build versionable and reproducible data preparation workflows for ML development.

    "},{"location":"#tutorials","title":"Tutorials","text":"

    To understand how to best apply Fabric to your use cases, start by exploring the following tutorials:

    • Handling Imbalanced Data for Improved Fraud Detection: Learn how to implement high-performing fraud detection models by incorporating synthetic data to balance your datasets.

    • Prediction with Quality Inspection: Learn how to develop data preparation workflows with automated data quality checks and Pipelines.

    • Generating Synthetic Data for Financial Transactions: Learn how to use synthetic data generation to replicate your existing relational databases while ensuring referential integrity.

    You can find additional examples and use cases at YData Academy GitHub Repository.

    "},{"location":"#support","title":"\ud83d\ude4b Support","text":"

    Facing an issue? We\u2019re committed to providing all the support you need to ensure a smooth experience using Fabric:

    • Create a support ticket: our team will help you move forward!
    • Contact a Fabric specialist: for personalized guidance or full access to the platform.
    "},{"location":"examples/synthesize_tabular_data/","title":"Synthesize tabular data","text":"

    Use YData's RegularSynthesizer to generate tabular synthetic data

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('census')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset\nsynth.fit(X)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample.shape)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"examples/synthesize_timeseries_data/","title":"Synthesize time-series data","text":"

    Use YData's TimeSeriesSynthesizer to generate time-series synthetic data

    Tabular data is the most common type of data we encounter in data problems.

    When thinking about tabular data, we assume independence between different records, but this does not happen in reality. If we look at events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood, we end up with datasets where measures and records evolve and are related through time. This type of data is known as sequential or time-series data.

    Thus, sequential or time-series data refers to any data containing elements ordered into sequences in a structured format. Dissecting any time-series dataset, we see differences in the variables' behavior that need to be understood for effective generation of synthetic data. Typically, any time-series dataset is composed of the following:

    • Variables that define the order of time (these can be simple with one variable or composed)
    • Time-variant variables
    • Variables that refer to entities (single or multiple entities)
    • Variables that are attributes (those that don't depend on time but rather on the entity)

    An example is shown below:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import TimeSeriesSynthesizer\n# Do not forget to add your token as env variable\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\nX = get_dataset('occupancy')\n# We initialize a time series synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = TimeSeriesSynthesizer()\n# We train the synthesizer on our dataset\n# sortbykey -> variable that define the time order for the sequence\nsynth.fit(X, sortbykey='date')\n# By default it is requested a synthetic sample with the same length as the original data\n# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected\nsample = synth.sample(n_entities=1)\n
    "},{"location":"examples/synthesize_with_anonymization/","title":"Anonymization","text":"

    YData Synthesizers offers a way to anonymize sensitive information such that the original values are not present in the synthetic data but replaced by fake values.

    Does the model retain the original values?

    No! The anonymization is performed before the model training such that it never sees the original values.

    The anonymization is performed by specifying which columns need to be anonymized and how to perform the anonymization. The anonymization rules are defined as a dictionary with the following format:

    {column_name: anonymization_rule}

    While there are some predefined anonymization rules such as name, email, and company, it is also possible to create a rule using a regular expression. The anonymization rules have to be passed to a synthesizer in its fit method using the parameter anonymize.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing, anonymization and privacy controls are complementary.

    The example below demonstrates how to anonymize the column Name with fake names and the column Ticket with a regular expression:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('titanic')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We define anonymization rules, which is a dictionary with format:\n# {column_name: anonymization_rule, ...}\n# while here are some predefined anonymization rules like: name, email, company\n# it is also possible to create a rule using a regular expression\nrules = {\n\"Name\": \"name\",\n\"Ticket\": \"[A-Z]{2}-[A-Z]{4}\"\n}\n# We train the synthesizer on our dataset\nsynth.fit(\nX,\nname=\"titanic_synthesizer\",\nanonymize=rules\n)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample[[\"Name\", \"Ticket\"]].head(3))\nif __name__ == \"__main__\":\nmain()\n

    "},{"location":"examples/synthesize_with_conditional_sampling/","title":"Conditional sampling","text":"

    YData Synthesizers support conditional sampling. The fit method has an optional parameter named condition_on, which receives a list of features to condition upon. Furthermore, the sample method receives the conditions to be applied through another optional parameter also named condition_on. For now, two types of conditions are supported:

    • Condition upon a categorical (or string) feature. The parameters are the name of the feature and a list of values (i.e., categories) to be considered. Each category also has its percentage of representativeness. For example, if we want to condition upon two categories, we need to define the percentage of rows each of these categories will have on the synthetic dataset. Naturally, the sum of such percentages needs to be 1. The default percentage is also 1 since it is the required value for a single category.
    • Condition upon a numerical feature. The parameters are the name of the feature and the minimum and maximum of the range to be considered. This feature will present a uniform distribution on the synthetic dataset, limited by the specified range.

    The example below demonstrates how to train and sample from a synthesizer using conditional sampling:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Do not forget to add your token as env variables.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined.\ndef main():\n\"\"\"In this example, we demonstrate how to train and\n    sample from a synthesizer using conditional sampling.\"\"\"\nX = get_dataset('census')\n# We initialize a regular synthesizer.\n# As long as the synthesizer does not call `fit`, it exists only locally.\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset setting\n# the features to condition upon.\nsynth.fit(\nX,\nname=\"census_synthesizer\",\ncondition_on=[\"sex\", \"native-country\", \"age\"]\n)\n# We request a synthetic dataset with specific condition rules.\nsample = synth.sample(\nn_samples=500,\ncondition_on={\n\"sex\": {\n\"categories\": [\"Female\"]\n},\n\"native-country\": {\n\"categories\": [(\"United-States\", 0.6),\n(\"Mexico\", 0.4)]\n},\n\"age\": {\n\"minimum\": 55,\n\"maximum\": 60\n}\n}\n)\nprint(sample)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"examples/synthesize_with_privacy_control/","title":"Privacy control","text":"

    YData Synthesizers offers 3 different levels of privacy:

    1. high privacy: the model is optimized for privacy purposes,
    2. high fidelity (default): the model is optimized for high fidelity,
    3. balanced: tradeoff between privacy and fidelity.

    The default privacy level is high fidelity. The privacy level can be changed by the user at the moment a synthesizer is trained, using the parameter privacy_level. The parameter expects a PrivacyLevel value.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing, anonymization and privacy controls are complementary.

    The example below demonstrates how to train a synthesizer configured for high privacy:

    import os\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\ndef main():\n\"\"\"In this example, we demonstrate how to train a synthesizer\n    with a high-privacy setting from a pandas DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\nX = get_dataset('titanic')\n# We initialize a regular synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = RegularSynthesizer()\n# We train the synthesizer on our dataset setting the privacy level to high\nsynth.fit(\nX,\nname=\"titanic_synthesizer\",\nprivacy_level=PrivacyLevel.HIGH_PRIVACY\n)\n# We request a synthetic dataset with 50 rows\nsample = synth.sample(n_samples=50)\nprint(sample)\nif __name__ == \"__main__\":\nmain()\n
    "},{"location":"examples/synthesizer_multitable/","title":"Synthesize Multi Table","text":"

    Use YData's MultiTableSynthesizer to generate multi table synthetic data from multiple RDBMS tables

    Multi table is the way to synthesize data from multiple tables of a database, with the relational schema in mind.

    Quickstart example:

    import os\nfrom ydata.sdk.datasources import DataSource\nfrom ydata.sdk.synthesizers import MultiTableSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n# In this example, we demonstrate how to train a synthesizer from an existing multi table RDBMS datasource.\n# After training a Multi Table Synthesizer, we request a sample.\n# In this case, we don't return the Dataset for the sample, it will be saved in the database\n# that the connector refers to.\nX = DataSource.get('<DATASOURCE_UID>')\n# Initialize a multi table synthesizer with the connector to write to\n# As long as the synthesizer does not call `fit`, it exists only locally\n# write_connector can be a UID or a Connector instance\nsynth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')\n# The synthesizer training is requested\nsynth.fit(X)\n# We request a synthetic dataset with a fraction of 1.5\nsynth.sample(frac=1.5)\n

    Sample write connector overriding example:

    import os\nfrom ydata.sdk.connectors import Connector\nfrom ydata.sdk.datasources import DataSource\nfrom ydata.sdk.synthesizers import MultiTableSynthesizer\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n# In this example, we demonstrate how to train a synthesizer from an existing multi table RDBMS datasource.\n# After training a Multi Table Synthesizer, we request a sample.\n# In this case, we don't return the Dataset for the sample, it will be saved in the database\n# that the connector refers to.\nX = DataSource.get('<DATASOURCE_UID>')\n# For demonstration purposes, we will use a connector instance, but you can just send the UID\nwrite_connector = Connector.get('<CONNECTOR_UID>')\n# Initialize a multi table synthesizer with the connector to write to\n# As long as the synthesizer does not call `fit`, it exists only locally\n# write_connector can be a UID or a Connector instance\nsynth = MultiTableSynthesizer(write_connector=write_connector)\n# The synthesizer training is requested\nsynth.fit(X)\n# We request a synthetic dataset with a fraction of 1.5\n# In this case we use a Connector instance.\n# You can just use the <CONNECTOR_UID>; you don't need to get the connector upfront.\nsynth.sample(frac=1.5, write_connector=write_connector)\n
    "},{"location":"get-started/","title":"Get started with Fabric","text":"

    This get-started guide is here to help you if you are not yet familiar with YData Fabric, or if you just want to learn more about data quality, data preparation workflows and how you can start leveraging synthetic data. See also: YData Fabric Community.

    "},{"location":"get-started/#create-your-first-data-with-the-data-catalog","title":"\ud83d\udcda Create your first Data with the Data Catalog","text":""},{"location":"get-started/#create-your-first-synthetic-data-generator","title":"\u2699\ufe0f Create your first Synthetic Data generator","text":""},{"location":"get-started/#create-your-first-lab","title":"\ud83e\uddea Create your first Lab","text":""},{"location":"get-started/#create-your-first-data-pipeline","title":"\ud83c\udf00 Create your first data Pipeline","text":""},{"location":"get-started/create_lab/","title":"How to create your first Lab environment","text":"

    Labs are code environments for a more flexible development of data-driven solutions, while leveraging Fabric capabilities combined with already loved tools such as scikit-learn, numpy and pandas. To create your first Lab, you can use the \u201cCreate Lab\u201d button from Fabric\u2019s home, or you can access it from the Labs module by selecting it on the left side menu and clicking the \u201cCreate Lab\u201d button.

    Next, a menu with different IDEs will be shown. As a quickstart, select Jupyter Lab. As Labs are development environments, you will also be asked which language you would prefer your environment to support: R or Python. Select Python.

    Select IDE Select language

    Bundles are environments with pre-installed packages. Select YData bundle, so we can leverage some other Fabric features such as Data Profiling, Synthetic Data and Pipelines.

    As a last step, you will be asked to configure the infrastructure resources for this new environment, as well as give it a Display Name. We will keep the defaults, but you have the flexibility to select GPU acceleration or to request more computational resources for your developments.

    Finally, your Lab will be created and added to the \"Labs\" list, as per the image below. The status of the lab will be \ud83d\udfe1 while preparing, and this process takes a few minutes, as the infrastructure is being allocated to your development environment. As soon as the status changes to \ud83d\udfe2, you can open your lab by clicking on the button as shown below:

    Create a new notebook in the JupyterLab and give it a name. You are now ready to start your developments!

    Create a new notebook Notebook created

    Congrats! \ud83d\ude80 You have now successfully created your first Lab, a code environment, so you can benefit from the most advanced Fabric features as well as compose complex data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_pipeline/","title":"How to create your first Pipeline","text":"

    Check this quickstart video on how to create your first Pipeline.

    The best way to get started with Pipelines is to use the interactive Pipeline editor available in the Labs with Jupyter Lab set as IDE. If you don't have a Lab yet, or you don't know how to create one, check our quickstart guide on how to create your first lab.

    Open an already existing lab.

    A Pipeline comprises one or more nodes that are connected (or not!) with each other to define execution dependencies. Each pipeline node is, and should be, implemented as a component that manages a single task, such as reading the data, profiling the data, training a model, or even publishing a model to production environments.

    In this tutorial we will build a simple and generic pipeline that uses a Dataset from Fabric's Data Catalog and profiles it to check its quality. The notebook templates are already available; to use them, you need to access the \"Academy\" folder as per the image below.

    Make sure to copy all the files in the folder \"3 - Pipelines/quickstart\" to the root folder of your lab, as per the image below.

    Now that we have our notebooks, we need to make a small change in the notebook \"1. Read dataset\". Go back to your Data Catalog, and from one of the datasets in your Catalog list, select the three vertical dots and click on \"Explore in Labs\" as shown in the image below.

    The following screen will be shown. Click on copy.

    Now that we have copied the code, let's get back to our \"1. Read data.ipynb\" notebook and replace the first code cell with the new code. This will allow us to use a dataset from the Data Catalog in our pipeline.

    Placeholder code Replaced with code snippet

    With our notebooks ready, we can now configure our Pipeline. For this quickstart we will be leveraging an already existing pipeline - double-click the file my_first_pipeline.pipeline. You should see a pipeline as depicted in the images below. To create a new Pipeline, you can open the lab launcher tab and select \"Pipeline Editor\".

    Open Pipeline My first pipeline

    Before running the pipeline, we need to check each component/step properties and configurations. Right-click each one of the steps, select \"Open Properties\", and a menu will appear on your right side. Make sure that you have \"YData - CPU\" selected as the Runtime Image, as shown below.

    Open properties Runtime image

    We are now ready to create and run our first pipeline. In the top left corner of the pipeline editor, the run button will be available for you to click.

    Accept the default values shown in the run dialog and start the run.

    If the following message is shown, it means that you have created a run of your first pipeline.

    Now that you have created your first pipeline, you can select the Pipeline from Fabric's left side menu.

    Your most recent pipeline will be listed, as shown in the image below.

    To check the run of your pipeline, jump into the \"Run\" tab. You will be able to see your first pipeline running!

    By clicking on top of the record you will be able to see the progress of the run step-by-step, and visualize the outputs of each and every step by clicking on each step and selecting the Visualizations tab.

    Congrats! \ud83d\ude80 You have now successfully created your first Pipeline in a code environment, so you can benefit from Fabric's orchestration engine to create scalable, versionable and comparable data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_syntheticdata_generator/","title":"How to create your first Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Synthetic Data generator.

    To generate your first synthetic data, you need to have a Dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your first dataset created, you are now able to start the creation of your Synthetic Data generator. You can either select \"Synthetic Data\" from your left side menu, or you can select \"Create Synthetic Data\" in your project Home as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the columns you'd like to include in the synthesis process, validating their Variable and Data Types.

    Data types are relevant for synthetic data quality

    Data Types should be revisited and aligned with the objectives for the synthetic data, as they can highly impact the quality of the generated data. For example, let's say we have a column \"Name\": while in some situations it would make sense to consider it a String, in a dataset where \"Name\" refers to the name of the product purchased, it might be more beneficial to set it as a Category.

    Finally, the last step of our process is the Synthetic Data specific configuration. For this particular case we only need to define a Display Name, and we can finish the process by clicking on the \"Save\" button as per the image below.

    Your Synthetic Data generator is now training and listed under \"Synthetic Data\". While the model is being trained, the Status will be \ud83d\udfe1; as soon as the training is completed successfully, it will transition to \ud83d\udfe2, as per the image below.

    Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even download a PDF report with a comprehensive overview of your Synthetic Data Quality Metrics. Next, you can generate synthetic data samples by accessing the Generation tab or clicking on \"Go to Generation\".

    In this section, you are able to generate as many synthetic samples as you want. For that, you need to define the number of rows to generate and click \"Generate\", as depicted in the image below.

    A new line in your \"Sample History\" will be shown, and as soon as the sample generation is completed you will be able to \"Compare\" your synthetic data with the original data, add it as a Dataset with \"Add to Data Catalog\" and, last but not least, download it as a file with \"Download csv\".

    Congrats! \ud83d\ude80 You have now successfully created your first Synthetic Data generator with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/fabric_community/","title":"Get started with Fabric Community","text":"

    Fabric Community is a SaaS version that allows you to explore all the functionalities of Fabric first-hand: free, forever, for everyone. You\u2019ll be able to validate your data quality with automated profiling, unlock data sharing and improve your ML models with synthetic data, and increase your productivity with seamless integration:

    • Build 1 personal project;
    • Create your first Data Catalog and benefit from automated data profiling;
    • Train and generate synthetic data up to 2 models and datasets with 50 columns and 100K rows;
    • Optimize synthetic data quality for your use cases with an evaluation PDF report;
    • Create 1 development environment (Labs) and integrate it with your familiar ML packages and workflows.
    "},{"location":"get-started/fabric_community/#register","title":"Register","text":"

    To register for Fabric Community:

    • Access the Fabric Community Try Now and create your YData account by submitting the form
    • Check your email for your login credentials
    • Log in to fabric.ydata.ai and enjoy!

    Once you login, you'll access the Home page and get started with your data preparation!

    "},{"location":"get-started/upload_csv/","title":"How to create your first Dataset from a CSV file","text":"

    Check this quickstart video on how to create your first Dataset from a CSV file.

    To create your first dataset in the Data Catalog, you can start by clicking on \"Add Dataset\" from the Home section, or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that, the modal below will be shown. You will need to select a connector. To upload a CSV file, we need to select \u201cUpload CSV\u201d.

    Once you've selected the \u201cUpload CSV\u201d connector, a new screen will appear, enabling you to upload your file and designate a name for your connector. This file upload connector will subsequently empower you to create one or more datasets from the same file at a later stage.

    Loading area Upload csv file

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Separator: This is an important parameter to make sure that we can parse your CSV correctly. The default value is \u201c,\u201d.
    • Data Type: Whether your dataset contains tabular or time-series (i.e., containing temporal dependency) data.

    Your created Connector (\u201cCensus File\u201d) and Dataset (\u201cCensus\u201d) will be added to the Data Catalog. As soon as the status is green, you can navigate your Dataset. Click on Open Dataset as per the image below.

    Within the Dataset details, you can gain valuable insights through our automated data quality profiling. This includes comprehensive metadata and an overview of your data, encompassing details like row count, identification of duplicates, and insights into the overall quality of your dataset.

    Or perhaps you want to further explore, through visualization, the profile of your data with both univariate and multivariate views of your data.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Dataset in Fabric\u2019s Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"sdk/","title":"Overview","text":"

    YData SDK for improved data quality everywhere!

    ydata-sdk is here! Create a YData account so you can start using it today!

    Create account

    "},{"location":"sdk/#overview","title":"Overview","text":"

    The YData SDK is an ecosystem of methods that allows users to, through a Python interface, adopt a Data-Centric approach towards AI development. The solution includes a set of integrated components for data ingestion, standardized data quality evaluation and data improvement, such as synthetic data generation, allowing an iterative improvement of the datasets used in high-impact business applications.

    Synthetic data can be used as a Machine Learning performance enhancer, to augment datasets or to mitigate the presence of bias in real data. Furthermore, it can be used as a Privacy Enhancing Technology, to enable data-sharing initiatives or even to fuel testing environments.

    Under the YData-SDK hood, you can find a set of algorithms and metrics based on statistics and deep learning techniques that will help you accelerate your data preparation.

    "},{"location":"sdk/#current-functionality","title":"Current functionality","text":"

    YData SDK is currently composed of the following main modules:

    • Datasources

      • YData\u2019s SDK includes several connectors for easy integration with existing data sources. It supports several storage types, like filesystems and RDBMS. Check the list of connectors.
      • SDK\u2019s Datasources run on top of Dask, which allows it to deal with not only small workloads but also larger volumes of data.
    • Synthesizers

      • Simplified interface to train a generative model and learn in a data-driven manner the behavior, the patterns and original data distribution. Optimize your model for privacy or utility use-cases.
      • From a trained synthesizer, you can generate synthetic samples as needed and parametrise the number of records needed.
      • Anonymization and privacy preserving capabilities to ensure that synthetic datasets do not contain Personally Identifiable Information (PII) and can safely be shared!
      • Conditional sampling can be used to restrict the domain and values of specific features in the sampled data.
    • Synthetic data quality report Coming soon

      • An extensive synthetic data quality report that measures 3 dimensions: privacy, utility and fidelity of the generated data. The report can be downloaded in PDF format for ease of sharing and compliance purposes or as a JSON to enable the integration in data flows.
    • Profiling Coming soon

      • A set of metrics and algorithms summarizes datasets quality in three main dimensions: warnings, univariate analysis and a multivariate perspective.
    "},{"location":"sdk/#supported-data-formats","title":"Supported data formats","text":"TabularTime-SeriesTransactionalRelational databases

    The RegularSynthesizer is perfect to synthesize high-dimensional, time-independent data with high quality results.

    Know more

    The TimeSeriesSynthesizer is perfect to synthesize both regularly and irregularly spaced time-series, from smart-sensor readings to stock prices.

    Know more

    The TimeSeriesSynthesizer supports transactional data, known to have highly irregular time intervals between records and directional relations between entities.

    Coming soon

    Know more

    The MultiTableSynthesizer is perfect to learn how to replicate the data within a relational database schema.

    Coming soon

    Know more

    "},{"location":"sdk/installation/","title":"Installation","text":"

    YData SDK is generally available through both PyPI and Conda, allowing an easy installation process. This experience allows combining YData SDK with other packages such as Pandas, Numpy or Scikit-Learn.

    YData SDK is available to the public through a token-based authentication system. If you don\u2019t have a token yet, you can get your free license key during the installation process. You can check what features are available in the free version here.

    "},{"location":"sdk/installation/#installing-the-package","title":"Installing the package","text":"

    YData SDK supports Python versions greater than 3.8 and can be installed on Windows, Linux or macOS operating systems.

    Prior to the package installation, it is recommended to create a virtual or conda environment:

    pyenv
    pyenv virtualenv 3.10 ydatasdk\n

    Then install ydata-sdk:

    pypi
    pip install ydata-sdk\n
    "},{"location":"sdk/installation/#authentication","title":"Authentication","text":"

    Once you've installed the ydata-sdk package, you will need a token to run its functionalities. YData SDK uses a token-based authentication system. To get access to your token, you need to create a YData account.

    YData SDK offers a free-trial and an enterprise version. To access your free-trial token, you need to create a YData account.

    The token will be available here, after login:

    With your account token copied, you can set a new environment variable YDATA_TOKEN at the beginning of your development session.

        import os\nos.environ['YDATA_TOKEN'] = '{add-your-token}'\n

    Once you have set your token, you are ready to start exploring the incredible world of data-centric AI and smart synthetic data generation!

    Check out our quickstart guide!

    "},{"location":"sdk/quickstart/","title":"Quickstart","text":"

    YData SDK allows you, with an easy and familiar interface, to adopt a Data-Centric AI approach for the development of Machine Learning solutions. YData SDK features were designed to support structured data, including tabular data, time-series and transactional data.

    "},{"location":"sdk/quickstart/#read-data","title":"Read data","text":"

    To start leveraging the package features, you should consume your data either through the Connectors or a pandas.DataFrame. The list of available connectors can be found here [add a link].

    From pandas dataframeFrom a connector
        # Example for a Google Cloud Storage Connector\nfrom ydata.sdk.connectors import Connector\nfrom ydata.sdk.datasources import DataSource\ncredentials = \"{insert-credentials-file-path}\"\n# We create a new connector for Google Cloud Storage\nconnector = Connector(connector_type='gcs', credentials=credentials)\n# Create a Datasource from the connector\n# Note that a connector can be re-used for several datasources\nX = DataSource(connector=connector, path='gs://<my_bucket>.csv')\n
        import pandas as pd\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Load a small dataset\nX = pd.read_csv('{insert-file-path.csv}')\n# Init a synthesizer\nsynth = RegularSynthesizer()\n# Train the synthesizer with the pandas Dataframe as input\n# The data is then sent to the cluster for processing\nsynth.fit(X)\n

    The synthesis process returns a pandas.DataFrame object. Note that if you are using the ydata-sdk free version, all of your data is sent to a remote cluster on YData's infrastructure.

    "},{"location":"sdk/quickstart/#data-synthesis-flow","title":"Data synthesis flow","text":"

    The process of data synthesis can be described in the following steps:

    stateDiagram-v2\n  state read_data\n  read_data --> init_synth\n  init_synth --> train_synth\n  train_synth --> generate_samples\n  generate_samples --> [*]

    The code snippet below shows how easy it can be to start generating new synthetic data. The package includes a set of example datasets for a quickstart.

        from ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# Read the example data\nX = get_dataset('census')\n# Init a synthesizer\nsynth = RegularSynthesizer()\n# Fit the synthesizer to the input data\nsynth.fit(X)\n# Sample new synthetic data. The request below asks for 1000 new synthetic rows\nsynth.sample(n_samples=1000)\n

    Do I need to prepare my data before synthesis?

    The SDK ensures that the original behaviour is replicated. For that reason, there is no need to preprocess outlier observations or missing data.

    By default all the missing data is replicated as NaN.
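
    As a minimal sketch of this behaviour (assuming the YDATA_TOKEN environment variable is already set; the 'titanic' example dataset contains missing values, for instance in the Age column):

    from ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n# The 'titanic' example dataset contains missing values\nX = get_dataset('titanic')\nsynth = RegularSynthesizer()\nsynth.fit(X)\nsample = synth.sample(n_samples=100)\n# Missing values are replicated as NaN in the synthetic sample\nprint(sample.isna().sum())\n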

    "},{"location":"sdk/modules/connectors/","title":"Connectors","text":"

    YData SDK allows users to consume data assets from remote storages through Connectors. YData Connectors support different types of storages, from filesystems to RDBMSs.

    Below is the list of available connectors:

    Connector Name | Type | Supported File Types | Useful Links | Notes
    AWS S3 | Remote object storage | CSV, Parquet | https://aws.amazon.com/s3/ |
    Google Cloud Storage | Remote object storage | CSV, Parquet | https://cloud.google.com/storage |
    Azure Blob Storage | Remote object storage | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/blobs/ |
    File Upload | Local | CSV | - | Maximum file size is 220MB. Bigger files should be uploaded and read from remote object storages
    MySQL | RDBMS | Not applicable | https://www.mysql.com/ | Supports reading whole schemas or specifying a query
    Azure SQL Server | RDBMS | Not applicable | https://azure.microsoft.com/en-us/services/sql-database/campaign/ | Supports reading whole schemas or specifying a query
    PostgreSQL | RDBMS | Not applicable | https://www.postgresql.org/ | Supports reading whole schemas or specifying a query
    Snowflake | RDBMS | Not applicable | https://docs.snowflake.com/en/sql-reference-commands | Supports reading whole schemas or specifying a query
    Google BigQuery | Data warehouse | Not applicable | https://cloud.google.com/bigquery |
    Azure Data Lake | Data lake | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/data-lake-storage/ |

    More details can be found at the Connectors API Reference Docs.
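
    As a hedged sketch of how one of these connectors could be created from the SDK (the connector name and credentials file path below are illustrative; the exact credential fields depend on the connector type):

    from ydata.sdk.connectors import Connector\n# Create an AWS S3 connector from a credentials file\nconnector = Connector.create(connector_type='aws-s3', credentials='{insert-credentials-file-path}', name='my-s3-connector')\nprint(connector.uid, connector.type)\n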

    "},{"location":"sdk/modules/synthetic_data/","title":"Synthetic data generation","text":""},{"location":"sdk/modules/synthetic_data/#data-formats","title":"Data formats","text":""},{"location":"sdk/modules/synthetic_data/#tabular-data","title":"Tabular data","text":""},{"location":"sdk/modules/synthetic_data/#time-series-data","title":"Time-series data","text":""},{"location":"sdk/modules/synthetic_data/#transactions-data","title":"Transactions data","text":""},{"location":"sdk/modules/synthetic_data/#best-practices","title":"Best practices","text":""},{"location":"sdk/reference/api/common/client/","title":"Get client","text":"

    Deduce how to initialize or retrieve the client.

    This is meant to be a zero configuration for the user.

    Create and set a client globally
    from ydata.sdk.client import get_client\nget_client(set_as_global=True)\n

    Parameters:

    Name Type Description Default client_or_creds Optional[Union[Client, dict, str, Path]]

    Client to forward or credentials for initialization

    None set_as_global bool

    If True, set client as global

    False wait_for_auth bool

    If True, wait for the user to authenticate

    True

    Returns:

    Type Description Client

    Client instance

    Source code in ydata/sdk/common/client/utils.py
    def get_client(client_or_creds: Optional[Union[Client, Dict, str, Path]] = None, set_as_global: bool = False, wait_for_auth: bool = True) -> Client:\n\"\"\"Deduce how to initialize or retrieve the client.\n    This is meant to be a zero configuration for the user.\n    Example: Create and set a client globally\n            ```py\n            from ydata.sdk.client import get_client\n            get_client(set_as_global=True)\n            ```\n    Args:\n        client_or_creds (Optional[Union[Client, dict, str, Path]]): Client to forward or credentials for initialization\n        set_as_global (bool): If `True`, set client as global\n        wait_for_auth (bool): If `True`, wait for the user to authenticate\n    Returns:\n        Client instance\n    \"\"\"\nclient = None\nglobal WAITING_FOR_CLIENT\ntry:\n# If a client instance is set globally, return it\nif not set_as_global and Client.GLOBAL_CLIENT is not None:\nreturn Client.GLOBAL_CLIENT\n# Client exists, forward it\nif isinstance(client_or_creds, Client):\nreturn client_or_creds\n# Explicit credentials\n''' # For the first version, we deactivate explicit credentials via string or file for env var only\n        if isinstance(client_or_creds, (dict, str, Path)):\n            if isinstance(client_or_creds, str):  # noqa: SIM102\n                if Path(client_or_creds).is_file():\n                    client_or_creds = Path(client_or_creds)\n            if isinstance(client_or_creds, Path):\n                client_or_creds = json.loads(client_or_creds.open().read())\n            return Client(credentials=client_or_creds)\n        # Last try with environment variables\n        #if client_or_creds is None:\n        client = _client_from_env(wait_for_auth=wait_for_auth)\n        '''\ncredentials = environ.get(TOKEN_VAR)\nif credentials is not None:\nclient = Client(credentials=credentials)\nexcept ClientHandshakeError as e:\nwait_for_auth = False  # For now deactivate wait_for_auth until the backend is ready\nif wait_for_auth:\nWAITING_FOR_CLIENT = True\nstart = time()\nlogin_message_printed = False\nwhile client is None:\nif not login_message_printed:\nprint(\nf\"The token needs to be refreshed - please validate your token by browsing at the following URL:\\n\\n\\t{e.auth_link}\")\nlogin_message_printed = True\nwith suppress(ClientCreationError):\nsleep(BACKOFF)\nclient = get_client(wait_for_auth=False)\nnow = time()\nif now - start > CLIENT_INIT_TIMEOUT:\nWAITING_FOR_CLIENT = False\nbreak\nif client is None and not WAITING_FOR_CLIENT:\nsys.tracebacklimit = None\nraise ClientCreationError\nreturn client\n

    Main Client class used to abstract the connection to the backend.

    A normal user should not have to instantiate a Client by itself. However, in the future it will be useful for power-users to manage projects and connections.

    Parameters:

    Name Type Description Default credentials Optional[dict]

    (optional) Credentials to connect

    None project Optional[Project]

    (optional) Project to connect to. If not specified, the client will connect to the default user's project.

    None Source code in ydata/sdk/common/client/client.py
    @typechecked\nclass Client(metaclass=SingletonClient):\n\"\"\"Main Client class used to abstract the connection to the backend.\n    A normal user should not have to instanciate a [`Client`][ydata.sdk.common.client.Client] by itself.\n    However, in the future it will be useful for power-users to manage projects and connections.\n    Args:\n        credentials (Optional[dict]): (optional) Credentials to connect\n        project (Optional[Project]): (optional) Project to connect to. If not specified, the client will connect to the default user's project.\n    \"\"\"\ncodes = codes\ndef __init__(self, credentials: Optional[Union[str, Dict]] = None, project: Optional[Project] = None, set_as_global: bool = False):\nself._base_url = environ.get(\"YDATA_BASE_URL\", DEFAULT_URL)\nself._scheme = 'https'\nself._headers = {'Authorization': credentials}\nself._http_client = httpClient(\nheaders=self._headers, timeout=Timeout(10, read=None))\nself._handshake()\nself._default_project = project or self._get_default_project(credentials)\nif set_as_global:\nself.__set_global()\ndef post(\nself, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,\nproject: Optional[Project] = None, files: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"POST request to the backend.\n        Args:\n            endpoint (str): POST endpoint\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(\nendpoint, data=data, json=json, files=files, project=project)\nresponse = self._http_client.post(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef get(\nself, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\ncookies: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"GET request to the backend.\n        Args:\n            endpoint (str): GET endpoint\n            cookies (Optional[dict]): (optional) cookies data\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(endpoint, params=params,\ncookies=cookies, project=project)\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef get_static_file(\nself, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"Retrieve a static file from the backend.\n        Args:\n            endpoint (str): GET endpoint\n            raise_for_status (bool): raise an exception on error\n        Returns:\n            Response object\n        \"\"\"\nurl_data = self.__build_url(endpoint, project=project)\nurl_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\ndef _handshake(self):\n\"\"\"Client handshake.\n        It is used to determine is the client can connect with its\n        current authorization token.\n        \"\"\"\nresponse = self.get('/profiles', params={}, 
raise_for_status=False)\nif response.status_code == Client.codes.FOUND:\nparser = LinkExtractor()\nparser.feed(response.text)\nraise ClientHandshakeError(auth_link=parser.link)\ndef _get_default_project(self, token: str):\nresponse = self.get('/profiles/me', params={}, cookies={'access_token': token})\ndata: Dict = response.json()\nreturn data['myWorkspace']\ndef __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\njson: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\ncookies: Optional[Dict] = None) -> Dict:\n\"\"\"Build a request for the backend.\n        Args:\n            endpoint (str): backend endpoint\n            params (Optional[dict]): URL parameters\n            data (Optional[Project]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            cookies (Optional[dict]): (optional) cookies data\n        Returns:\n            dictionary containing the information to perform a request\n        \"\"\"\n_params = params if params is not None else {\n'ns': project or self._default_project\n}\nurl_data = {\n'url': f'{self._scheme}://{self._base_url}/api{endpoint}',\n'headers': self._headers,\n'params': _params,\n}\nif data is not None:\nurl_data['data'] = data\nif json is not None:\nurl_data['json'] = json\nif files is not None:\nurl_data['files'] = files\nif cookies is not None:\nurl_data['cookies'] = cookies\nreturn url_data\ndef __set_global(self) -> None:\n\"\"\"Sets a client instance as global.\"\"\"\n# If the client is stateful, close it gracefully!\nClient.GLOBAL_CLIENT = self\ndef __raise_for_status(self, response: Response) -> None:\n\"\"\"Raise an exception if the response is not OK.\n        When an exception is raised, we try to convert it to a ResponseError which is\n        a wrapper around a backend error. This usually gives enough context and provides\n        nice error message.\n        If it cannot be converted to ResponseError, it is re-raised.\n        Args:\n            response (Response): response to analyze\n        \"\"\"\ntry:\nresponse.raise_for_status()\nexcept HTTPStatusError as e:\nwith suppress(Exception):\ne = ResponseError(**response.json())\nraise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__build_url","title":"__build_url(endpoint, params=None, data=None, json=None, project=None, files=None, cookies=None)","text":"

    Build a request for the backend.

    Parameters:

    Name Type Description Default endpoint str

    backend endpoint

    required params Optional[dict]

    URL parameters

    None data Optional[Project]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None cookies Optional[dict]

    (optional) cookies data

    None

    Returns:

    Type Description Dict

    dictionary containing the information to perform a request

    Source code in ydata/sdk/common/client/client.py
    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\njson: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\ncookies: Optional[Dict] = None) -> Dict:\n\"\"\"Build a request for the backend.\n    Args:\n        endpoint (str): backend endpoint\n        params (Optional[dict]): URL parameters\n        data (Optional[Project]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        cookies (Optional[dict]): (optional) cookies data\n    Returns:\n        dictionary containing the information to perform a request\n    \"\"\"\n_params = params if params is not None else {\n'ns': project or self._default_project\n}\nurl_data = {\n'url': f'{self._scheme}://{self._base_url}/api{endpoint}',\n'headers': self._headers,\n'params': _params,\n}\nif data is not None:\nurl_data['data'] = data\nif json is not None:\nurl_data['json'] = json\nif files is not None:\nurl_data['files'] = files\nif cookies is not None:\nurl_data['cookies'] = cookies\nreturn url_data\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__raise_for_status","title":"__raise_for_status(response)","text":"

    Raise an exception if the response is not OK.

    When an exception is raised, we try to convert it to a ResponseError which is a wrapper around a backend error. This usually gives enough context and provides a nice error message.

    If it cannot be converted to ResponseError, it is re-raised.

    Parameters:

    Name Type Description Default response Response

    response to analyze

    required Source code in ydata/sdk/common/client/client.py
    def __raise_for_status(self, response: Response) -> None:\n\"\"\"Raise an exception if the response is not OK.\n    When an exception is raised, we try to convert it to a ResponseError which is\n    a wrapper around a backend error. This usually gives enough context and provides\n    nice error message.\n    If it cannot be converted to ResponseError, it is re-raised.\n    Args:\n        response (Response): response to analyze\n    \"\"\"\ntry:\nresponse.raise_for_status()\nexcept HTTPStatusError as e:\nwith suppress(Exception):\ne = ResponseError(**response.json())\nraise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__set_global","title":"__set_global()","text":"

    Sets a client instance as global.

    Source code in ydata/sdk/common/client/client.py
    def __set_global(self) -> None:\n\"\"\"Sets a client instance as global.\"\"\"\n# If the client is stateful, close it gracefully!\nClient.GLOBAL_CLIENT = self\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get","title":"get(endpoint, params=None, project=None, cookies=None, raise_for_status=True)","text":"

    GET request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required cookies Optional[dict]

    (optional) cookies data

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get(\nself, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\ncookies: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"GET request to the backend.\n    Args:\n        endpoint (str): GET endpoint\n        cookies (Optional[dict]): (optional) cookies data\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(endpoint, params=params,\ncookies=cookies, project=project)\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get_static_file","title":"get_static_file(endpoint, project=None, raise_for_status=True)","text":"

    Retrieve a static file from the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get_static_file(\nself, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"Retrieve a static file from the backend.\n    Args:\n        endpoint (str): GET endpoint\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(endpoint, project=project)\nurl_data['url'] = f'{self._scheme}://{self._base_url}/static-content{endpoint}'\nresponse = self._http_client.get(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.post","title":"post(endpoint, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    POST request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def post(\nself, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,\nproject: Optional[Project] = None, files: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n\"\"\"POST request to the backend.\n    Args:\n        endpoint (str): POST endpoint\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n    Returns:\n        Response object\n    \"\"\"\nurl_data = self.__build_url(\nendpoint, data=data, json=json, files=files, project=project)\nresponse = self._http_client.post(**url_data)\nif response.status_code != Client.codes.OK and raise_for_status:\nself.__raise_for_status(response)\nreturn response\n
    "},{"location":"sdk/reference/api/common/types/","title":"Types","text":""},{"location":"sdk/reference/api/connectors/connector/","title":"Connector","text":"

    Bases: ModelFactoryMixin

    A Connector allows you to connect to and access data stored in various places. The list of available connectors can be found here.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    None credentials dict

    Connector credentials

    None name Optional[str]

    (optional) Connector name

    None client Client

    (optional) Client to connect to the backend

    None

    Attributes:

    Name Type Description uid UID

    UID of the connector instance (created internally)

    type ConnectorType

    Type of the connector

    Source code in ydata/sdk/connectors/connector.py
    class Connector(ModelFactoryMixin):\n\"\"\"A [`Connector`][ydata.sdk.connectors.Connector] allows to connect and\n    access data stored in various places. The list of available connectors can\n    be found [here][ydata.sdk.connectors.ConnectorType].\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        client (Client): (optional) Client to connect to the backend\n    Attributes:\n        uid (UID): UID fo the connector instance (creating internally)\n        type (ConnectorType): Type of the connector\n    \"\"\"\ndef __init__(self, connector_type: Union[ConnectorType, str] = None, credentials: Optional[Dict] = None,  name: Optional[str] = None, client: Optional[Client] = None):\nself._init_common(client=client)\nself._model: Optional[mConnector] = self._create_model(\nconnector_type, credentials, name, client=client)\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\n@property\ndef uid(self) -> UID:\nreturn self._model.uid\n@property\ndef type(self) -> ConnectorType:\nreturn self._model.type\n@staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Get an existing connector.\n        Arguments:\n            uid (UID): Connector identifier\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            Connector\n        \"\"\"\nconnectors: ConnectorsList = Connector.list(client=client)\ndata = connectors.get_by_uid(uid)\nmodel = mConnector(**data)\nconnector = ModelFactoryMixin._init_from_model_data(Connector, model)\nreturn connector\n@staticmethod\ndef _init_connector_type(connector_type: Union[ConnectorType, str]) -> ConnectorType:\nif isinstance(connector_type, str):\ntry:\nconnector_type = ConnectorType(connector_type)\nexcept Exception:\nc_list = \", \".join([c.value for c in ConnectorType])\nraise InvalidConnectorError(\nf\"ConnectorType '{connector_type}' does not exist.\\nValid connector types are: {c_list}.\")\nreturn connector_type\n@staticmethod\ndef _init_credentials(connector_type: ConnectorType, credentials: Union[str, Path, Dict, Credentials]) -> Credentials:\n_credentials = None\nif isinstance(credentials, str):\ncredentials = Path(credentials)\nif isinstance(credentials, Path):\ntry:\n_credentials = json_loads(credentials.open().read())\nexcept Exception:\nraise CredentialTypeError(\n'Could not read the credentials. Please, check your path or credentials structure.')\ntry:\nfrom ydata.sdk.connectors._models.connector_map import TYPE_TO_CLASS\ncredential_cls = TYPE_TO_CLASS.get(connector_type.value)\n_credentials = credential_cls(**_credentials)\nexcept Exception:\nraise CredentialTypeError(\n\"Could not create the credentials. 
Verify the path or the structure your credentials.\")\nreturn _credentials\n@staticmethod\ndef create(connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Create a new connector.\n        Arguments:\n            connector_type (Union[ConnectorType, str]): Type of the connector to be created\n            credentials (dict): Connector credentials\n            name (Optional[str]): (optional) Connector name\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            New connector\n        \"\"\"\nmodel = Connector._create_model(\nconnector_type=connector_type, credentials=credentials, name=name, client=client)\nconnector = ModelFactoryMixin._init_from_model_data(\nConnector, model)\nreturn connector\n@classmethod\n@init_client\ndef _create_model(cls, connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> mConnector:\n_name = name if name is not None else str(uuid4())\n_connector_type = Connector._init_connector_type(connector_type)\n_credentials = Connector._init_credentials(_connector_type, credentials)\npayload = {\n\"type\": _connector_type.value,\n\"credentials\": _credentials.dict(by_alias=True),\n\"name\": _name\n}\nresponse = client.post('/connector/', json=payload)\ndata: list = response.json()\nreturn mConnector(**data)\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> ConnectorsList:\n\"\"\"List the connectors instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of connectors\n        \"\"\"\nresponse = client.get('/connector')\ndata: list = response.json()\nreturn ConnectorsList(data)\ndef __repr__(self):\nreturn self._model.__repr__()\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.create","title":"create(connector_type, credentials, name=None, client=None) staticmethod","text":"

    Create a new connector.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    required credentials dict

    Connector credentials

    required name Optional[str]

    (optional) Connector name

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description Connector

    New connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\ndef create(connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials], name: Optional[str] = None, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Create a new connector.\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        New connector\n    \"\"\"\nmodel = Connector._create_model(\nconnector_type=connector_type, credentials=credentials, name=name, client=client)\nconnector = ModelFactoryMixin._init_from_model_data(\nConnector, model)\nreturn connector\n
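
    A usage sketch for create (the connector type, credentials path and name below are illustrative):

    from ydata.sdk.connectors import Connector\nconnector = Connector.create(connector_type='gcs', credentials='{insert-credentials-file-path}', name='my-gcs-connector')\n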
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.get","title":"get(uid, client=None) staticmethod","text":"

    Get an existing connector.

    Parameters:

    Name Type Description Default uid UID

    Connector identifier

    required client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description Connector

    Connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"Connector\":\n\"\"\"Get an existing connector.\n    Arguments:\n        uid (UID): Connector identifier\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        Connector\n    \"\"\"\nconnectors: ConnectorsList = Connector.list(client=client)\ndata = connectors.get_by_uid(uid)\nmodel = mConnector(**data)\nconnector = ModelFactoryMixin._init_from_model_data(Connector, model)\nreturn connector\n
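
    For illustration, retrieving an existing connector by its identifier could look like this (the UID is a placeholder):

    from ydata.sdk.connectors import Connector\nconnector = Connector.get('<CONNECTOR_UID>')\n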
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.list","title":"list(client=None) staticmethod","text":"

    List the connectors instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description ConnectorsList

    List of connectors

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> ConnectorsList:\n\"\"\"List the connectors instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of connectors\n    \"\"\"\nresponse = client.get('/connector')\ndata: list = response.json()\nreturn ConnectorsList(data)\n
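
    For example, assuming a default client can be created (e.g. from an authentication token available in the environment):

        from ydata.sdk.connectors import Connector

        connectors = Connector.list()   # returns a ConnectorsList
        print(connectors)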
    "},{"location":"sdk/reference/api/connectors/connector/#connectortype","title":"ConnectorType","text":"

    Bases: Enum

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AWS_S3","title":"AWS_S3 = 'aws-s3' class-attribute instance-attribute","text":"

    AWS S3 connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_BLOB","title":"AZURE_BLOB = 'azure-blob' class-attribute instance-attribute","text":"

    Azure Blob connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_SQL","title":"AZURE_SQL = 'azure-sql' class-attribute instance-attribute","text":"

    AzureSQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.BIGQUERY","title":"BIGQUERY = 'google-bigquery' class-attribute instance-attribute","text":"

    BigQuery connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.FILE","title":"FILE = 'file' class-attribute instance-attribute","text":"

    File connector (placeholder)

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.GCS","title":"GCS = 'gcs' class-attribute instance-attribute","text":"

    Google Cloud Storage connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.MYSQL","title":"MYSQL = 'mysql' class-attribute instance-attribute","text":"

    MySQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.SNOWFLAKE","title":"SNOWFLAKE = 'snowflake' class-attribute instance-attribute","text":"

    Snowflake connector

    "},{"location":"sdk/reference/api/datasources/datasource/","title":"DataSource","text":"

    Bases: ModelFactoryMixin

    A DataSource represents a dataset to be used by a Synthesizer as training data.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Attributes:

    Name Type Description uid UID

    UID of the datasource instance

    datatype DataSourceType

    Data source type

    status Status

    Status of the datasource

    metadata Metadata

    Metadata associated with the datasource

    Source code in ydata/sdk/datasources/datasource.py
    class DataSource(ModelFactoryMixin):\n\"\"\"A [`DataSource`][ydata.sdk.datasources.DataSource] represents a dataset\n    to be used by a Synthesizer as training data.\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n    Attributes:\n        uid (UID): UID fo the datasource instance\n        datatype (DataSourceType): Data source type\n        status (Status): Status of the datasource\n        metadata (Metadata): Metadata associated to the datasource\n    \"\"\"\ndef __init__(self, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config):\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nself._init_common(client=client)\nself._model: Optional[mDataSource] = self._create_model(\nconnector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, client=self._client)\nif wait_for_metadata:\nself._model = DataSource._wait_for_metadata(self)._model\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\n@property\ndef uid(self) -> UID:\nreturn self._model.uid\n@property\ndef datatype(self) -> DataSourceType:\nreturn self._model.datatype\n@property\ndef status(self) -> Status:\ntry:\nself._model = self.get(self._model.uid, self._client)._model\nreturn self._model.status\nexcept Exception:  # noqa: PIE786\nreturn Status.UNKNOWN\n@property\ndef metadata(self) -> Metadata:\nreturn self._model.metadata\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> DataSourceList:\n\"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n        instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of datasources\n        \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/datasource')\ndata: list = response.json()\ndata = __process_data(data)\nreturn DataSourceList(data)\n@staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"DataSource\":\n\"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            uid (UID): DataSource identifier\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            DataSource\n        \"\"\"\nresponse = client.get(f'/datasource/{uid}')\ndata: list = response.json()\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(\nConnectorType(data['connector']['type']))\nmodel = DataSource._model_from_api(data, datasource_type)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nreturn datasource\n@classmethod\ndef create(cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config) -> \"DataSource\":\n\"\"\"Create a new 
[`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            connector (Connector): Connector from which the datasource is created\n            datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n            name (Optional[str]): (optional) DataSource name\n            wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n            client (Client): (optional) Client to connect to the backend\n            **config: Datasource specific configuration\n        Returns:\n            DataSource\n        \"\"\"\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nreturn cls._create(connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, wait_for_metadata=wait_for_metadata, client=client)\n@classmethod\ndef _create(cls, connector: Connector, datasource_type: Type[mDataSource], datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None) -> \"DataSource\":\nmodel = DataSource._create_model(\nconnector, datasource_type, datatype, config, name, client)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nif wait_for_metadata:\ndatasource._model = DataSource._wait_for_metadata(datasource)._model\nreturn datasource\n@classmethod\n@init_client\ndef _create_model(cls, connector: Connector, datasource_type: Type[mDataSource], datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None, name: Optional[str] = None, client: Optional[Client] = None) -> mDataSource:\n_name = name if name is not None else str(uuid4())\n_config = config if config is not None else {}\npayload = {\n\"name\": _name,\n\"connector\": {\n\"uid\": connector.uid,\n\"type\": connector.type.value\n},\n\"dataType\": datatype.value\n}\nif connector.type != ConnectorType.FILE:\n_config = datasource_type(**config).to_payload()\npayload.update(_config)\nresponse = client.post('/datasource/', json=payload)\ndata: list = response.json()\nreturn DataSource._model_from_api(data, datasource_type)\n@staticmethod\ndef _wait_for_metadata(datasource):\nlogger = create_logger(__name__, level=LOG_LEVEL)\nwhile datasource.status not in [Status.AVAILABLE, Status.FAILED, Status.UNAVAILABLE]:\nlogger.info(f'Calculating metadata [{datasource.status}]')\ndatasource = DataSource.get(uid=datasource.uid, client=datasource._client)\nsleep(BACKOFF)\nreturn datasource\n@staticmethod\ndef _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:\ndata['datatype'] = data.pop('dataType', None)\ndata = filter_dict(datasource_type, data)\nmodel = datasource_type(**data)\nreturn model\ndef __repr__(self):\nreturn self._model.__repr__()\n
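
    A minimal creation sketch (the connector UID and datasource name are placeholders; connector-specific options, if any, are passed through **config):

        from ydata.sdk.connectors import Connector
        from ydata.sdk.datasources import DataSource, DataSourceType

        connector = Connector.get(uid="<connector-uid>")   # placeholder UID
        datasource = DataSource(connector, datatype=DataSourceType.TABULAR, name="my-datasource")
        print(datasource.status)   # AVAILABLE once the metadata has been calculated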
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.create","title":"create(connector, datatype=DataSourceType.TABULAR, name=None, wait_for_metadata=True, client=None, **config) classmethod","text":"

    Create a new DataSource.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @classmethod\ndef create(cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, name: Optional[str] = None, wait_for_metadata: bool = True, client: Optional[Client] = None, **config) -> \"DataSource\":\n\"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n    Returns:\n        DataSource\n    \"\"\"\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\nreturn cls._create(connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name, wait_for_metadata=wait_for_metadata, client=client)\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.get","title":"get(uid, client=None) staticmethod","text":"

    Get an existing DataSource.

    Parameters:

    Name Type Description Default uid UID

    DataSource identifier

    required client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef get(uid: UID, client: Optional[Client] = None) -> \"DataSource\":\n\"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        uid (UID): DataSource identifier\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        DataSource\n    \"\"\"\nresponse = client.get(f'/datasource/{uid}')\ndata: list = response.json()\ndatasource_type = CONNECTOR_TO_DATASOURCE.get(\nConnectorType(data['connector']['type']))\nmodel = DataSource._model_from_api(data, datasource_type)\ndatasource = ModelFactoryMixin._init_from_model_data(DataSource, model)\nreturn datasource\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.list","title":"list(client=None) staticmethod","text":"

    List the DataSource instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSourceList

    List of datasources

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> DataSourceList:\n\"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n    instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of datasources\n    \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/datasource')\ndata: list = response.json()\ndata = __process_data(data)\nreturn DataSourceList(data)\n
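
    For example (the UID is a placeholder):

        from ydata.sdk.datasources import DataSource

        datasources = DataSource.list()                      # DataSourceList of existing datasources
        datasource = DataSource.get(uid="<datasource-uid>")  # placeholder UID
        print(datasource.datatype, datasource.status)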
    "},{"location":"sdk/reference/api/datasources/datasource/#status","title":"Status","text":"

    Bases: BaseModel

    "},{"location":"sdk/reference/api/datasources/datasource/#datasourcetype","title":"DataSourceType","text":"

    Bases: StringEnum

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TABULAR","title":"TABULAR = 'tabular' class-attribute instance-attribute","text":"

    The DataSource is tabular (i.e. it does not have a temporal dimension).

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TIMESERIES","title":"TIMESERIES = 'timeseries' class-attribute instance-attribute","text":"

    The DataSource has a temporal dimension.

    "},{"location":"sdk/reference/api/datasources/metadata/","title":"Metadata","text":"

    Bases: BaseModel

    The Metadata object contains descriptive information about a DataSource.

    Attributes:

    Name Type Description columns List[Column]

    columns information
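
    A small sketch of inspecting the metadata of an existing datasource (the UID is a placeholder; the column attribute names name, datatype and vartype follow their usage in the synthesizer source code below and are an assumption here):

        from ydata.sdk.datasources import DataSource

        datasource = DataSource.get(uid="<datasource-uid>")  # placeholder UID
        for column in datasource.metadata.columns:
            print(column.name, column.datatype, column.vartype)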

    "},{"location":"sdk/reference/api/synthesizers/base/","title":"Synthesizer","text":"

    Bases: ABC, ModelFactoryMixin

    Main synthesizer class.

    This class cannot be instantiated directly because the sample method differs between RegularSynthesizer, TimeSeriesSynthesizer and MultiTableSynthesizer.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    @typechecked\nclass BaseSynthesizer(ABC, ModelFactoryMixin):\n\"\"\"Main synthesizer class.\n    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer], [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] or [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer] `sample` methods.\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\ndef __init__(\nself, uid: Optional[UID] = None, name: Optional[str] = None,\nproject: Optional[Project] = None, client: Optional[Client] = None):\nself._init_common(client=client)\nself._model = mSynthesizer(uid=uid, name=name or str(uuid4()))\nself._project = project\n@init_client\ndef _init_common(self, client: Optional[Client] = None):\nself._client = client\nself._logger = create_logger(__name__, level=LOG_LEVEL)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n        The argument `exclude_cols` has precedence over `generate_cols`, i.e. 
a column `col` will not be generated if it is in both list.\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target for the dataset\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nif self._is_initialized():\nraise AlreadyFittedError()\n_datatype = DataSourceType(datatype) if isinstance(\nX, pdDataFrame) else DataSourceType(X.datatype)\ndataset_attrs = self._init_datasource_attributes(\nsortbykey, entities, generate_cols, exclude_cols, dtypes)\nself._validate_datasource_attributes(X, dataset_attrs, _datatype, target)\n# If the training data is a pandas dataframe, we first need to create a data source and then the instance\nif isinstance(X, pdDataFrame):\nif X.empty:\nraise EmptyDataError(\"The DataFrame is empty\")\n_X = LocalDataSource(source=X, datatype=_datatype, client=self._client)\nelse:\nif datatype != _datatype:\nwarn(\"When the training data is a DataSource, the argument `datatype` is ignored.\",\nDataSourceTypeWarning)\n_X = X\nif _X.status != dsStatus.AVAILABLE:\nraise DataSourceNotAvailableError(\nf\"The datasource '{_X.uid}' is not available (status = {_X.status.value})\")\nif isinstance(dataset_attrs, dict):\ndataset_attrs = DataSourceAttrs(**dataset_attrs)\nself._fit_from_datasource(\nX=_X, dataset_attrs=dataset_attrs, target=target,\nanonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n@staticmethod\ndef _init_datasource_attributes(\nsortbykey: Optional[Union[str, List[str]]],\nentities: Optional[Union[str, List[str]]],\ngenerate_cols: Optional[List[str]],\nexclude_cols: Optional[List[str]],\ndtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:\ndataset_attrs = {\n'sortbykey': sortbykey if sortbykey is not None else [],\n'entities': entities if entities is not None else [],\n'generate_cols': generate_cols if generate_cols is not None else [],\n'exclude_cols': exclude_cols if exclude_cols is not None else [],\n'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}\n}\nreturn DataSourceAttrs(**dataset_attrs)\n@staticmethod\ndef _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):\ncolumns = []\nif isinstance(X, pdDataFrame):\ncolumns = X.columns\nif datatype is None:\nraise DataTypeMissingError(\n\"Argument `datatype` is mandatory for pandas.DataFrame training data\")\ndatatype = DataSourceType(datatype)\nelse:\ncolumns = [c.name for c in 
X.metadata.columns]\nif target is not None and target not in columns:\nraise DataSourceAttrsError(\n\"Invalid target: column '{target}' does not exist\")\nif datatype == DataSourceType.TIMESERIES:\nif not dataset_attrs.sortbykey:\nraise DataSourceAttrsError(\n\"The argument `sortbykey` is mandatory for timeseries datasource.\")\ninvalid_fields = {}\nfor field, v in dataset_attrs.dict().items():\nfield_columns = v if field != 'dtypes' else v.keys()\nnot_in_cols = [c for c in field_columns if c not in columns]\nif len(not_in_cols) > 0:\ninvalid_fields[field] = not_in_cols\nif len(invalid_fields) > 0:\nerror_msgs = [\"\\t- Field '{}': columns {} do not exist\".format(\nf, ', '.join(v)) for f, v in invalid_fields.items()]\nraise DataSourceAttrsError(\n\"The dataset attributes are invalid:\\n {}\".format('\\n'.join(error_msgs)))\n@staticmethod\ndef _metadata_to_payload(\ndatatype: DataSourceType, ds_metadata: Metadata,\ndataset_attrs: Optional[DataSourceAttrs] = None, target: Optional[str] = None\n) -> dict:\n\"\"\"Transform a the metadata and dataset attributes into a valid\n        payload.\n        Arguments:\n            datatype (DataSourceType): datasource type\n            ds_metadata (Metadata): datasource metadata object\n            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes\n            target (Optional[str]): (optional) target column name\n        Returns:\n            metadata payload dictionary\n        \"\"\"\ncolumns = [\n{\n'name': c.name,\n'generation': True and c.name not in dataset_attrs.exclude_cols,\n'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,\n'varType': c.vartype,\n}\nfor c in ds_metadata.columns]\nmetadata = {\n'columns': columns,\n'target': target\n}\nif dataset_attrs is not None:\nif datatype == DataSourceType.TIMESERIES:\nmetadata['sortBy'] = [c for c in dataset_attrs.sortbykey]\nmetadata['entity'] = [c for c in dataset_attrs.entities]\nreturn metadata\ndef _fit_from_datasource(\nself,\nX: DataSource,\nprivacy_level: Optional[PrivacyLevel] = None,\ndataset_attrs: Optional[DataSourceAttrs] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None\n) -> None:\npayload = self._create_payload()\npayload['dataSourceUID'] = X.uid\nif privacy_level:\npayload['privacy_level'] = privacy_level.value\nif X.metadata is not None and X.datatype is not None:\npayload['metadata'] = self._metadata_to_payload(\nDataSourceType(X.datatype), X.metadata, dataset_attrs, target)\nif anonymize is not None:\npayload[\"extraData\"][\"anonymize\"] = anonymize\nif condition_on is not None:\npayload[\"extraData\"][\"condition_on\"] = condition_on\nresponse = self._client.post(\n'/synthesizer/', json=payload, project=self._project)\ndata = response.json()\nself._model = mSynthesizer(**data)\nwhile self._check_fitting_not_finished(self.status):\nself._logger.info('Training the synthesizer...')\nsleep(BACKOFF)\ndef _create_payload(self) -> dict:\npayload = {\n'extraData': {}\n}\nif self._model and self._model.name:\npayload['name'] = self._model.name\nreturn payload\ndef _check_fitting_not_finished(self, status: Status) -> bool:\nself._logger.debug(f'checking status {status}')\nif status.state in [Status.State.READY, Status.State.REPORT]:\nreturn False\nself._logger.debug(f'status not ready yet {status.state}')\nif status.prepare and PrepareState(status.prepare.state) == PrepareState.FAILED:\nraise FittingError('Could not train the 
synthesizer')\nif status.training and TrainingState(status.training.state) == TrainingState.FAILED:\nraise FittingError('Could not train the synthesizer')\nreturn True\n@abstractmethod\ndef sample(self) -> pdDataFrame:\n\"\"\"Abstract method to sample from a synthesizer.\"\"\"\ndef _sample(self, payload: Dict) -> pdDataFrame:\n\"\"\"Sample from a synthesizer.\n        Arguments:\n            payload (dict): payload configuring the sample request\n        Returns:\n            pandas `DataFrame`\n        \"\"\"\nresponse = self._client.post(\nf\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\ndata: Dict = response.json()\nsample_uid = data.get('uid')\nsample_status = None\nwhile sample_status not in ['finished', 'failed']:\nself._logger.info('Sampling from the synthesizer...')\nresponse = self._client.get(\nf'/synthesizer/{self.uid}/history', project=self._project)\nhistory: Dict = response.json()\nsample_data = next((s for s in history if s.get('uid') == sample_uid), None)\nsample_status = sample_data.get('status', {}).get('state')\nsleep(BACKOFF)\nresponse = self._client.get_static_file(\nf'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self._project)\ndata = StringIO(response.content.decode())\nreturn read_csv(data)\n@property\ndef uid(self) -> UID:\n\"\"\"Get the status of a synthesizer instance.\n        Returns:\n            Synthesizer status\n        \"\"\"\nif not self._is_initialized():\nreturn Status.State.NOT_INITIALIZED\nreturn self._model.uid\n@property\ndef status(self) -> Status:\n\"\"\"Get the status of a synthesizer instance.\n        Returns:\n            Synthesizer status\n        \"\"\"\nif not self._is_initialized():\nreturn Status.not_initialized()\ntry:\nself = self.get()\nreturn self._model.status\nexcept Exception:  # noqa: PIE786\nreturn Status.unknown()\ndef get(self):\nassert self._is_initialized() and self._model.uid, InputError(\n\"Please provide the synthesizer `uid`\")\nresponse = self._client.get(f'/synthesizer/{self.uid}', project=self._project)\ndata = response.json()\nself._model = mSynthesizer(**data)\nreturn self\n@staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n\"\"\"List the synthesizer instances.\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n        Returns:\n            List of synthesizers\n        \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata', 'report', 'mode']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/synthesizer')\ndata: list = response.json()\ndata = __process_data(data)\nreturn SynthesizersList(data)\ndef _is_initialized(self) -> bool:\n\"\"\"Determine if a synthesizer is instanciated or not.\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\nreturn self._model is not None\n@staticmethod\ndef _resolve_api_status(api_status: Dict) -> Status:\n\"\"\"Determine the status of the Synthesizer.\n        The status of the synthesizer instance is determined by the state of\n        its different components.\n        Arguments:\n            api_status (dict): json from the endpoint GET /synthesizer\n        Returns:\n            Synthesizer Status\n        \"\"\"\nstatus = Status(api_status.get('state', Status.UNKNOWN.name))\nif status == Status.PREPARE:\nif PrepareState(api_status.get('prepare', {}).get(\n'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:\nreturn Status.FAILED\nelif status == 
Status.TRAIN:\nif TrainingState(api_status.get('training', {}).get(\n'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:\nreturn Status.FAILED\nelif status == Status.REPORT:\nreturn Status.READY\nreturn status\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.status","title":"status: Status property","text":"

    Get the status of a synthesizer instance.

    Returns:

    Type Description Status

    Synthesizer status

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.uid","title":"uid: UID property","text":"

    Get the UID of a synthesizer instance.

    Returns:

    Type Description UID

    Synthesizer UID

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource. When the training dataset is a pandas DataFrame, the argument datatype is required as it cannot be deduced.

    The argument sortbykey is mandatory for TimeSeries.

    By default, if generate_cols or exclude_cols are not specified, all columns are generated by the synthesizer. The argument exclude_cols has precedence over generate_cols, i.e. a column col will not be generated if it appears in both lists.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY datatype Optional[Union[DataSourceType, str]]

    (optional) Dataset datatype - required if X is a pandas.DataFrame

    None sortbykey Union[str, List[str]]

    (optional) column(s) to use to sort timeseries datasets

    None entities Union[str, List[str]]

    (optional) columns representing entity IDs

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target for the dataset

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    def fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n    The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target for the dataset\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\nif self._is_initialized():\nraise AlreadyFittedError()\n_datatype = DataSourceType(datatype) if isinstance(\nX, pdDataFrame) else DataSourceType(X.datatype)\ndataset_attrs = self._init_datasource_attributes(\nsortbykey, entities, generate_cols, exclude_cols, dtypes)\nself._validate_datasource_attributes(X, dataset_attrs, _datatype, target)\n# If the training data is a pandas dataframe, we first need to create a data source and then the instance\nif isinstance(X, pdDataFrame):\nif X.empty:\nraise EmptyDataError(\"The DataFrame is empty\")\n_X = LocalDataSource(source=X, datatype=_datatype, client=self._client)\nelse:\nif datatype != _datatype:\nwarn(\"When the training data is a DataSource, the argument `datatype` is ignored.\",\nDataSourceTypeWarning)\n_X = X\nif _X.status != dsStatus.AVAILABLE:\nraise DataSourceNotAvailableError(\nf\"The datasource '{_X.uid}' is not available (status = {_X.status.value})\")\nif isinstance(dataset_attrs, dict):\ndataset_attrs = DataSourceAttrs(**dataset_attrs)\nself._fit_from_datasource(\nX=_X, dataset_attrs=dataset_attrs, target=target,\nanonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n
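
    As an illustration, fitting a RegularSynthesizer on an existing datasource (the UID and the excluded column name are placeholders):

        from ydata.sdk.datasources import DataSource
        from ydata.sdk.synthesizers import RegularSynthesizer

        datasource = DataSource.get(uid="<datasource-uid>")   # placeholder UID
        synth = RegularSynthesizer(name="demo-synth")
        # Exclude an identifier column from generation (illustrative column name).
        synth.fit(datasource, exclude_cols=["customer_id"])
        print(synth.status)   # fit blocks until training finishes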
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.list","title":"list(client=None) staticmethod","text":"

    List the synthesizer instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description SynthesizersList

    List of synthesizers

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n\"\"\"List the synthesizer instances.\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    Returns:\n        List of synthesizers\n    \"\"\"\ndef __process_data(data: list) -> list:\nto_del = ['metadata', 'report', 'mode']\nfor e in data:\nfor k in to_del:\ne.pop(k, None)\nreturn data\nresponse = client.get('/synthesizer')\ndata: list = response.json()\ndata = __process_data(data)\nreturn SynthesizersList(data)\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.sample","title":"sample() abstractmethod","text":"

    Abstract method to sample from a synthesizer.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @abstractmethod\ndef sample(self) -> pdDataFrame:\n\"\"\"Abstract method to sample from a synthesizer.\"\"\"\n
    "},{"location":"sdk/reference/api/synthesizers/base/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy
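
    A minimal sketch of selecting a privacy level at training time (the toy DataFrame is illustrative only):

        import pandas as pd
        from ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer

        df = pd.DataFrame({"age": [25, 32, 47], "income": [40000, 52000, 61000]})  # toy data
        synth = RegularSynthesizer()
        synth.fit(df, privacy_level=PrivacyLevel.HIGH_PRIVACY)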

    "},{"location":"sdk/reference/api/synthesizers/multitable/","title":"Multitable","text":"

    Bases: BaseSynthesizer

    MultiTable synthesizer class.

    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    Name Type Description Default write_connector UID

    Connector of type RDBMS to be used to write the samples

    required name str

    (optional) Name to be used when creating the synthesizer. Calculated internally if not provided

    None client Client

    (optional) Client to connect to the backend

    None Source code in ydata/sdk/synthesizers/multitable.py
    class MultiTableSynthesizer(BaseSynthesizer):\n\"\"\"MultiTable synthesizer class.\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n    Arguments:\n        write_connector (UID): Connector of type RDBMS to be used to write the samples\n        name (str): (optional) Name to be used when creating the synthesizer. Calculated internally if not provided\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\ndef __init__(\nself, write_connector: Union[Connector, UID], uid: Optional[UID] = None, name: Optional[str] = None,\nproject: Optional[Project] = None, client: Optional[Client] = None):\nsuper().__init__(uid, name, project, client)\nconnector = self._check_or_fetch_connector(write_connector)\nself.__write_connector = connector.uid\ndef fit(self, X: DataSource,\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Except X, all the other arguments are for now ignored until they are supported.\n        Arguments:\n            X (DataSource): DataSource to Train\n        \"\"\"\nself._fit_from_datasource(X)\ndef sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n\"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n        instance.\n        The sample is saved in the connector that was provided in the synthesizer initialization\n        or in the\n        Arguments:\n            frac (int | float): fraction of the sample to be returned\n        \"\"\"\nassert frac >= 0.1, InputError(\n\"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\nassert frac <= 5, InputError(\n\"It is not possible to generate a database that is 5x bigger than the original dataset. 
Please validate the input provided.\")\npayload = {\n'fraction': frac,\n}\nif write_connector is not None:\nconnector = self._check_or_fetch_connector(write_connector)\npayload['writeConnector'] = connector.uid\nresponse = self._client.post(\nf\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\ndata = response.json()\nsample_uid = data.get('uid')\nsample_status = None\nwhile sample_status not in ['finished', 'failed']:\nself._logger.info('Sampling from the synthesizer...')\nresponse = self._client.get(\nf'/synthesizer/{self.uid}/history', project=self._project)\nhistory = response.json()\nsample_data = next((s for s in history if s.get('uid') == sample_uid), None)\nsample_status = sample_data.get('status', {}).get('state')\nsleep(BACKOFF)\nprint(\nf\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\ndef _create_payload(self) -> dict:\npayload = super()._create_payload()\npayload['writeConnector'] = self.__write_connector\nreturn payload\ndef _check_or_fetch_connector(self, write_connector: Union[Connector, UID]) -> Connector:\nself._logger.debug(f'Write connector is {write_connector}')\nif isinstance(write_connector, str):\nself._logger.debug(f'Write connector is of type `UID` {write_connector}')\nwrite_connector = Connector.get(write_connector)\nself._logger.debug(f'Using fetched connector {write_connector}')\nif write_connector.uid is None:\nraise InputError(\"Invalid connector provided as input for write\")\nif write_connector.type not in [ConnectorType.AZURE_SQL, ConnectorType.MYSQL, ConnectorType.SNOWFLAKE]:\nraise ConnectorError(\nf\"Invalid type `{write_connector.type}` for the provided connector\")\nreturn write_connector\n
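
    A minimal sketch (both UIDs are placeholders; the write connector must be an RDBMS connector such as AZURE_SQL, MYSQL or SNOWFLAKE):

        from ydata.sdk.datasources import DataSource
        from ydata.sdk.synthesizers import MultiTableSynthesizer

        datasource = DataSource.get(uid="<multi-table-datasource-uid>")   # placeholder UID
        synth = MultiTableSynthesizer(write_connector="<rdbms-connector-uid>")
        synth.fit(datasource)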
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts a YData DataSource as training dataset. Except for X, all the other arguments are ignored for now, until they are supported.

    Parameters:

    Name Type Description Default X DataSource

    DataSource to train on

    required Source code in ydata/sdk/synthesizers/multitable.py
    def fit(self, X: DataSource,\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\ndatatype: Optional[Union[DataSourceType, str]] = None,\nsortbykey: Optional[Union[str, List[str]]] = None,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Except X, all the other arguments are for now ignored until they are supported.\n    Arguments:\n        X (DataSource): DataSource to Train\n    \"\"\"\nself._fit_from_datasource(X)\n
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.sample","title":"sample(frac=1, write_connector=None)","text":"

    Sample from a MultiTableSynthesizer instance. The sample is saved in the connector provided at synthesizer initialization or, if given, in the connector passed through the write_connector argument.

    Parameters:

    Name Type Description Default frac int | float

    fraction of the sample to be returned

    1 Source code in ydata/sdk/synthesizers/multitable.py
    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n\"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n    instance.\n    The sample is saved in the connector that was provided in the synthesizer initialization\n    or in the\n    Arguments:\n        frac (int | float): fraction of the sample to be returned\n    \"\"\"\nassert frac >= 0.1, InputError(\n\"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\nassert frac <= 5, InputError(\n\"It is not possible to generate a database that is 5x bigger than the original dataset. Please validate the input provided.\")\npayload = {\n'fraction': frac,\n}\nif write_connector is not None:\nconnector = self._check_or_fetch_connector(write_connector)\npayload['writeConnector'] = connector.uid\nresponse = self._client.post(\nf\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\ndata = response.json()\nsample_uid = data.get('uid')\nsample_status = None\nwhile sample_status not in ['finished', 'failed']:\nself._logger.info('Sampling from the synthesizer...')\nresponse = self._client.get(\nf'/synthesizer/{self.uid}/history', project=self._project)\nhistory = response.json()\nsample_data = next((s for s in history if s.get('uid') == sample_uid), None)\nsample_status = sample_data.get('status', {}).get('state')\nsleep(BACKOFF)\nprint(\nf\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\n
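
    Continuing the sketch from the class description above, a sample request (frac must lie between 0.1 and 5; the optional connector UID is a placeholder):

        # Request roughly twice the volume of the original database and write it to another connector.
        synth.sample(frac=2, write_connector="<other-rdbms-connector-uid>")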
    "},{"location":"sdk/reference/api/synthesizers/regular/","title":"Regular","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/regular.py
    class RegularSynthesizer(BaseSynthesizer):\ndef sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n        instance.\n        Arguments:\n            n_samples (int): number of rows in the sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n        Returns:\n            synthetic data\n        \"\"\"\nif n_samples < 1:\nraise InputError(\"Parameter 'n_samples' must be greater than 0\")\npayload = {\"numberOfRecords\": n_samples}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target column\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\ngenerate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\ntarget=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\ndef __repr__(self):\nif self._model is not None:\nreturn self._model.__repr__()\nelse:\nreturn \"RegularSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY entities Union[str, List[str]]

    (optional) columns representing entity IDs

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target column

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/regular.py
    def fit(self, X: Union[DataSource, pdDataFrame],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target column\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\ngenerate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\ntarget=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.sample","title":"sample(n_samples=1, condition_on=None)","text":"

    Sample from a RegularSynthesizer instance.

    Parameters:

    Name Type Description Default n_samples int

    number of rows in the sample

    1 condition_on Optional[dict]

    (optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/regular.py
    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n    instance.\n    Arguments:\n        n_samples (int): number of rows in the sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n    Returns:\n        synthetic data\n    \"\"\"\nif n_samples < 1:\nraise InputError(\"Parameter 'n_samples' must be greater than 0\")\npayload = {\"numberOfRecords\": n_samples}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\n
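
    A small end-to-end sketch with a toy DataFrame (illustrative data only):

        import pandas as pd
        from ydata.sdk.synthesizers import RegularSynthesizer

        df = pd.DataFrame({"age": [25, 32, 47], "city": ["Porto", "Lisbon", "Braga"]})  # toy data
        synth = RegularSynthesizer()
        synth.fit(df)
        sample = synth.sample(n_samples=100)   # pandas DataFrame with 100 synthetic rows
        print(sample.head())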
    "},{"location":"sdk/reference/api/synthesizers/regular/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"sdk/reference/api/synthesizers/timeseries/","title":"TimeSeries","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/timeseries.py
    class TimeSeriesSynthesizer(BaseSynthesizer):\ndef sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n\"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n        If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n        A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n        Arguments:\n            n_entities (int): number of entities to sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n        Returns:\n            synthetic data\n        \"\"\"\nif n_entities is not None and n_entities < 1:\nraise InputError(\"Parameter 'n_entities' must be greater than 0\")\npayload = {\"numberOfRecords\": n_entities}\nif condition_on is not None:\npayload[\"extraData\"] = {\n\"condition_on\": condition_on\n}\nreturn self._sample(payload=payload)\ndef fit(self, X: Union[DataSource, pdDataFrame],\nsortbykey: Optional[Union[str, List[str]]],\nprivacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\nentities: Optional[Union[str, List[str]]] = None,\ngenerate_cols: Optional[List[str]] = None,\nexclude_cols: Optional[List[str]] = None,\ndtypes: Optional[Dict[str, Union[str, DataType]]] = None,\ntarget: Optional[str] = None,\nanonymize: Optional[dict] = None,\ncondition_on: Optional[List[str]] = None) -> None:\n\"\"\"Fit the synthesizer.\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Metadata associated to the datasource\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\nBaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\nentities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\ndtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\ncondition_on=condition_on)\ndef __repr__(self):\nif self._model is not None:\nreturn self._model.__repr__()\nelse:\nreturn \"TimeSeriesSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.fit","title":"fit(X, sortbykey, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts either a pandas DataFrame directly or a YData DataSource as its training dataset.

    Parameters:

    X (Union[DataSource, DataFrame]): Training dataset. Required.
    sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets. Required.
    privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity). Default: HIGH_FIDELITY.
    entities (Union[str, List[str]]): (optional) columns representing entity IDs. Default: None.
    generate_cols (List[str]): (optional) columns that should be synthesized. Default: None.
    exclude_cols (List[str]): (optional) columns that should not be synthesized. Default: None.
    dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes. Default: None.
    target (Optional[str]): (optional) metadata associated with the datasource. Default: None.
    name (Optional[str]): (optional) Synthesizer instance name (documented in the docstring but not part of the fit() signature shown above).
    anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy. Default: None.
    condition_on (Optional[List[str]]): (optional) list of features to condition upon. Default: None.

    Source code in ydata/sdk/synthesizers/timeseries.py
def fit(self, X: Union[DataSource, pdDataFrame],
        sortbykey: Optional[Union[str, List[str]]],
        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,
        entities: Optional[Union[str, List[str]]] = None,
        generate_cols: Optional[List[str]] = None,
        exclude_cols: Optional[List[str]] = None,
        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,
        target: Optional[str] = None,
        anonymize: Optional[dict] = None,
        condition_on: Optional[List[str]] = None) -> None:
    """Fit the synthesizer.
    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].
    Arguments:
        X (Union[DataSource, pandas.DataFrame]): Training dataset
        sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets
        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)
        entities (Union[str, List[str]]): (optional) columns representing entities ID
        generate_cols (List[str]): (optional) columns that should be synthesized
        exclude_cols (List[str]): (optional) columns that should not be synthesized
        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes
        target (Optional[str]): (optional) Metadata associated to the datasource
        name (Optional[str]): (optional) Synthesizer instance name
        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy
        condition_on: (Optional[List[str]]): (optional) list of features to condition upon
    """
    BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,
                        entities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,
                        dtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,
                        condition_on=condition_on)
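
    As a sketch of the fit() parameters documented above, the call below combines a privacy level, an excluded column, and a dtype override. The column names and the dtype string are placeholders, not values taken from this reference.

from ydata.sdk.synthesizers import PrivacyLevel, TimeSeriesSynthesizer

synth = TimeSeriesSynthesizer()          # assuming the default constructor
synth.fit(
    data,                                # pandas DataFrame or YData DataSource
    sortbykey="timestamp",               # required: column used to order the series (assumed name)
    privacy_level=PrivacyLevel.HIGH_PRIVACY,
    entities=["device_id"],              # assumed entity ID column
    exclude_cols=["serial_number"],      # assumed column to leave out of synthesis
    dtypes={"reading": "numerical"},     # assumed column-to-DataType mapping
)
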
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.sample","title":"sample(n_entities, condition_on=None)","text":"

    Sample from a TimeSeriesSynthesizer instance.

    If the training dataset did not use any entity column, the Synthesizer assumes a single entity. A TimeSeriesSynthesizer always samples the full trajectory of its entities.

    Parameters:

    n_entities (int): number of entities to sample. Required.
    condition_on (Optional[dict]): (optional) conditional sampling parameters. Default: None.

    Returns:

    DataFrame: synthetic data

    Source code in ydata/sdk/synthesizers/timeseries.py
def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:
    """Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.
    If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.
    A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.
    Arguments:
        n_entities (int): number of entities to sample
        condition_on: (Optional[dict]): (optional) conditional sampling parameters
    Returns:
        synthetic data
    """
    if n_entities is not None and n_entities < 1:
        raise InputError("Parameter 'n_entities' must be greater than 0")
    payload = {"numberOfRecords": n_entities}
    if condition_on is not None:
        payload["extraData"] = {
            "condition_on": condition_on
        }
    return self._sample(payload=payload)
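
    A short usage sketch of sample(): the first call draws complete trajectories for ten entities; the second adds conditional sampling, whose dictionary keys are placeholders, since this reference only documents condition_on as an optional dict.

# n_entities must be at least 1, otherwise the method raises InputError.
synthetic = synth.sample(n_entities=10)

# Conditional sampling; the feature/value pair below is hypothetical.
conditioned = synth.sample(
    n_entities=10,
    condition_on={"region": "EU"},
)
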
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy
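
    Since PrivacyLevel is a string enum, each member's value is the plain string listed above; a minimal sketch, assuming the import path used elsewhere in this reference:

from ydata.sdk.synthesizers import PrivacyLevel

# Each member's value is the documented string constant.
for level in (PrivacyLevel.HIGH_FIDELITY,
              PrivacyLevel.BALANCED_PRIVACY_FIDELITY,
              PrivacyLevel.HIGH_PRIVACY):
    print(level.value)

# A level can be passed to fit(); HIGH_FIDELITY is the default when omitted, e.g.:
# synth.fit(data, sortbykey="timestamp", privacy_level=PrivacyLevel.HIGH_PRIVACY)
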

    "},{"location":"support/help-troubleshooting/","title":"Help & Troubleshooting","text":""}]} \ No newline at end of file diff --git a/0.7/sitemap.xml.gz b/0.7/sitemap.xml.gz index 131d4f117ead9cdfb9a66405dc67dd58bf9f95a4..cd6b531cdfdc9e1a5972c7c7ec2899ba85ed4f34 100644 GIT binary patch delta 13 Ucmb=gXP58h;BeT!Y$AIF039|2Z2$lO delta 13 Ucmb=gXP58h;IM3Np2%JS02_P+4*&oF diff --git a/latest/examples/synthesizer_multitable/index.html b/latest/examples/synthesizer_multitable/index.html new file mode 100644 index 00000000..643d7db5 --- /dev/null +++ b/latest/examples/synthesizer_multitable/index.html @@ -0,0 +1,16 @@ + + + + + Redirecting + + + + + Redirecting to ../../../0.7/examples/synthesizer_multitable/... + + \ No newline at end of file diff --git a/latest/sdk/reference/api/synthesizers/multitable/index.html b/latest/sdk/reference/api/synthesizers/multitable/index.html new file mode 100644 index 00000000..5e157605 --- /dev/null +++ b/latest/sdk/reference/api/synthesizers/multitable/index.html @@ -0,0 +1,16 @@ + + + + + Redirecting + + + + + Redirecting to ../../../../../../0.7/sdk/reference/api/synthesizers/multitable/... + + \ No newline at end of file