docs: add pandas snippets
niekdt committed Sep 27, 2024
1 parent cddb7b9 commit b0c2b6a
Showing 2 changed files with 122 additions and 7 deletions.
80 changes: 76 additions & 4 deletions src/actionsheets/data/python/pandas/pandas.dataframe.toml
@@ -155,7 +155,7 @@ action = "Contains no NA"
code = "data.notna().all().all()"

[test.contains.na.col]
action = "Does column _col_ contain NA value(s)"
action = "Does column _col_ contain any NA values"
code = "data[col].hasnan"

[test.contains.na.col.none]
@@ -166,6 +166,26 @@ code = "~data[col].hasnans"
action = "Do none of the columns _cols_ contain NA values"
code = "data[['col1', 'col2']].notnull().all().all()"

[test.contains.'inf']
action = "Contains any infinity"
code = """
import numpy as np
data.isin([np.inf, -np.inf]).values.any()
"""

[test.contains.'inf'.col]
action = "Does column _col_ contain any infinite values"
code = """
import numpy as np
data['col'].isin([np.inf, -np.inf]).any()
"""

[test.contains.'inf'.col.2]
code = """
import numpy as np
np.isinf(data['col']).any()
"""


[test.duplicate]
section = "Tests for duplicates"
@@ -265,6 +285,11 @@ code = "data.duplicated()"
action = "Mask for duplicates across columns in list _cols_"
code = "data.duplicated(subset=cols)"

[extract.row.count.group]
action = "Extract row count by group _group_"
code = "data.groupby('group').size().reset_index(name='count')"
details = "Returns a data frame with group columns and a respective count column"


[query]
section = "Query"
@@ -360,6 +385,10 @@ details = "Overwrites existing column"
action = "Append undefined categorical column _col_"
code = "data['col'] = pd.Categorical([None] * len(data), categories=['a', 'b', 'c'])"

[update.grow.col.append.str.concat]
action = "Add string column _col_ based on concatenation of two string columns _x_ and _y_, with separator _sep_"
code = "data['col'] = data['x'] + 'sep' + data['y']"

[update.grow.rownum]
action = "Add column with row number (ignore index)"
code = "data['num'] = range(len(data))"
@@ -430,6 +459,12 @@ code = "pd.concat([data, s1, s2], axis=1)"
action = "Insert column"
code = "data.insert(x)"

[derive.grow.col.str.split]
action = "Split string column _col_ into two columns based on separator _sep_"
code = """
data['col'].str.split('sep', n=1, expand=True)
"""


[derive.shrink]
section = "Shrink"
Expand All @@ -441,14 +476,40 @@ section = "Reduce number of rows"
action = "Keep first _n_ rows"
code = "data.head(n)"

[derive.shrink.row.remove.head]
action = "Remove first _n_ rows"
code = "data[n:]"

[derive.shrink.row.tail]
action = "Keep last _n_ rows"
code = "data.tail(n)"

[derive.shrink.row.remove.tail]
action = "Remove last _n_ rows"
code = "data[:-n]"

[derive.shrink.row.pop]
action = "Pop last row (return it and remove it)"
code = """
row = data.iloc[-1]
data = data.iloc[:-1]
"""

[derive.shrink.row.largest]
action = "Select _n_ largest rows according to column _col_"
code = "data.nlargest(n, col)"

[derive.shrink.row.largest.group]
action = "Select _n_ largest rows per group _group_, according to column _col_"
code = "data.groupby('group').apply(lambda x: x.nlargest(n=n, columns='col')).reset_index(drop=True))"

[derive.shrink.row.last.group]
action = "Select last row per group _group_"
code = "data.groupby('group').last().reset_index()"

[derive.shrink.row.max.group]
action = "Select the row per group _group_ with the largest value for column _col_"
code = """
data.loc[data.groupby('group')['col'].idxmax()].reset_index(drop=True)
"""

[derive.shrink.row.remove.list]
action = "Remove rows list _rows_"
code = "data.drop(rows)"
@@ -525,12 +586,21 @@ section = "Combine"
[derive.combine.concat]
action = "Concatenate rows of dataframes"
code = "pd.concat([df, df2, dfN])"
details = "Consider `ignore_index=True`"
details = "Consider `ignore_index=True` argument"

[derive.combine.concat.mix]
action = "Concatenate rows of dataframes, having partially overlapping columns"
code = "?"

[derive.combine.concat.cols]
action = "Concatenate columns of dataframes, assuming equal index, and assuming columns don't overlap"
code = "pd.concat([data, data2], axis=1)"
details = "Unsafe. First make sure indexes are aligned, or output has twice the number of rows"

[derive.combine.concat.cols.safe]
action = "Concatenate columns of dataframes, ignoring the index, and assuming columns don't overlap"
code = "pd.concat([data.reset_index(drop=True), data2.reset_index(drop=True)], axis=1)"

[derive.combine.merge.inner]
action = "Inner join"
code = "data.merge(data2, on=['sex', 'country'])"
Expand Down Expand Up @@ -622,11 +692,13 @@ section = "To file format"

[convert.file.csv]
action = "To CSV file"
code = "data.to_csv('file.csv')"
code = "data.to_csv('file.csv', index=False)"
details = "`index=False` is needed not to polute CSV with a meaningless index"

[convert.file.tsv]
action = "To TSV file"
code = "data.to_table('file.tsv')"
code = "data.to_csv('file.tsv', sep='\t', index=False)"
details = "`index=False` is needed not to polute TSV with a meaningless index"

[convert.file.json]
action = "To JSON file"
49 changes: 46 additions & 3 deletions src/actionsheets/data/python/pandas/pandas.series.toml
@@ -20,9 +20,40 @@ code = "import pandas as pd"
section = "Create"

[create.empty]
section = "Create empty series"

[create.empty.object]
action = "Empty series (of object type)"
code = "pd.Series()"

[create.empty.bool]
action = "Empty boolean series"
code = "pd.Series(dtype=bool)"

[create.empty.category.undefined]
action = "Empty categorical series without defined categories"
code = "pd.Series(dtype='category')"

[create.empty.category]
action = "Empty categorical series with pre-defined categories"
code = "pd.Categorical([], categories=['a', 'b', 'c'])"

[create.empty.int]
action = "Empty int series"
code = "pd.Series(dtype=int)"

[create.empty.float]
action = "Empty float series"
code = "pd.Series(dtype=float)"

[create.empty.date]
action = "Empty datetime series"
code = "pd.Series(dtype='datetime64[ns]')"


[create.constant]
section = "Create series of constant values"

[create.constant.na]
action = "Series filled with NAs of length _n_ (of object type)"
code = "pd.Series([None] * n)"
@@ -31,9 +62,13 @@ code = "pd.Series([None] * n)"
action = "Constant value _v_ of length _n_"
code = "pd.Series(v, index=range(n))"

- [create.list]
- action = "From list"
- code = "pd.Series([1, 2, 3, 7])"

[create.define]
section = "Create series from a list of values"

[create.define.list]
action = "Object series from a generic list of values"
code = "pd.Series([1, None, 'a'])"

[create.define.int]
action = "Int series from a list of integers"
@@ -256,6 +291,14 @@ code = "x.min()"
action = "Greatest value, ignoring NAs"
code = "x.max()"

[extract.min.index]
action = "Index of the smallest value, ignoring NAs"
code = "x.idxmin()"

[extract.max.index]
action = "Index of the greatest value, ignoring NAs"
code = "x.idxmax()"

[extract.value.counts]
action = "Count occurrence per value"
code = "x.value_counts()"
