From 460e1ceb751c773b43837ed600c304712921be68 Mon Sep 17 00:00:00 2001 From: hants Date: Wed, 7 Sep 2022 12:25:25 -0400 Subject: [PATCH] updates --- descriptive/example1/code/descriptive.ipynb | 1 + descriptive/example1/code/requirements.txt | 4 + .../example1/code/researchpy_example.ipynb | 157 +++++++++ descriptive/example1/code/tableone.py | 36 +++ descriptive/example1/data/Altair.csv | 101 ++++++ descriptive/example1/data/Cholesterol.csv | 301 ++++++++++++++++++ descriptive/example1/data/data.csv | 201 ++++++++++++ descriptive/example1/data/test.csv | 13 + descriptive/example1/data/test2.csv | 11 + transformation/dataFiles/homework/cleaning.py | 3 + .../pyScripts/p3_stonyBrookClean.py | 6 +- 11 files changed, 833 insertions(+), 1 deletion(-) create mode 100644 descriptive/example1/code/descriptive.ipynb create mode 100644 descriptive/example1/code/requirements.txt create mode 100644 descriptive/example1/code/researchpy_example.ipynb create mode 100644 descriptive/example1/code/tableone.py create mode 100644 descriptive/example1/data/Altair.csv create mode 100644 descriptive/example1/data/Cholesterol.csv create mode 100644 descriptive/example1/data/data.csv create mode 100644 descriptive/example1/data/test.csv create mode 100644 descriptive/example1/data/test2.csv create mode 100644 transformation/dataFiles/homework/cleaning.py diff --git a/descriptive/example1/code/descriptive.ipynb b/descriptive/example1/code/descriptive.ipynb new file mode 100644 index 0000000..7e49b33 --- /dev/null +++ b/descriptive/example1/code/descriptive.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"colab_type":"text","id":"QvNdV8lVbozM"},"source":["# Descriptive statistics"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"qY6Djzavbya7"},"source":["## Library imports"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"7GMyMAQSbEk3"},"outputs":[],"source":["import pandas as pd\n","from scipy import stats # A 
great statistical module from the scipy library"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"srfntVDKcQa9"},"source":["## Data import"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"runAmOfktIjB"},"source":["### Connecting to Google Drive and importing the data"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"nzpnpiildUPE"},"outputs":[],"source":["df = pd.read_csv('descriptive/example1/data/data.csv') # Import the csv file"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"8yXnhtOOdYya"},"outputs":[],"source":["df # Display the dataframe to the screen"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"AJ5g0VEytPeT"},"source":["### Examining the dataframe object"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"zgNzoKkTnuOW"},"source":["We investigate the data by looking at the number of subjects (rows) and the number of statistical variable (columns)."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"BiEZ6TT2n7E-"},"outputs":[],"source":["df.shape # Using the .shape property (attribute)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"ez7uPFifoS4J"},"source":["We see 200 participants in our study, with data collected for 13 variables."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"zYUsi-LPoA-Z"},"source":["Let's have a look at all the statistical variables."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"_MXiaVe2oEoB"},"outputs":[],"source":["df.columns # Name of each column"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"eaDFW2h8oYna"},"source":["The data represent a small study on cholesterol values before and after taking either a placebo or an active drug."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"jX1RyII3oH6B"},"source":["Finally, we can view 
the data types of each variable (column)."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"AB1FQEiDol5h"},"outputs":[],"source":["df.dtypes # Pandas data type of each column"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"z--RBhkQoztR"},"source":["## Counting"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"j7n5uO0_sKuD"},"source":["### Frequencies"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"Jw4pFfnEpA4B"},"source":["Counting how many times a sample space element of a categorical variable occurs is a good start. In our dataframe object, we have a *Group* variable. Let's first see what the sample space of this variable is. The `unique` method will return an array of the unique elements it finds in a specified column."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"j2NszAdlpKAf"},"outputs":[],"source":["df.Group.unique() # The .unique() method returns the sample space elements of a column (pandas series object)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"WBGb-COXpZU9"},"source":["As mentioned, patients received either a placebo (*Control* group) or an active drug (*Active* group). These two terms are the sample space elements of the nominal categorical variable *Group*."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"sPATSsSGpsiZ"},"source":["We can now count how many times each of these elements appears in the *Group* column, using the `.value_counts()` method. This gives us the **frequency** of each value."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"cEwk4zTVp4RZ"},"outputs":[],"source":["df.Group.value_counts() # Counting the number of times the unique values appear"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"V6toqec3qCiC"},"source":["We see that there are 100 participants in each group. 
We can express the counts as a fraction, called a **relative frequency**. This is done by setting the `normalize=` argument to `True`."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"TqB3E4BRrBJY"},"outputs":[],"source":["df.Group.value_counts(normalize=True) # Expressing the relative frequency"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"VZzl62v3rLqx"},"source":["As expected, we see each element taking up a half of the total number of participants. We can multiply this by 100 to get a percentage."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"sbaxut6-rXPS"},"outputs":[],"source":["df.Group.value_counts(normalize=True) * 100 # Expressing the relative frequency as a percentage"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"mR4CVCdUqKMQ"},"source":["#### Exercise"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"ki2WTaN0qL2h"},"source":["The *Smoke* column indicates whether participants never smoked (*0*), are smokers (*1*), or have smoked before (*2*). Calculate the frequency with which each element appears."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"EbfUpYpJqjax"},"source":["#### Solution"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"eHqEnvAQqlEG"},"outputs":[],"source":["df.Smoke.value_counts()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"Qr7-GrdGri-R"},"outputs":[],"source":["df.Smoke.value_counts(ascending=True, normalize=True) * 100"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"NmbMStMlsToh"},"source":["### Grouped frequencies"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"HkDgMd3jsbbJ"},"source":["We can calculate *combined frequencies*. 
As an example, consider the number of participants in each group of the study that chose each of the five possible values in the survey question."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"oO5VbZdOstNR"},"source":["We can do this with the pandas `crosstab()` function."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"yioZnTziszlK"},"outputs":[],"source":["pd.crosstab(df.Survey, df.Group) # Row and column variable"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"Z_nU-yw8fwqx"},"source":["## Measures of central tendency (point estimates)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"tbNCmuyEtW2h"},"source":["**Measures of central tendency** or **Point estimates** are single values that are representative of a list of continuous numerical values. There are a few that we will discuss here."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"eRFQIdV3f3kQ"},"source":["### Mean (average)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"YI7xbvbAtgeQ"},"source":["The **mean** or the **average** is more properly known as the **arithmetic mean**. It is simply the sum of all the continuous numerical variables divided by the number of values."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"nIuaGkiPtl6R"},"source":["Let's start learning about the information in our data by asking some questions."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"84JxSkA8f7L5"},"source":["- What is the mean age of all the patients?"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"1w_fl954t5XZ"},"source":["A pandas series object has many useful methods that are geared towards summary statistics. 
The `.mean()` method calculates the mean."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"lnW0iZSBde3N"},"outputs":[],"source":["# Using the .mean() method\n","df.Age.mean()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"0dOjaj7WgKj4"},"source":["- What is the mean heart rate of all the patients?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"3TJ1fhNogA56"},"outputs":[],"source":["# Using alternative column (variable) reference\n","df['HR'].mean()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"W2vbqS-Pge9H"},"source":["- What is the mean age of the patients who smoke (indicated as *1* in the *Smoke* column)?"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"MF4YroUquV4a"},"source":["We looked at conditionals in the previous notebook, where we selected only certain rows in a pandas series."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"zMpOcQbBgaOk"},"outputs":[],"source":["# Using a conditional on the Smoke column\n","df[df.Smoke == 1]['Age'].mean()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"1T7Z3timg3h4"},"source":["- What is the mean age of the patients who do not smoke (indicated as *0* in the *Smoke* column)?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"yzFrSWOXguu5"},"outputs":[],"source":["# Using a conditional on the Smoke column\n","df[df.Smoke == 0]['Age'].mean()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"S6h0Ot0MukbA"},"source":["We have learned something useful with this summary statistic. The patients who smoke are quite a bit older than those who do not. Is this a significant difference? What test can we use to discover this? What about the third group, the ex-smokers? 
All will be revealed."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"aG8WcojNhDNu"},"source":["- What are the mean ages of the patients who smoke compared to those who do not smoke?"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"vebMvujIvaBx"},"source":["We can save a lot of time and typing by calculating the age means for all the smoker groups."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"WlwifbqdvDLQ"},"source":["The `.groupby()` method can create groups from the unique elements in a column and then call a method on another column."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"DtqvwNbehB-w"},"outputs":[],"source":["# Use the .groupby() method\n","df.groupby('Smoke')['Age'].mean()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"kTrVr7X9wJyC"},"source":["By the way, the `.mean()` method has some useful arguments. We can use `skipna=True` to skip over any missing values (this is the default behaviour of this method). We can also use `numeric_only=True` if there are data values that were not captured as numbers."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"JKYNO02bwQ_Z"},"source":["### Geometric mean"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"byJiOpjXwS_5"},"source":["The **geometric mean** multiplies all the continuous numerical variables and takes the *n*-th root of that product, where *n* is the number of values. At the beginning of this notebook we imported the stats module from the scipy library. It contains many functions that we will use in the statistical analysis of our data. The `gmean()` function calculates the geometric mean. 
It can take a pandas series as argument."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"aRDDpR2xwrt4"},"outputs":[],"source":["stats.gmean(df.Age)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"7aD4OQQYhS_l"},"source":["### Median"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"B4VdrM95TFq1"},"source":["The mean makes an assumption of the data values and that is that they should be normally distributed. We will learn much more about distributions in the next notebook. For now, we can view the normal distribution as the familiar bell-shaped curve."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"uwAJDTiDTWPO"},"source":["Not all data values for a continuous numerical variable follow a nice bell-shaped curve. We can have quite different *shapes* (distributions) or many outliers (values that are way-off from all the others). In this case, the mean is not a good representative summary of all the data values. Here, we rather use the median."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"mhq-qnyDTo5X"},"source":["The **median** puts all the values in a sorted order. If there are an odd number of values, then the median is the middle value. 
If there are an even number of values, then the mean of the middle two values is taken."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"YtCauitGhjmM"},"source":["- What is the median heart rate of patients older than $50$?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"fNCXvRhLhP2D"},"outputs":[],"source":["# Using the .median() function\n","df[df.Age > 50]['HR'].median()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"iShT33mWxrvt"},"source":["#### Exercise"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"7T8e8tzGx12Q"},"source":["Calculate the median age of the participants who smoke (*1*) and have a heart rate of more than 70."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"m8R_UTK3xwYQ"},"outputs":[],"source":["df.columns"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"QqnAneXFxu1g"},"source":["#### Solution"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"_WeNyUp1yOKY"},"outputs":[],"source":["df.loc[(df.Smoke == 1) & (df.HR > 70), 'Age'].median()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"0LxzmfOrh0OJ"},"source":["### Mode"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"YPQs8H68T-tt"},"source":["The last measure of central tendency that we will take a look at is the mode. The **mode** is used for categorical or discrete data types. It simply returns the value(s) that occurs most commonly. If a single sample space element occurs most commonly, that will be the single mode. Sometimes more than one sample space element shares the spoils. This variable is then bimodal. 
As you might imagine, there are terms such as tri-modal and multi-modal."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"nbHeKQ_yh5CF"},"source":["- What is the mode of the smoking variable?"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"1BGjsmonyvAZ"},"source":["We use the `.value_counts()` method to calculate the frequency. The `ascending=` argument is set to `False` by default and the `sort=` is set to `True`, such that we get the mode at the top."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"F2SOtFbvhrFt"},"outputs":[],"source":["df.Smoke.value_counts()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"ZSx4BfkLiTbK"},"source":["## Measures of dispersion (spread)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"xAdT_qaXVRad"},"source":["**Measures of dispersion** give us an indication of how spread out our data is."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"d0Xj_HL-iYPe"},"source":["### Standard deviation and variance"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"PfWzikN1Wl4N"},"source":["The **standard deviation** can be understood as the average difference between each continuous numerical data value and the mean of that variable. Difference infers subtraction. As some values will be larger than the mean and some smaller, subtraction from the mean will lead to positive numbers and negative numbers. In fact, from the way we calculate the mean, if we sum up all these differences (so as to calculate a mean difference), we will get 0. To mitigate this, we square all of the differences. Squaring (multiplying by itself) returns positive values. Now we can sum all these values and divide by the number of values. This gives us the **variance**."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"8c_8KrJEXfb3"},"source":["Variances are very useful in statistics. 
We need to express the spread in the same units as our variable for it to make sense as a summary statistics. The *age* variable had a unit of years. What then, is a $\\text{years}^2$. Instead, we take the square root of the variance to get the standard deviation, now expressed in the same units as the variable and a true measure of the average difference between all the values and the mean."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"e3GqwhglYCi3"},"source":["The `.std()` method returns the standard deviation of a series object and the `.var()` method returns the variance."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"XTma-V9silDy"},"source":["- What is the standard deviation of the age of patients who smoke vs those who do not smoke?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"dn9I-qw9iOP0"},"outputs":[],"source":["# Group by the Smoke column\n","df.groupby('Smoke')['Age'].std()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"l4TdG6qui8X8"},"source":["- What is the variance of the age of patients who smoke vs those who do not smoke?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"8SWKWbwEiyn9"},"outputs":[],"source":["df.groupby('Smoke')['Age'].var()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"plDKTmT9jRcy"},"source":["### Range"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"idtxsoxhVeum"},"source":["The **range** is the difference between the minimum and the maximum value of a continuous numerical variable. The `min()` and the `max()` methods for series objects give these values. 
Let's see then how old our youngest and oldest participants are."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"Pa41MN06jUAy"},"source":["- What is the minimum age of all the participants?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"O_7EvljzjIHJ"},"outputs":[],"source":["# Using the .min() function\n","df.Age.min()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"WRrZ812Sjdto"},"source":["- What is the maximum age of all the participants?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"rP9QmY39jYGJ"},"outputs":[],"source":["# Using the .max() function\n","df.Age.max()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"7CNM2qCnjnAq"},"source":["- What is the range in the age of all the participants?\n"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"dpxIwNoy0FUB"},"source":["We simply subtract the minimum value from the maximum value."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"h1UFkaz1jlP6"},"outputs":[],"source":["# Difference between maximum and minimum ages\n","df.Age.max() - df.Age.min()"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"XSubh4N1j1Uh"},"source":["### Quantiles"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"GdwRlW3-Zjq0"},"source":["Just as we divided our continuous numerical variables up into two halves for the median, so we can divide them up into quarters. In fact, we can divide it up at any percentage level from 0% to 100% (fraction of 0.0 to 1.0). Here 0% would be the minimum value and 100% would be the maximum value. Dividing the values up into these bins gives us **quantiles**."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"5tIKnoESZ8Vu"},"source":["We can divide the values up into four bins with three values. 
These values are the **quartiles**."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"J7MmotgG1jrw"},"source":["The lowest of these three values (the **first quartile**), divide the data into two parts, with a quarter being lower than that value and three-quarters being higher. The second divide the data values equally (the median or **second quartile**). The third is a value that has three-quarters of the values less than and a quarter more than it (the **third quartile**)."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"fC36pjCHkCvx"},"source":["- What are the quartile values for the age of all the patients?"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"BmjgyJwrape4"},"source":["The `.quantile()` method allows us to choose, as a fraction, any of these cut-off values. For the quartiles, we create a list `[0.25, 0.5, 0.75]`."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"U9iW7ZNsjvht"},"outputs":[],"source":["# Specifying the quartiles as fractions\n","df.Age.quantile([0.25, 0.5, 0.75])"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"YPltHdznkZ83"},"source":["- What is the $95$th percentile values in age of the patients who smoke vs those that do not?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"xN5sCHO-kRJk"},"outputs":[],"source":["df.groupby('Smoke')['Age'].quantile(0.95)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"PtyTBD7a1BFw"},"source":["The **interquartile range** is the difference between the third and the first quartile."]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"DXoM5qY-k4ll"},"source":["- What is the interquartile range of the age of all the patients?"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"i1gNieQ4krVW"},"outputs":[],"source":["df.Age.quantile(0.75) - 
df.Age.quantile(0.25)"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"A3_x8T-w1MAt"},"source":["## Conclusion"]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"T7MeslD41O96"},"source":["We now know a lot more about our data. Be encouraged to learn even more by asking some question about this mock study and see if you can calculate the required value."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{},"colab_type":"code","id":"eLPUPr11Qccg"},"outputs":[],"source":[]}],"metadata":{"colab":{"collapsed_sections":["JWZAUQEhbuHb","qY6Djzavbya7","srfntVDKcQa9","runAmOfktIjB","AJ5g0VEytPeT","z--RBhkQoztR","j7n5uO0_sKuD","mR4CVCdUqKMQ","EbfUpYpJqjax","NmbMStMlsToh","Z_nU-yw8fwqx","eRFQIdV3f3kQ","JKYNO02bwQ_Z","7aD4OQQYhS_l","iShT33mWxrvt","QqnAneXFxu1g","0LxzmfOrh0OJ","ZSx4BfkLiTbK","d0Xj_HL-iYPe","plDKTmT9jRcy","XSubh4N1j1Uh","A3_x8T-w1MAt"],"name":"04 Descriptive statistics.ipynb","provenance":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3.6.4 64-bit ('3.6.4')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.4"},"vscode":{"interpreter":{"hash":"f691c77f2932b50764e6cbe55ba6da9aefcdc0d1195f88baaf9e9338c7c6b4c0"}}},"nbformat":4,"nbformat_minor":0} diff --git a/descriptive/example1/code/requirements.txt b/descriptive/example1/code/requirements.txt new file mode 100644 index 0000000..11ef38f --- /dev/null +++ b/descriptive/example1/code/requirements.txt @@ -0,0 +1,4 @@ +pandas +scipy +tableone +researchpy diff --git a/descriptive/example1/code/researchpy_example.ipynb b/descriptive/example1/code/researchpy_example.ipynb new file mode 100644 index 0000000..3df6458 --- /dev/null +++ b/descriptive/example1/code/researchpy_example.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Yuib-1s1YRlt", + "outputId": "687cb64c-995a-43b1-ed47-185cb32e2008" + }, + "outputs": [], + "source": [ + "!pip install researchpy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tz1qKEMxYTCT" + }, + "outputs": [], + "source": [ + "import researchpy as rp\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "NHJ4xg5DYXZx", + "outputId": "03263426-2e84-4445-f952-8d10fbdff94e" + }, + "outputs": [], + "source": [ + "df = pd.read_csv('./data.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2HyFY1tKgwDk", + "outputId": "f3c62ea7-52e8-463c-cf34-f1febaf833c0" + }, + "outputs": [], + "source": [ + "rp.codebook(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IBby_YNohrHl", + "outputId": "da650c24-9910-47e5-c8f4-7b0c65bbb7e7" + }, + "outputs": [], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 179 + }, + "id": "fF8x6Hvdh5Tr", + "outputId": "34642f22-9b01-46fc-8940-4e7f32d09334" + }, + "outputs": [], + "source": [ + "## example of getting descriptives for single or group of continuous variables\n", + "\n", + "rp.summary_cont(df[['Age', 'HR', 'sBP']])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "7dUKqxsOiXkQ", + "outputId": "f4b7fa5e-473f-4407-b749-975623aae1e0" + }, + "outputs": [], + "source": [ + "rp.summary_cat(df[['Group', 'Smoke']])" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g3kA2jJoijzO", + "outputId": "f8556125-835f-43c4-b9e6-6f4878fc450d" + }, + "outputs": [], + "source": [ + "df['Group'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sYipIGGximTA", + "outputId": "dc23d990-1a4d-45ac-b53c-79d743334215" + }, + "outputs": [], + "source": [ + "df['Smoke'].value_counts()" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/descriptive/example1/code/tableone.py b/descriptive/example1/code/tableone.py new file mode 100644 index 0000000..b7e0277 --- /dev/null +++ b/descriptive/example1/code/tableone.py @@ -0,0 +1,36 @@ +import pandas as pd +from tableone import TableOne, load_dataset + +##### DATASET 1 ##### +example_data = load_dataset('pn2012') +# # littlerecode death where 0 is alive and 1 is dead +# example_data['death'] = example_data['death'].replace(0, 'alive') +example_data.dtypes +example_data_columns = ['Age', 'SysABP', 'Height', 'Weight', 'ICU', 'death'] +example_data_categorical = ['ICU', 'death'] +example_data_groupby = ['death'] +example_data_labels={'death': 'mortality'} +exampleTab1 = TableOne(example_data, columns=example_data_columns, + categorical=example_data_categorical, groupby=example_data_groupby, + rename=example_data_labels, pval=False) +exampleTab1 +print(exampleTab1.tabulate(tablefmt = "fancy_grid")) +exampleTab1.to_csv('descriptive/example1/data/test.csv') + + + +##### DATASET 2 ##### +my_data = pd.read_csv('descriptive/example1/data/data.csv') +df2 = my_data.copy() +df2.dtypes +list(df2) +df2.head(5) +df2['Smoke'] +df2_columns = ['Age', 'HR', 'Group', 'sBP', 'Smoke'] 
+df2_categories = ['Smoke', 'Group'] +df2_groupby = ['Smoke'] +# df2['Vocation'].value_counts() +df2_table1 = TableOne(df2, columns=df2_columns, + categorical=df2_categories, groupby=df2_groupby, pval=False) +print(df2_table1.tabulate(tablefmt = "fancy_grid")) +df2_table1.to_csv('descriptive/example1/data/test2.csv') \ No newline at end of file diff --git a/descriptive/example1/data/Altair.csv b/descriptive/example1/data/Altair.csv new file mode 100644 index 0000000..aa009e0 --- /dev/null +++ b/descriptive/example1/data/Altair.csv @@ -0,0 +1,101 @@ +SampleID,Type,Grade,MeasureA,MeasureB,MeasureC +1,I,2,25,31.3,110.1891467 +2,II,4,22,23.8,99.61951223 +3,I,3,22,27.6,87.55160191 +4,II,2,28,33.2,85.56734707 +5,II,4,30,31.8,110.2700532 +6,I,3,26,26.4,99.84167892 +7,I,2,26,33.7,76.78406754 +8,II,3,28,35.8,92.68980598 +9,I,4,23,25,105.3292932 +10,II,2,25,26.9,102.7667628 +11,II,4,26,32.8,103.1580788 +12,I,3,22,28.9,99.63755467 +13,I,2,28,36.5,93.36849383 +14,II,4,24,27.3,104.725783 +15,I,3,22,27.9,87.67951762 +16,II,2,23,25.4,93.92992221 +17,II,3,24,28.5,102.3853207 +18,I,4,27,36.7,100.0448647 +19,I,2,21,29.5,96.14677609 +20,II,4,25,25.5,90.0584909 +21,I,3,26,33.2,106.2905857 +22,II,2,20,26.7,110.9012447 +23,II,4,24,29,102.9123925 +24,I,3,26,34.6,109.5989193 +25,I,2,20,25.6,120.3092205 +26,II,3,29,30.7,80.76294849 +27,I,4,25,27.4,90.33607983 +28,II,2,23,25.1,111.3001142 +29,II,4,22,30.4,111.3810443 +30,I,3,29,36.5,113.3473012 +31,I,2,21,27,99.29337329 +32,II,4,30,30.3,102.409397 +33,I,3,21,24.9,94.59506272 +34,II,2,23,29.5,105.072395 +35,II,3,27,29.4,95.22285424 +36,I,4,28,31.9,115.8428138 +37,I,2,29,33.8,93.20039257 +38,II,4,26,34.5,91.80364222 +39,I,3,28,28.7,105.1201517 +40,II,2,20,20.9,89.95307887 +41,II,4,25,29.4,100.082686 +42,I,3,20,23.8,105.1388602 +43,I,2,25,27.7,96.57069437 +44,II,3,29,31,89.02825048 +45,I,4,27,28.1,115.9905033 +46,II,2,24,33.1,109.6808062 +47,II,4,25,25.5,97.9233459 +48,I,3,23,27.4,91.43744805 +49,I,2,24,29.4,89.17425205 
+50,II,4,23,30.9,104.5168731 +51,I,3,23,31.3,120.8590099 +52,II,2,26,34.5,103.1805883 +53,II,3,20,22.7,92.01241296 +54,I,4,26,33.8,98.24382294 +55,I,2,28,37.9,112.6216806 +56,II,4,30,32.5,114.5057637 +57,I,3,29,32.1,85.65551755 +58,II,2,21,22.6,99.92775192 +59,II,4,20,29.4,101.7370707 +60,I,3,29,32.3,104.6963938 +61,I,2,30,35.1,90.68274837 +62,II,3,22,28.6,104.945687 +63,I,4,27,35.2,100.5862524 +64,II,2,22,28.3,101.9664154 +65,II,4,30,37.5,117.9238576 +66,I,3,24,32.7,99.11348868 +67,I,2,28,34.6,87.66866232 +68,II,4,22,29.5,103.8499839 +69,I,3,27,37,105.52171 +70,II,2,28,29.2,98.54170359 +71,II,3,23,24.1,102.9014881 +72,I,4,30,33.6,96.67562447 +73,I,2,27,32,103.6666286 +74,II,4,23,27.4,96.26970859 +75,I,3,25,33.6,97.88947741 +76,II,2,26,33.4,89.36824634 +77,II,4,27,29.8,79.41290868 +78,I,3,25,31.9,111.6602743 +79,I,2,23,30.5,118.6330966 +80,II,3,28,37.8,102.7833759 +81,I,4,27,29.1,105.0360624 +82,II,2,28,34.2,92.0825595 +83,II,4,29,35.9,95.7516333 +84,I,3,23,26.5,92.06924539 +85,I,2,20,23.1,86.0933612 +86,II,4,25,25.5,112.4517925 +87,I,3,22,25,104.4103507 +88,II,2,29,34.3,98.8985973 +89,II,3,23,26.1,96.47799202 +90,I,4,24,30.4,101.1108615 +91,I,2,27,33.4,98.05828612 +92,II,4,24,25.9,112.3727757 +93,I,3,24,30.1,91.94633635 +94,II,2,21,28.5,115.3320592 +95,II,4,29,29.2,96.69564868 +96,I,3,21,25,103.2237425 +97,I,2,26,26.4,101.1300201 +98,II,3,21,24,117.7524212 +99,I,4,30,36.3,101.5227999 +100,II,2,23,29.3,80.16592453 diff --git a/descriptive/example1/data/Cholesterol.csv b/descriptive/example1/data/Cholesterol.csv new file mode 100644 index 0000000..c7208ac --- /dev/null +++ b/descriptive/example1/data/Cholesterol.csv @@ -0,0 +1,301 @@ +Group,Delta +Active,0.8 +Active,0.9 +Control,1.7 +Active,-1.1 +Control,1.7 +Control,-0.8 +Active,-0.5 +Control,0.8 +Active,0.3 +Active,1.4 +Control,-0.2 +Active,-2.3 +Control,0.3 +Control,-1.3 +Active,0.3 +Control,-1.6 +Active,-0.4 +Active,-0.7 +Control,-1.5 +Active,0.4 +Control,0.7 +Control,-0.8 +Active,-0.6 +Control,-0.7 +Active,0.1 
+Active,0.5 +Control,1.9 +Active,0.2 +Control,-0.3 +Control,-0.4 +Active,0.8 +Control,-0.6 +Active,-1.3 +Active,-0.7 +Control,-0.1 +Active,1.1 +Control,-1.1 +Control,-0.5 +Active,-0.6 +Control,0.3 +Active,-0.9 +Active,-1 +Control,0 +Active,0.9 +Control,-0.3 +Control,0.2 +Active,-0.4 +Control,-0.3 +Active,-0.3 +Active,0.6 +Control,0.7 +Active,-0.9 +Control,-0.1 +Control,-0.8 +Active,-0.8 +Control,0.1 +Active,0.3 +Active,-1 +Control,-1.4 +Active,-0.8 +Control,1.2 +Control,1.7 +Active,-0.2 +Control,-1.4 +Active,1.8 +Active,0.9 +Control,0.8 +Active,-1.9 +Control,-0.7 +Control,-1.4 +Active,-0.2 +Control,-0.5 +Active,-1.3 +Active,0.1 +Control,0.4 +Active,-1.1 +Control,0.1 +Control,-0.2 +Active,-1.1 +Control,-1 +Active,1.4 +Active,1.5 +Control,0.2 +Active,-3.7 +Control,0.1 +Control,0.5 +Active,0.1 +Control,0 +Active,-1.5 +Active,-0.7 +Control,1.1 +Active,2 +Control,0 +Control,1 +Active,0.6 +Control,-1.5 +Active,-0.7 +Active,0 +Control,-0.8 +Active,1.1 +Control,0.3 +Control,-0.9 +Active,-0.6 +Control,0.3 +Active,1.5 +Active,0.9 +Control,1.4 +Active,1.2 +Control,0.3 +Control,-2.1 +Active,-0.6 +Control,0 +Active,-1.3 +Active,0.1 +Control,1.5 +Active,0.1 +Control,1.3 +Control,1.1 +Active,-0.4 +Control,1.3 +Active,0.3 +Active,0.2 +Control,1.1 +Active,1.6 +Control,0.8 +Control,1.5 +Active,0 +Control,-0.5 +Active,-0.9 +Active,-0.5 +Control,-0.1 +Active,-1.7 +Control,-0.3 +Control,0.9 +Active,1.9 +Control,2.1 +Active,0.6 +Active,-0.5 +Control,-2.1 +Active,0.8 +Control,-0.8 +Control,0 +Active,1.8 +Control,-0.5 +Active,0.3 +Active,-0.5 +Control,0.5 +Active,0.1 +Control,0.4 +Control,1.1 +Active,0 +Control,1.2 +Active,2 +Active,-2.2 +Control,0.5 +Active,0 +Control,0.3 +Control,-0.3 +Active,-0.7 +Control,-1.2 +Active,-1.3 +Active,-1 +Control,0.7 +Active,1 +Control,0 +Control,-0.6 +Active,1.4 +Control,0.7 +Active,0.3 +Active,-0.2 +Control,-1.4 +Active,0.7 +Control,1.1 +Control,1.2 +Active,0.6 +Control,0.7 +Active,0.4 +Active,-0.8 +Control,-0.8 +Active,-0.1 +Control,1.5 +Control,-0.9 
+Active,-0.2 +Control,-0.5 +Active,-0.9 +Active,-0.7 +Control,2.2 +Active,0.8 +Control,1.6 +Control,1.4 +Active,-0.7 +Control,0.5 +Active,-1.4 +Active,-0.4 +Control,-3.1 +Active,0.3 +Control,0.9 +Control,1.3 +Active,-0.5 +Control,-0.3 +Active,-2.5 +Active,-0.3 +Control,0.8 +Active,-2.3 +Control,-0.6 +Control,-0.8 +Active,-0.1 +Control,0.4 +Active,0 +Active,-0.5 +Control,0.6 +Active,-1.6 +Control,2.6 +Control,1.7 +Active,-0.1 +Control,1.2 +Active,-1.5 +Active,0.6 +Control,-0.6 +Active,0.8 +Control,-0.6 +Control,1.3 +Active,0.3 +Control,-0.7 +Active,0.3 +Active,0.9 +Control,1 +Active,-0.9 +Control,-1.5 +Control,1.5 +Active,0.2 +Control,-0.5 +Active,0.6 +Active,-0.6 +Control,-0.2 +Active,-0.9 +Control,0.5 +Control,0.2 +Active,0.5 +Control,-0.4 +Active,0.5 +Active,-0.5 +Control,-2 +Active,-0.3 +Control,0.1 +Control,-1.3 +Active,-0.3 +Control,0.8 +Active,0.6 +Active,-1.9 +Control,-3.3 +Active,1.1 +Control,-1 +Control,3.2 +Active,0.7 +Control,0.8 +Active,-2.2 +Active,0.4 +Control,-1.2 +Active,0.4 +Control,0.3 +Control,-0.2 +Active,-0.8 +Control,0.2 +Active,0.1 +Active,-0.4 +Control,1.1 +Active,-0.6 +Control,0.9 +Control,-0.4 +Active,1.9 +Control,-1.2 +Active,-0.8 +Active,1.4 +Control,1.4 +Active,1.1 +Control,1.5 +Control,0.3 +Active,0 +Control,-1.1 +Active,0.1 +Active,0.5 +Control,-1.4 +Active,0.5 +Control,-0.3 +Control,0.2 +Active,-0.8 +Control,-1.7 +Active,0.3 +Active,-0.8 +Control,-0.9 +Active,-0.7 +Control,0 +Control,-1.6 +Active,1.2 +Control,1 +Active,0.4 +Active,1.9 +Control,0 +Active,-0.5 diff --git a/descriptive/example1/data/data.csv b/descriptive/example1/data/data.csv new file mode 100644 index 0000000..51224b3 --- /dev/null +++ b/descriptive/example1/data/data.csv @@ -0,0 +1,201 @@ +"Name","DOB","Age","Vocation","Smoke","HR","sBP","CholesterolBefore","TAG","Survey","CholesterolAfter","Delta","Group" +"Dylan Patton",1981-10-07,43,"Energy manager",0,47,145,1.2,1.2,1,0.7,0.5,"Active" +"Sandra Howard",1993-01-27,53,"Tax adviser",0,51,115,1.2,0.6,3,1,0.2,"Active" 
+"Samantha Williams",1973-12-21,33,"IT consultant",0,54,120,2,1.3,3,1.7,0.3,"Active" +"Ashley Hensley",1981-12-01,43,"Nurse, children's",0,54,103,2.1,1.6,4,2.1,0,"Active" +"Robert Wilson",1964-06-23,46,"Clinical embryologist",0,61,138,2.8,2.1,5,2.8,0,"Active" +"Leslie Diaz",1994-08-25,48,"Politician's assistant",0,59,122,2.8,1.4,4,2.6,0.2,"Active" +"Frank Zimmerman",1981-03-04,54,"Police officer",0,60,129,2.9,2.4,1,2.6,0.3,"Active" +"Aaron Harris",1948-01-10,58,"Nurse, children's",0,61,131,3.1,2.2,1,2.9,0.2,"Active" +"William Smith",1998-11-20,44,"Scientific laboratory technician",0,58,111,3.1,2.4,1,2.8,0.3,"Control" +"Andrea Fletcher",1955-12-23,31,"Lexicographer",0,59,122,3.2,1.7,5,2.8,0.4,"Active" +"James Wells",1998-08-09,45,"Charity fundraiser",0,62,121,3.2,1.7,4,2.7,0.5,"Active" +"Lisa Perez",1983-09-05,35,"Chief Marketing Officer",0,61,119,3.3,2,3,2.9,0.4,"Active" +"Desiree Sandoval",1981-11-08,49,"Hydrologist",0,61,135,3.3,2.3,5,3.1,0.2,"Active" +"Karen Knight",1941-06-14,56,"Advertising account planner",0,62,134,3.4,2.1,2,3.3,0.1,"Control" +"Evan Stewart",1934-12-22,57,"Web designer",0,75,162,3.5,2.4,3,3.4,0.1,"Active" +"Mark Johnson",1958-11-22,38,"Lobbyist",2,62,140,3.5,2.6,1,2.9,0.6,"Active" +"Roger Chen",1985-11-19,35,"Photographer",0,59,121,3.5,2.3,2,3.4,0.1,"Control" +"Dennis Sanchez",1945-10-25,50,"Chartered loss adjuster",0,63,129,3.5,2.5,1,3.5,0,"Active" +"Eric Hendrix",1969-05-04,45,"Community education officer",2,61,138,3.6,2.3,1,3.2,0.4,"Control" +"Kathleen Burnett",1970-03-23,49,"Occupational hygienist",0,64,144,3.7,2.2,1,3.7,0,"Active" +"Mr. 
Bradley Bailey",1990-09-15,63,"Conservation officer, nature",0,64,136,3.7,2,4,3.6,0.1,"Active" +"Lacey Wilcox",1998-02-24,45,"Chemist, analytical",0,60,115,3.7,2.1,5,3.4,0.3,"Control" +"James Aguilar",2000-11-05,51,"Immunologist",0,64,134,3.7,2,1,3.5,0.2,"Control" +"Michael Banks",1977-08-19,43,"Exhibition designer",2,62,132,3.7,2.6,3,3.6,0.1,"Control" +"Clifford Williams",1957-05-04,31,"Special effects artist",2,60,134,3.7,2.5,4,3.5,0.2,"Control" +"Danny Smith",1941-01-09,58,"Broadcast presenter",2,65,136,3.8,2.1,2,3.3,0.5,"Active" +"Lori Herrera",1960-01-17,40,"Psychologist, occupational",0,63,122,3.8,2.7,1,3.5,0.3,"Active" +"Melinda Ingram",1944-07-07,47,"Midwife",0,65,130,3.9,2.8,3,3.6,0.3,"Active" +"Sonya Hale",1945-06-14,45,"Meteorologist",2,67,154,3.9,2.6,3,3.5,0.4,"Active" +"Lisa Patrick",1958-11-02,41,"Sports therapist",0,65,145,3.9,2.2,4,3.5,0.4,"Active" +"Brittany Valenzuela",1992-08-02,47,"Therapist, horticultural",0,59,132,3.9,2.6,2,3.5,0.4,"Control" +"David Kane",1981-08-11,38,"Mining engineer",0,63,143,4,2.2,5,3.8,0.2,"Control" +"Douglas Hardin",1971-06-01,54,"Estate agent",0,63,128,4,2.3,3,3.8,0.2,"Control" +"Kyle Boyd",1959-12-30,30,"Waste management officer",0,63,133,4,2.5,5,3.8,0.2,"Control" +"Mr. 
Tyler Strickland DDS",1940-08-27,46,"Tourist information centre manager",0,62,136,4.1,2.3,2,3.8,0.3,"Control" +"William Evans",1957-08-04,64,"Lecturer, higher education",0,61,127,4.2,2.2,5,3.9,0.3,"Control" +"Darryl Howell",1996-09-13,41,"Research scientist (physical sciences)",1,65,140,4.2,3.1,5,4.1,0.100000000000001,"Active" +"Jeremiah Carter",1954-02-08,40,"Technical brewer",0,64,139,4.2,3.1,4,4.1,0.100000000000001,"Control" +"Derek Long",1997-06-22,45,"Minerals surveyor",0,65,128,4.2,2.6,3,4.1,0.100000000000001,"Control" +"Derrick Lopez",1934-01-02,65,"Commercial/residential surveyor",0,66,128,4.2,2.3,5,3.9,0.3,"Active" +"Laurie Clay",1981-06-12,74,"Lecturer, further education",2,67,136,4.2,3,2,3.9,0.3,"Active" +"Joseph Price",1938-02-21,55,"Programmer, systems",0,54,104,4.3,3.1,2,4,0.3,"Control" +"Tammy Schwartz",1975-08-11,58,"Insurance claims handler",2,69,133,4.3,3,2,4,0.3,"Active" +"Joshua Avila",1978-01-19,53,"Media planner",0,64,129,4.3,2.5,2,4.1,0.2,"Control" +"Kenneth Bowman",1934-07-13,68,"Magazine features editor",1,68,135,4.3,3.1,1,4.2,0.1,"Active" +"Blake Fritz",1952-08-07,54,"Research scientist (medical)",0,62,133,4.3,2.5,3,4.2,0.1,"Control" +"Brian Olson",1958-12-14,72,"Arts administrator",0,67,132,4.3,3.1,1,3.9,0.4,"Active" +"Michael White",1954-11-24,32,"Museum/gallery exhibitions officer",0,63,137,4.3,3,5,4,0.3,"Control" +"Matthew Sellers",1957-09-17,38,"Scientist, research (physical sciences)",0,65,132,4.3,3.1,5,4.2,0.1,"Control" +"Matthew Brown",1939-09-06,59,"Journalist, magazine",0,63,137,4.3,2.2,2,4.1,0.2,"Control" +"Joshua Lewis",1985-09-20,53,"Professor Emeritus",0,66,140,4.4,2.4,3,4,0.4,"Control" +"Kayla Mendoza",1979-08-05,69,"Buyer, industrial",1,66,133,4.4,2.8,5,4.1,0.300000000000001,"Control" +"Kathy Padilla",1983-01-03,42,"Journalist, newspaper",0,68,139,4.4,2.8,5,3.9,0.5,"Active" +"Christopher Abbott",1963-06-12,67,"Ergonomist",2,66,128,4.4,2.6,2,4.2,0.2,"Control" +"Stephanie Jacobs",1977-06-18,38,"Estate manager/land 
agent",0,69,139,4.4,2.7,2,4,0.4,"Active" +"Juan Johnson",1956-12-09,51,"Logistics and distribution manager",0,65,141,4.5,2.9,5,4,0.5,"Control" +"Craig King",2001-03-06,37,"Scientist, audiological",0,67,135,4.5,3.2,5,4.3,0.2,"Control" +"Tracy Palmer",1950-10-08,36,"Counselling psychologist",0,65,148,4.5,3.2,5,3.8,0.7,"Control" +"John Boyle",1979-07-30,34,"Pharmacist, community",1,66,148,4.5,3.2,2,4.4,0.1,"Control" +"Jeremy Dennis",1983-02-03,38,"Midwife",1,69,158,4.5,2.7,5,4.4,0.1,"Active" +"Janet Young",1981-03-18,31,"Aeronautical engineer",1,65,133,4.5,2.3,3,4.1,0.4,"Control" +"Mr. James Rowe",1995-08-10,48,"Advertising copywriter",0,64,144,4.6,2.6,2,4,0.6,"Control" +"Scott Sherman",1965-02-26,49,"Landscape architect",0,68,145,4.6,2.5,2,4.4,0.199999999999999,"Active" +"Stephanie Miles",1942-04-25,62,"Psychologist, sport and exercise",0,66,150,4.6,2.6,2,4.8,-0.2,"Control" +"Katrina Gilbert",1964-06-08,39,"Surveyor, hydrographic",0,68,133,4.6,2.9,5,4.4,0.199999999999999,"Control" +"Joseph Smith",1981-11-17,74,"Secretary, company",0,66,147,4.6,3.3,1,4.6,0,"Control" +"Jeremy Ellis",1986-07-26,41,"Financial controller",1,68,132,4.6,3.2,4,4.3,0.3,"Active" +"Linda Kennedy",1993-10-24,42,"Teacher, secondary school",0,65,136,4.7,3.1,5,4.1,0.600000000000001,"Control" +"Mandy Garcia",1985-05-31,60,"Cabin crew",2,71,154,4.7,2.4,4,4.5,0.2,"Active" +"Maria Ruiz",1999-01-14,67,"Cartographer",1,65,149,4.7,3.3,1,4.7,0,"Active" +"Gary Pineda",1960-02-16,42,"Scientist, research (physical sciences)",1,66,132,4.7,2.4,2,4.7,0,"Control" +"Tina Martinez",1941-05-31,74,"Passenger transport manager",1,69,146,4.8,3.1,4,4.5,0.3,"Active" +"Denise Long",1945-03-13,52,"Graphic designer",1,83,168,4.9,2.7,2,4.4,0.5,"Control" +"Daniel White",1975-03-23,37,"Brewing technologist",1,65,136,4.9,2.5,1,4.5,0.4,"Control" +"Kyle Moore",1963-12-26,61,"Industrial buyer",1,65,141,4.9,2.8,1,4.4,0.5,"Active" +"Diana Miller",1958-06-29,54,"Designer, textile",1,68,142,4.9,3,5,4.7,0.2,"Control" +"Steven 
Hunt",1967-11-25,72,"Nature conservation officer",1,69,158,4.9,2.9,2,4.4,0.5,"Control" +"Haley Tucker",1999-11-11,36,"Set designer",1,69,158,4.9,3.2,4,4.8,0.100000000000001,"Active" +"John Estrada",2000-11-13,49,"Information officer",1,68,137,4.9,3,3,4.5,0.4,"Control" +"Scott Ramsey",1951-07-03,47,"Product designer",1,67,138,5,3,5,4.6,0.4,"Control" +"Cynthia Mercado",1942-07-23,49,"Contractor",2,65,126,5,3.1,5,4.7,0.3,"Control" +"Marissa Anderson PhD",1940-05-15,49,"Nurse, learning disability",1,75,170,5,2.7,3,4.7,0.3,"Active" +"Justin Bennett",1963-04-04,69,"Insurance broker",1,66,142,5,3,4,4.6,0.4,"Control" +"Laura Mcdonald",1981-04-29,43,"Hydrographic surveyor",1,66,127,5,3,3,4.5,0.5,"Control" +"Thomas Cole",1944-08-07,38,"Corporate investment banker",1,72,141,5,2.8,2,4.7,0.3,"Active" +"Levi Lopez",1980-09-24,60,"Bookseller",1,71,159,5,3,5,4.9,0.1,"Active" +"Jeffrey Washington",1994-12-04,45,"Waste management officer",1,67,154,5.1,2.8,2,4.7,0.4,"Control" +"Christopher Carter",1966-12-07,71,"Lawyer",2,67,151,5.1,3.2,1,5,0.1,"Control" +"Daniel Miller",1975-09-08,65,"Surveyor, mining",1,70,143,5.1,3.5,5,5,0.1,"Control" +"Tara Powell",1996-06-29,73,"Engineer, mining",1,69,137,5.2,2.7,2,5.2,0,"Control" +"Richard Pierce",1966-12-16,60,"Dealer",2,69,147,5.2,2.9,5,5,0.2,"Control" +"Dylan Dixon",1965-03-10,70,"Psychotherapist, child",1,73,158,5.3,3.5,3,4.9,0.4,"Active" +"John Dawson",1979-06-14,45,"Computer games developer",1,73,158,5.3,3.6,5,5.3,0,"Control" +"Shari Wagner",2001-08-29,49,"Paramedic",2,68,149,5.3,3.6,2,5,0.3,"Control" +"Sharon Cain",1981-06-15,65,"Clinical biochemist",1,72,139,5.5,2.8,1,5.5,0,"Control" +"Heidi Hernandez",1995-12-23,53,"Forensic psychologist",1,77,167,5.5,3.3,5,5.3,0.2,"Active" +"Brandi Ibarra",1973-11-01,30,"Communications engineer",1,72,159,5.5,3.7,1,5.3,0.2,"Control" +"Jason Williams",1944-12-22,58,"Herpetologist",1,71,148,5.6,3.4,4,5.3,0.3,"Control" +"Robert Pruitt",1966-07-15,50,"Midwife",1,75,150,5.6,3.3,1,5.3,0.3,"Active" +"Brittany 
Richardson",1947-11-23,69,"Engineer, manufacturing",1,72,146,5.7,3.3,2,5.3,0.4,"Control" +"Kristina Zimmerman",1994-01-01,72,"Agricultural consultant",0,70,157,5.7,3.6,1,5.3,0.4,"Control" +"Jennifer Key",1995-12-13,61,"Engineer, energy",0,70,151,5.7,3.5,2,5.6,0.100000000000001,"Control" +"Christine King",1983-07-27,33,"Community pharmacist",0,78,151,5.9,3.8,4,5.6,0.300000000000001,"Active" +"Mary Rodriguez",2001-07-07,30,"Music tutor",0,74,168,5.9,3.4,4,5.6,0.300000000000001,"Control" +"Austin Fuller",1965-02-17,44,"Trading standards officer",0,75,165,6,4,5,5.9,0.1,"Control" +"Lauren Ramos",1970-09-21,71,"Statistician",0,77,160,6,3.7,2,5.8,0.2,"Active" +"James Huffman",1941-05-05,49,"Manufacturing systems engineer",0,75,157,6,3.9,4,5.8,0.2,"Control" +"Kelly Rivera",1939-07-05,75,"Call centre manager",0,78,170,6,3.2,4,5.9,0.1,"Active" +"Charles Torres",1998-10-17,43,"Control and instrumentation engineer",0,73,146,6.1,3.1,2,5.5,0.6,"Control" +"Todd Rice",1951-04-02,35,"Medical sales representative",2,75,165,6.2,3.5,2,5.9,0.3,"Control" +"Ashley Fischer",1968-01-16,55,"Trade union research officer",0,78,164,6.2,4,1,5.9,0.3,"Active" +"Craig Raymond",1945-03-18,38,"Engineer, energy",0,73,162,6.2,3.2,2,5.9,0.3,"Control" +"Gregory Avila",1956-09-30,36,"Trade mark attorney",0,78,168,6.3,3.9,3,6.2,0.1,"Active" +"Matthew Gonzalez",1965-01-31,46,"Publishing copy",0,75,169,6.3,3.9,2,6,0.3,"Control" +"Vicki Sweeney",1997-12-08,60,"Mental health nurse",0,79,163,6.3,3.8,4,6,0.3,"Control" +"Charles Garcia",1983-10-16,40,"Music tutor",2,80,165,6.4,3.5,1,6,0.4,"Active" +"Shannon Hammond",1939-05-18,57,"Maintenance engineer",0,81,174,6.4,3.9,2,6,0.4,"Active" +"Tina Cabrera",1983-08-22,69,"Translator",0,82,180,6.6,3.6,2,6.2,0.4,"Active" +"Patrick Perez",1982-05-07,56,"Volunteer coordinator",0,82,159,6.7,4.1,4,6.3,0.4,"Active" +"Molly Davis",1996-06-24,66,"Secretary/administrator",0,78,176,6.7,3.6,1,6.3,0.4,"Control" +"Jonathan Williams",1982-08-12,60,"Museum/gallery 
conservator",0,78,160,6.8,4.4,4,6.7,0.1,"Control" +"Bonnie Johnson",1959-08-09,42,"Biomedical scientist",0,78,158,6.9,4,1,6.7,0.2,"Control" +"Barbara Lawson",1969-03-08,49,"Medical illustrator",2,78,159,7,4.2,5,6.6,0.4,"Control" +"Natalie Hanna",1992-05-14,68,"Geophysicist/field seismologist",2,83,181,7,4.3,4,7,0,"Active" +"Renee Schneider",1948-09-24,32,"Corporate treasurer",0,83,170,7,4.1,2,6.7,0.3,"Active" +"Victoria Gordon",1956-08-05,31,"Pharmacist, community",0,83,165,7,4.2,2,6.7,0.3,"Active" +"Diana Burch",1984-06-22,56,"Corporate treasurer",0,83,174,7,3.7,2,6.7,0.3,"Active" +"Michael Black",1973-03-18,35,"Fish farm manager",0,85,178,7.1,3.9,4,6.9,0.199999999999999,"Active" +"Kevin Levine",1947-02-03,63,"Immunologist",0,79,167,7.2,4.5,4,7.1,0.100000000000001,"Control" +"Laura Kelly",1959-05-28,54,"Dancer",0,79,173,7.2,4.4,1,7.3,-0.1,"Control" +"Stephanie Pruitt",1996-01-09,68,"Heritage manager",0,85,186,7.3,3.8,3,7,0.3,"Active" +"Douglas Evans",1948-07-15,72,"Dancer",0,83,166,7.3,4.4,1,7.1,0.2,"Control" +"Andre Brown",1949-02-18,40,"Legal executive",0,84,187,7.3,4.3,5,6.9,0.4,"Active" +"Cindy Perkins",1963-03-23,41,"Energy engineer",1,85,175,7.4,4.2,3,7.1,0.300000000000001,"Active" +"Jason Grimes",1989-01-27,66,"Musician",1,84,179,7.4,3.7,3,7.1,0.300000000000001,"Active" +"Maureen Stark",1972-02-28,73,"Contractor",1,78,153,7.4,4.1,2,7.2,0.2,"Control" +"Curtis Diaz",1961-11-01,36,"Careers information officer",2,81,157,7.4,4.2,1,6.9,0.5,"Control" +"Christopher Henry",1958-09-23,54,"Careers adviser",1,85,170,7.5,3.9,5,7,0.5,"Active" +"April Clarke DDS",1948-06-08,57,"Oncologist",1,85,181,7.5,4.4,2,7.3,0.2,"Control" +"Carrie Sanders",1971-03-14,74,"Video editor",1,85,173,7.5,4.4,2,7.4,0.1,"Active" +"Jeremiah Taylor",1973-06-30,48,"Engineering geologist",1,24,52,7.6,4.1,2,7.3,0.3,"Control" +"Laurie Strong",1942-06-27,72,"Surveyor, building control",2,85,168,7.6,4.4,5,7.1,0.5,"Active" +"Nichole Best",1954-10-31,54,"Careers information 
officer",1,82,175,7.6,3.9,5,7.4,0.199999999999999,"Control" +"Jeremy Wagner",1938-12-10,71,"Cytogeneticist",1,104,205,7.7,4.3,3,7.6,0.100000000000001,"Active" +"Tammy Hamilton",1994-02-20,42,"Journalist, magazine",1,87,193,7.7,4.6,1,7.3,0.4,"Active" +"Kevin Shields",1982-04-19,54,"Accommodation manager",2,86,190,7.7,4,4,7.3,0.4,"Active" +"Jennifer Kelly",1970-11-05,66,"Public house manager",1,85,189,7.7,4.7,5,7.7,0,"Control" +"Greg Lewis",1978-01-28,58,"Production manager",1,86,191,7.8,4.3,3,7.3,0.5,"Control" +"Matthew Campbell",1970-09-11,60,"Bonds trader",1,86,185,7.8,4.3,3,7.7,0.1,"Active" +"Jane Stone",1984-03-13,63,"Electrical engineer",2,82,159,7.8,4.8,4,7.4,0.4,"Active" +"Angela Clark",1950-04-30,44,"Media buyer",1,86,173,7.8,4.4,2,7.3,0.5,"Active" +"Lisa Rhodes",1989-05-03,55,"English as a foreign language teacher",1,86,179,7.8,4.7,4,7.3,0.5,"Active" +"Andrea Anderson",1974-04-08,73,"Horticultural consultant",1,85,183,7.9,4.2,4,7.7,0.2,"Control" +"Christopher Luna",1992-11-22,55,"Accountant, chartered certified",2,87,187,7.9,4.9,1,7.7,0.2,"Active" +"Linda Evans",1962-12-25,40,"Archivist",1,86,167,7.9,4.5,4,7.8,0.100000000000001,"Active" +"Margaret Thompson",1985-05-29,43,"Development worker, international aid",1,86,174,7.9,4.2,3,7.7,0.2,"Active" +"Erin Burton",1942-10-06,71,"Biomedical scientist",1,86,179,7.9,4.8,1,7.8,0.100000000000001,"Active" +"Jodi Wood",1946-11-29,54,"Animal technologist",0,87,173,8,4,5,7.7,0.3,"Active" +"Angela Long",1976-06-07,61,"Engineer, civil (contracting)",2,80,158,8,4.6,4,8,0,"Control" +"Bob Williams",1991-03-10,55,"Commercial/residential surveyor",1,82,167,8.1,4.1,2,7.8,0.3,"Active" +"Elizabeth Ashley",1949-11-23,67,"Newspaper journalist",1,87,187,8.1,4.3,2,7.5,0.6,"Active" +"April Reyes",1948-10-21,55,"Engineer, electronics",1,87,193,8.1,4.1,5,7.9,0.199999999999999,"Active" +"Monica Kelley",1981-04-07,68,"Police officer",1,87,176,8.1,4.3,1,8,0.1,"Active" +"Shannon Neal",1973-12-28,71,"Administrator, 
sports",2,88,194,8.1,4.7,1,8,0.1,"Active" +"Adam Smith",1941-07-21,54,"Journalist, broadcasting",1,88,183,8.1,4.7,1,8,0.1,"Active" +"Connor Torres",1988-01-06,42,"Radiation protection practitioner",1,87,190,8.2,4.9,5,8.2,0,"Active" +"Brandy Johnson",1978-04-13,53,"Private music teacher",1,88,196,8.2,4.4,4,8.1,0.1,"Active" +"Heather King",1940-08-01,69,"Adult guidance worker",1,88,178,8.2,4.9,5,7.8,0.4,"Active" +"Angela Boyer",1977-06-11,57,"Dancer",1,88,174,8.2,5,2,8,0.199999999999999,"Active" +"Kathryn Smith",1963-08-07,45,"Scientist, marine",2,88,192,8.2,4.2,5,7.9,0.299999999999999,"Active" +"Mary Aguilar",1952-01-09,62,"Furniture designer",0,88,182,8.3,5.1,2,8.2,0.100000000000001,"Control" +"Mary Barnett",1993-10-21,33,"Operational investment banker",1,88,179,8.3,4.9,3,8.1,0.200000000000001,"Active" +"Kathleen Goodwin",1938-05-14,74,"Therapeutic radiographer",1,86,182,8.3,4.9,2,8.1,0.200000000000001,"Control" +"Debra Hoover",1971-07-16,74,"Building control surveyor",1,88,195,8.3,5,4,8.2,0.100000000000001,"Active" +"Heidi Gaines",1974-06-26,64,"Occupational therapist",1,89,198,8.4,4.5,4,8.3,0.1,"Active" +"Nicole Vance",1990-07-17,43,"Personnel officer",1,87,193,8.4,5,2,8.2,0.200000000000001,"Control" +"Alejandro Love",1957-05-03,35,"Fashion designer",2,85,167,8.4,4.5,2,8.3,0.1,"Control" +"Michael Richardson",2001-08-15,57,"Hydrographic surveyor",1,88,184,8.5,4.3,5,8.5,0,"Active" +"Jonathan Bautista",1938-06-28,64,"Landscape architect",1,89,198,8.5,4.8,1,8.1,0.4,"Control" +"Steven Wilson",1947-02-17,55,"Surveyor, planning and development",1,89,194,8.6,5.3,1,8.3,0.299999999999999,"Active" +"Paula White",1979-03-04,74,"Engineer, control and instrumentation",0,89,187,8.6,4.6,5,8.2,0.4,"Active" +"Deborah Shelton",1934-11-10,50,"Designer, textile",1,90,179,8.6,4.7,3,8.6,0,"Control" +"James Rojas",1965-11-02,40,"Solicitor, Scotland",1,44,98,8.7,5,1,8.3,0.399999999999999,"Control" +"Leah 
Blankenship",1938-09-22,62,"Dramatherapist",0,90,176,8.7,5.3,3,8.5,0.199999999999999,"Active" +"Angela Wilson",1988-05-19,65,"Building control surveyor",1,90,190,8.8,4.5,4,8.5,0.300000000000001,"Active" +"James Wright",1997-09-20,63,"Advertising account executive",1,87,193,8.8,5.3,3,8.5,0.300000000000001,"Control" +"Heather Sawyer",1946-01-12,68,"Sport and exercise psychologist",1,89,179,9,5.2,3,8.8,0.199999999999999,"Control" +"Kristie Morris",1994-01-16,45,"Senior tax professional/tax inspector",1,91,185,9,4.6,4,8.6,0.4,"Active" +"Joan Chavez",1999-10-07,41,"Energy manager",0,93,182,9.1,5,2,8.8,0.299999999999999,"Control" +"Patricia Miller",1972-11-15,54,"Psychologist, educational",1,92,198,9.3,4.9,5,9,0.300000000000001,"Active" +"Rachel Mcguire",1970-12-23,62,"Medical sales representative",1,92,203,9.3,5.1,4,8.9,0.4,"Control" +"Angela Wilson",1983-08-24,65,"Designer, television/film set",1,92,202,9.3,5,5,9,0.300000000000001,"Active" +"Carrie Stevens",1985-12-11,72,"Ergonomist",1,95,192,9.6,5.3,2,9.4,0.199999999999999,"Control" +"Eric Rodriguez",2001-08-14,61,"Geneticist, molecular",0,95,191,9.7,5.2,4,9.5,0.199999999999999,"Control" +"Jeffery Silva",1973-11-25,70,"Bookseller",1,94,203,9.9,5.4,1,9.6,0.300000000000001,"Control" +"John Curtis",1936-11-25,66,"Sales professional, IT",1,96,201,10.1,5.1,5,10,0.1,"Control" +"Jessica Tanner",1986-07-01,54,"Paramedic",1,93,183,10.1,5.3,5,10,0.1,"Control" +"Charles Smith",1959-01-30,61,"Chartered certified accountant",0,99,212,10.1,5.6,4,9.7,0.4,"Control" +"Barry Porter",1979-05-30,65,"Dancer",1,98,200,10.1,5.3,3,10,0.1,"Control" +"Julie Barrett",1972-07-27,66,"Theme park manager",1,102,208,11.1,5.7,2,10.7,0.4,"Active" diff --git a/descriptive/example1/data/test.csv b/descriptive/example1/data/test.csv new file mode 100644 index 0000000..fa3e7a7 --- /dev/null +++ b/descriptive/example1/data/test.csv @@ -0,0 +1,13 @@ +,,Grouped by mortality,Grouped by mortality,Grouped by mortality,Grouped by mortality +,,Missing,Overall,0,1 
+n,,,1000,864,136 +"Age, mean (SD)",,0,65.0 (17.2),64.0 (17.4),71.7 (14.0) +"SysABP, mean (SD)",,291,114.3 (40.2),115.4 (38.3),107.6 (49.4) +"Height, mean (SD)",,475,170.1 (22.1),170.3 (23.2),168.5 (11.3) +"Weight, mean (SD)",,302,82.9 (23.8),83.0 (23.6),82.3 (25.4) +"ICU, n (%)",CCU,0,162 (16.2),137 (15.9),25 (18.4) +"ICU, n (%)",CSRU,,202 (20.2),194 (22.5),8 (5.9) +"ICU, n (%)",MICU,,380 (38.0),318 (36.8),62 (45.6) +"ICU, n (%)",SICU,,256 (25.6),215 (24.9),41 (30.1) +"mortality, n (%)",0,0,864 (86.4),864 (100.0), +"mortality, n (%)",1,,136 (13.6),,136 (100.0) diff --git a/descriptive/example1/data/test2.csv b/descriptive/example1/data/test2.csv new file mode 100644 index 0000000..601a09c --- /dev/null +++ b/descriptive/example1/data/test2.csv @@ -0,0 +1,11 @@ +,,Grouped by Smoke,Grouped by Smoke,Grouped by Smoke,Grouped by Smoke,Grouped by Smoke +,,Missing,Overall,0,1,2 +n,,,200,88,85,27 +"Age, mean (SD)",,0,53.1 (12.6),50.1 (12.0),56.2 (12.4),53.0 (12.9) +"HR, mean (SD)",,0,74.7 (12.2),70.1 (10.7),79.6 (12.6),74.0 (9.5) +"Group, n (%)",Active,0,100 (50.0),41 (46.6),45 (52.9),14 (51.9) +"Group, n (%)",Control,,100 (50.0),47 (53.4),40 (47.1),13 (48.1) +"sBP, mean (SD)",,0,157.2 (25.7),147.8 (22.3),167.5 (26.8),155.6 (20.3) +"Smoke, n (%)",0,0,88 (44.0),88 (100.0),, +"Smoke, n (%)",1,,85 (42.5),,85 (100.0), +"Smoke, n (%)",2,,27 (13.5),,,27 (100.0) diff --git a/transformation/dataFiles/homework/cleaning.py b/transformation/dataFiles/homework/cleaning.py new file mode 100644 index 0000000..24e633b --- /dev/null +++ b/transformation/dataFiles/homework/cleaning.py @@ -0,0 +1,3 @@ +import pandas as pd + +df = pd.read_excel('/Users/hantswilliams/Downloads/FoodAccessResearchAtlasData2019.xlsx') \ No newline at end of file diff --git a/transformation/pyScripts/p3_stonyBrookClean.py b/transformation/pyScripts/p3_stonyBrookClean.py index 97d500b..4058aea 100644 --- a/transformation/pyScripts/p3_stonyBrookClean.py +++ b/transformation/pyScripts/p3_stonyBrookClean.py @@ -1,7 
+1,7 @@ import pandas as pd # load the data -stonybrook = pd.read_csv('transformation/dataFiles/113243405_StonyBrookUniversityHospital_standardcharges.csv') +stonybrook = pd.read_csv('transformation/dataFiles/raw/113243405_StonyBrookUniversityHospital_standardcharges.csv') ## looking at the data frame, it is currently in WIDE format, we want to make it STACKED format ## so first lets get a name of the columns @@ -13,3 +13,7 @@ ## transofmring with the melt function from a wide dataframe to a stacked dataframe stonybrook_modified = stonybrook.melt(id_vars=idVars, value_vars=valueVars) +print(stonybrook_modified.head(50)) + +stonybrook_modified.to_csv('transformation/dataFiles/clean/113243405_StonyBrookUniversityHospital_standardcharges_clean.csv') +stonybrook_modified.shape \ No newline at end of file