Commit 0cd35b6

Folded in changes from Brooke Wenig

bmc committed Jul 30, 2016
1 parent a619ddb commit 0cd35b6
Showing 11 changed files with 14 additions and 12 deletions.
Binary file modified cs120_autograder_complete.dbc
Binary file modified cs120_autograder_register.dbc
Binary file modified cs120_autograder_simpler.dbc
Binary file modified cs120_lab0.dbc
Binary file modified cs120_lab1a_math_review.dbc
Binary file modified cs120_lab1b_word_count_rdd.dbc
Binary file modified cs120_lab2_linear_regression_df.dbc
Binary file modified cs120_lab3_ctr_df.dbc
2 changes: 1 addition & 1 deletion cs120_lab3_ctr_df.py
@@ -672,7 +672,7 @@ def parse_raw_df(raw_df):
# MAGIC %md
# MAGIC ### Visualization 1: Feature frequency
# MAGIC
- # MAGIC We will now visualize the number of times each of the 233,286 OHE features appears in the training data. We first compute the number of times each feature appears, then bucket the features by these counts. The buckets are sized by powers of 2, so the first bucket corresponds to features that appear exactly once ( \\( \scriptsize 2^0 \\) ), the second to features that appear twice ( \\( \scriptsize 2^1 \\) ), the third to features that occur three or four times ( \\( \scriptsize 2^2 \\) ), the fourth to features that occur five to eight times ( \\( \scriptsize 2^3 \\) ), and so on. The scatter plot below shows the logarithm of the bucket thresholds versus the logarithm of the number of features whose counts fall in each bucket.
+ # MAGIC We will now visualize the number of times each of the 233,941 OHE features appears in the training data. We first compute the number of times each feature appears, then bucket the features by these counts. The buckets are sized by powers of 2, so the first bucket corresponds to features that appear exactly once ( \\( \scriptsize 2^0 \\) ), the second to features that appear twice ( \\( \scriptsize 2^1 \\) ), the third to features that occur three or four times ( \\( \scriptsize 2^2 \\) ), the fourth to features that occur five to eight times ( \\( \scriptsize 2^3 \\) ), and so on. The scatter plot below shows the logarithm of the bucket thresholds versus the logarithm of the number of features whose counts fall in each bucket.
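
The bucketing this cell describes can be sketched as follows; this is an illustrative reconstruction, not the lab's code, and the RDD name `ohe_train_data` and its layout (LabeledPoints with sparse OHE feature vectors) are assumptions:

    # Count how often each OHE feature appears in the training data.
    feature_counts = (ohe_train_data
                      .flatMap(lambda lp: lp.features.indices)      # active feature IDs per point
                      .map(lambda feature_id: (feature_id, 1))
                      .reduceByKey(lambda a, b: a + b))             # (feature, appearances)

    # Bucket counts by powers of 2: bucket index = ceil(log2(count)),
    # computed exactly on integers via bit_length.
    bucketed = (feature_counts
                .map(lambda pair: ((pair[1] - 1).bit_length(), 1))
                .reduceByKey(lambda a, b: a + b))                   # features per bucket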

# COMMAND ----------

Binary file modified cs120_lab4_pca.dbc
24 changes: 13 additions & 11 deletions cs120_lab4_pca.py
@@ -1,4 +1,4 @@
- # Databricks notebook source exported at Mon, 25 Jul 2016 15:37:31 UTC
+ # Databricks notebook source exported at Sat, 30 Jul 2016 02:42:14 UTC

# MAGIC %md
# MAGIC <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/"> <img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-nd/4.0/88x31.png"/> </a> <br/> This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-nd/4.0/"> Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. </a>
@@ -353,11 +353,13 @@ def pca(data, k=2):
# MAGIC ### (2b) PCA on `data_random`
# MAGIC
# MAGIC Next, use the PCA function we just developed to find the top two principal components of the spherical `data_random` we created in Visualization 1.
+ # MAGIC
+ # MAGIC First, we need to convert `data_random` to the RDD `random_data_rdd`, and do all subsequent operations on `random_data_rdd`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
- random_data_df = sc.parallelize(data_random)
+ random_data_rdd = sc.parallelize(data_random)

# Use pca on data_random
top_components_random, random_data_scores_auto, eigenvalues_random = <FILL IN>
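
One plausible completion of the fill-in (a sketch only, not the official solution), given the `pca(data, k=2)` helper defined earlier, whose return values the variable names here suggest are the top components, the scores, and the eigenvalues:

    # Top two principal components of the random data.
    top_components_random, random_data_scores_auto, eigenvalues_random = pca(random_data_rdd, k=2)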
@@ -428,7 +430,7 @@ def project_points_and_get_lines(data, components, x_range):
# COMMAND ----------

((x1, x2), (line1X1, line1X2), (line2X1, line2X2)) = \
- project_points_and_get_lines(random_data_df, top_components_random, 5)
+ project_points_and_get_lines(random_data_rdd, top_components_random, 5)

# generate layout and plot data
fig, ax = prepare_plot(np.arange(46, 55, 2), np.arange(46, 55, 2), figsize=(7, 7))
@@ -572,14 +574,14 @@ def variance_explained(data, k=1):
components, scores, eigenvalues = <FILL IN>
<FILL IN>
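
For reference, the fraction of variance explained by the top k components is the sum of the top k eigenvalues divided by the sum of all eigenvalues, with eigenvalues sorted in descending order. A hedged sketch of the body, assuming `pca` returns the full descending-sorted eigenvalue array regardless of `k`:

    components, scores, eigenvalues = pca(data, k)
    # Share of total variance captured by the top k eigenvalues.
    return eigenvalues[:k].sum() / eigenvalues.sum()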

- variance_random_1 = variance_explained(random_data_df, 1)
+ variance_random_1 = variance_explained(random_data_rdd, 1)
variance_correlated_1 = variance_explained(correlated_data, 1)
- variance_random_2 = variance_explained(random_data_df, 2)
+ variance_random_2 = variance_explained(random_data_rdd, 2)
variance_correlated_2 = variance_explained(correlated_data, 2)
variance_threeD_2 = variance_explained(threeD_data, 2)
- print ('Percentage of variance explained by the first component of random_data_df: {0:.1f}%'
+ print ('Percentage of variance explained by the first component of random_data_rdd: {0:.1f}%'
.format(variance_random_1 * 100))
- print ('Percentage of variance explained by both components of random_data_df: {0:.1f}%'
+ print ('Percentage of variance explained by both components of random_data_rdd: {0:.1f}%'
.format(variance_random_2 * 100))
print ('\nPercentage of variance explained by the first component of correlated_data: {0:.1f}%'.
format(variance_correlated_1 * 100))
@@ -680,6 +682,7 @@ def parse(line):
Test.assertEquals(entry[0], (0, 0), 'incorrect key for entry')
Test.assertEquals(entry[1].size, 240, 'incorrect length of entry array')
Test.assertTrue(np.allclose(np.sum(entry[1]), 24683.5), 'incorrect values in entry array')
+ Test.assertTrue(raw_data.is_cached, 'raw_data is not cached')
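
The new assertion passes only if `raw_data` was persisted; a hedged sketch of what it expects upstream (any equivalent persistence call would do):

    raw_data.cache()           # mark the RDD for in-memory persistence
    print(raw_data.is_cached)  # True once a storage level is set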

# COMMAND ----------

@@ -805,7 +808,7 @@ def rescale(ts):
# MAGIC %md
# MAGIC ### Visualization 7: Top two components as images
# MAGIC
- # MAGIC Now, we'll view the scores for the top two component as images. Note that we reshape the vectors by the dimensions of the original image, 230 x 202.
+ # MAGIC Now, we'll view the scores for the top two components as images. Note that we reshape the vectors by the dimensions of the original image, 230 x 202.
# MAGIC These graphs map the values of a single component to a grayscale image. This provides us with a visual representation which we can use to see the overall structure of the zebrafish brain and to identify where high and low values occur. However, using this representation, there is a substantial amount of useful information that is difficult to interpret. In the next visualization, we'll see how we can improve interpretability by combining the two principal components into a single image using a color mapping.
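
A minimal sketch of the reshaping described above; the `scores` matrix (pixels by components) and the plotting calls are illustrative assumptions, not the lab's code:

    import matplotlib.pyplot as plt

    # Reshape the first component's scores back to the 230 x 202 image
    # dimensions and render as grayscale.
    component_image = scores[:, 0].reshape(230, 202)
    plt.imshow(component_image, cmap='gray', interpolation='nearest')
    plt.show()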

# COMMAND ----------
@@ -1134,7 +1137,7 @@ def polar_transform(scale, img):
# MAGIC %md
# MAGIC ### Visualization 9: Top two components by time
# MAGIC
- # MAGIC Let's view the scores from the first two PCs as a composite image. When we preprocess by aggregating by time and then perform PCA, we are only looking at variability related to temporal dynamics. As a result, if neurons appear similar -- have similar colors -- in the resulting image, it means that their responses vary similarly over time, regardless of how they might be encoding direction. In the image below, we can define the midline as the horizontal line across the middle of the brain. We see clear patterns of neural activity in different parts of the brain, and crucially note that the regions on either side of the midline are similar, which suggests that temporal dynamics do not differ across the two sides of the brain.
+ # MAGIC Let's view the scores from the first two principal components as a composite image. When we preprocess by aggregating by time and then perform PCA, we are only looking at variability related to temporal dynamics. As a result, if neurons appear similar -- have similar colors -- in the resulting image, it means that their responses vary similarly over time, regardless of how they might be encoding direction. In the image below, we can define the midline as the horizontal line across the middle of the brain. We see clear patterns of neural activity in different parts of the brain, and crucially note that the regions on either side of the midline are similar, which suggests that temporal dynamics do not differ across the two sides of the brain.

# COMMAND ----------

@@ -1218,7 +1221,7 @@ def polar_transform(scale, img):
# MAGIC %md
# MAGIC ### Visualization 10: Top two components by direction
# MAGIC
- # MAGIC Again, let's view the scores from the first two PCs as a composite image. When we preprocess by averaging across time (group by direction), and then perform PCA, we are only looking at variability related to stimulus direction. As a result, if neurons appear similar -- have similar colors -- in the image, it means that their responses vary similarly across directions, regardless of how they evolve over time. In the image below, we see a different pattern of similarity across regions of the brain. Moreover, regions on either side of the midline are colored differently, which suggests that we are looking at a property, direction selectivity, that has a different representation across the two sides of the brain.
+ # MAGIC Again, let's view the scores from the first two principal components as a composite image. When we preprocess by averaging across time (group by direction), and then perform PCA, we are only looking at variability related to stimulus direction. As a result, if neurons appear similar -- have similar colors -- in the image, it means that their responses vary similarly across directions, regardless of how they evolve over time. In the image below, we see a different pattern of similarity across regions of the brain. Moreover, regions on either side of the midline are colored differently, which suggests that we are looking at a property, direction selectivity, that has a different representation across the two sides of the brain.

# COMMAND ----------

@@ -1324,4 +1327,3 @@ def polar_transform(scale, img):

# COMMAND ----------

