diff --git a/Lecture1.ipynb b/Lecture1.ipynb index 766a81f..d5d4ee0 100644 --- a/Lecture1.ipynb +++ b/Lecture1.ipynb @@ -581,7 +581,7 @@ "D0 = 0.5\n", "\n", "# Range of activation energies in eV\n", - "activation_energies = np.linspace(0.1, 1, 0) # Range from 0.1 to 0.8 eV in n steps\n", + "activation_energies = np.linspace(0.1, 1, 0) # Range from 0.1 to 1 eV in n steps\n", "\n", "# Temperature range in K\n", "T = np.linspace(100, 5000, 100)\n", @@ -863,7 +863,7 @@ }, "outputs": [], "source": [ - "#Code block\n", + "#Empty block for your answers\n", "\n", "\n" ] @@ -876,44 +876,7 @@ }, "outputs": [], "source": [ - "#Comment block\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 430 - }, - "id": "x0ICHJj-ifCa", - "outputId": "46ac74f1-3dc1-4929-f294-9755c0fe881b", - "tags": [] - }, - "outputs": [], - "source": [ - "#Code block\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OC29f_fOifCa", - "outputId": "487c61eb-98b2-45de-ec93-0c396a9b72bd", - "tags": [] - }, - "outputs": [], - "source": [ - "#Comment block\n", + "#Empty block for your answers\n", "\n", "\n" ] @@ -923,7 +886,7 @@ "metadata": {}, "source": [ "
\n", - " 📓 Submission: When your notebook is complete, click on the download icon on the top right, select .ipynb. If you are using Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", + " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", "
" ] }, diff --git a/Lecture2.ipynb b/Lecture2.ipynb new file mode 100644 index 0000000..491b485 --- /dev/null +++ b/Lecture2.ipynb @@ -0,0 +1,1835 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "454ppM0MFAPE", + "tags": [] + }, + "source": [ + "# Machine Learning Basics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O_Nj8oCN_szN" + }, + "source": [ + "
\n", + " 💡 Alan Turing: We are not interested in the fact that the brain has the consistency of cold porridge.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4ksjNfR_szO" + }, + "source": [ + "\n", + "\n", + "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture2-modelling)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JXcgtX8S_szO" + }, + "source": [ + "## 💎 Crystal hardness\n", + "\n", + "Are you excited to tackle a regression problem?\n", + "\n", + "Today's dataset consists of the [bulk modulus](https://en.wikipedia.org/wiki/Bulk_modulus) for more than 10,000 inorganic crystals. The exercise aims to develop an understanding of how to approach supervised learning problems.\n", + "\n", + "The energy of a crystal varies with the volume of the unit cell. The equilibrium volume is found at the minimum in the potential energy surface. The shape of this curve can be described by an equation of state, where energy is a function of volume or pressure, i.e. $E(V)$ or $E(P)$. The curvature is related to the bulk modulus $B$, which can be defined as:\n", + "\n", + "$\n", + "B = -V \\frac{\\partial P}{\\partial V} = V \\frac{\\partial^2 E}{\\partial V^2}\n", + "$\n", + "\n", + "The typical unit of $B$ is GPa. For example, diamond has a measured bulk modulus of $B$ = 443 GPa at T = 4 K. The bulk modulus is a useful quantity in models of materials bonding, thermodynamics, and mechanics. For instance, the inverse of the bulk modulus is the compressibility of a crystal ($\\kappa = \\frac{1}{B}$).\n", + "\n", + "We will use the Python package `matminer` (https://matminer.readthedocs.io) to access the materials dataset and featurise the data in a form that is suitable for statistical analysis and building machine learning models. We will use the computational materials science package `pymatgen` (https://pymatgen.org) that powers the [Materials Project](https://materialsproject.org). There are many new concepts that will be explored in future lectures, so don't worry about grasping everything now."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VsM81HX6FAPV", + "outputId": "71c29e3b-b326-48fb-8103-ae6c123bed2d" + }, + "outputs": [], + "source": [ + "# Installation of libraries\n", + "!pip install matminer --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4WAbrbHyPL8-" + }, + "source": [ + "
\n", + " 🍧 Tip: The import block grows large as you use more Python libraries. You can look up each package (try a web search to find documentation and examples) to learn more about the modules we are using.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e82OXOnqFAPa", + "tags": [] + }, + "outputs": [], + "source": [ + "# Basic utilities\n", + "import pprint # Pretty print data structures\n", + "import warnings # Warning control\n", + "import numpy as np # Numerical operations\n", + "from numpy import ComplexWarning # Warning for complex numbers \n", + "\n", + "# Data handling\n", + "import pandas as pd # Data manipulation with DataFrames\n", + "from monty.serialization import loadfn # Load serialised data\n", + "\n", + "# Materials science\n", + "from pymatgen.core import Structure # Materials analysis for crystal structures\n", + "import matminer # Materials informatics\n", + "from matminer.datasets.dataset_retrieval import load_dataset # Load materials datasets\n", + "\n", + "# Visualisation\n", + "import matplotlib.pyplot as plt # Plotting\n", + "import seaborn as sns # Statistical visualisation\n", + "plt.style.use('ggplot') # Set Matplotlib style to 'ggplot'\n", + "\n", + "# Warning management\n", + "warnings.filterwarnings(\"ignore\", category=ComplexWarning) # Ignore ComplexWarning\n", + "\n", + "# Performance adjustments\n", + "teaching_mode = True # To make models run faster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Colab error solution\n", + "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell. \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S9uQ9sA9FAPd" + }, + "source": [ + "## Bulk moduli dataset\n", + "\n", + "From `matminer`, we can check what datasets are available using the `datasets.get_available_datasets()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4k0NSW9SFp6C", + "outputId": "ee47b1c9-f05f-423f-f134-172c42e73a69", + "tags": [] + }, + "outputs": [], + "source": [ + "# Print the available datasets\n", + "matminer.datasets.get_available_datasets(print_format='low')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QuQWMPolHXdp" + }, + "source": [ + "We can use the `get_all_dataset_info` function from the `matminer.datasets.dataset_retrieval` module to output a detailed description of a matminer dataset. Let's check the information for the `matbench_log_kvrh` dataset. \n", + "\n", + "Here \"K\" relates to the bulk modulus (which we called $B$), and \"VRH\" relates to the Voigt-Reuss-Hill averaging scheme, which is one approach to define a value for each material." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J5Nh53dUG8yN", + "outputId": "b6fa11e6-5f36-47e0-db3e-57ec778f05ea", + "tags": [] + }, + "outputs": [], + "source": [ + "print(matminer.datasets.dataset_retrieval.get_all_dataset_info('matbench_log_kvrh'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CWfRPdkbJZam" + }, + "source": [ + "We can then load a dataset using the `load_dataset` method."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 508 + }, + "id": "YvPGXUFfFAPi", + "outputId": "813e5f05-5ad2-4589-cce7-f04816ff67ed" + }, + "outputs": [], + "source": [ + "# Use matminer to download the dataset\n", + "df = load_dataset('matbench_log_kvrh')\n", + "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n", + "\n", + "if teaching_mode:\n", + " # Store the original DataFrame as a copy\n", + " full_dataset_df = df.copy()\n", + " # Create a subset of the original DataFrame for demonstration purposes\n", + " df = df.sample(n=1500, random_state=41)\n", + " print(f'For teaching purposes we will only work with {df.shape[0]} entries from the DataFrame to make the model training and testing faster. \\n')\n", + "\n", + "print('The DataFrame is shown below:')\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eNBBAHSFFAPm" + }, + "source": [ + "### Visualise the target variable" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SNHuNnCeKEqB" + }, + "source": [ + "We can use `df.describe()` to produce summary statistics of the numerical columns. The importance of this is to check whether the data for our target variable, `log10(K_VRH)`, is reasonable. Negative values for the bulk modulus are considered unphysical and forbidden by crystal thermodynamics. You can think about why from the definition.\n", + "\n", + "As we are working with `log10` of the bulk modulus, it should not be possible for there to be negative values in our target variable column as the logarithm of a negative number is undefined. This also gives us a quick check for the input data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "QOXtNPnwKLPp", + "outputId": "4c51e835-9fa4-4dbc-e6f9-92704243539f" + }, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RHCGO7PEM8jt" + }, + "source": [ + "From the summary statistics, the minimum value for `log10(K_VRH)` is zero, so it appears that there are no glaring issues with the target variable. \n", + "\n", + "For a better understanding, let's make a histogram to visualise the distribution. This is best practice when you encounter any new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 457 + }, + "id": "3Nco38cTFAPt", + "outputId": "b2904fc9-74ac-498b-da08-cd92f2617ec1" + }, + "outputs": [], + "source": [ + "# Plot a histogram\n", + "fig, ax = plt.subplots(figsize=(5,3))\n", + "ax.hist(df22['log10(K_VRH)'])\n", + "ax.set_xlabel(r'$log_{10}K_{VRH}$ [$log_{10}GPa$]')\n", + "ax.set_ylabel('Counts')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hLwoNL7-gDv8" + }, + "source": [ + "
\n", + " Code hint \n", + "Your dataframe is not called df22!\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E_86YHauFAPw" + }, + "source": [ + "## Features of materials\n", + "As you may notice from the dataset, we only have one input feature, the crystal structure. This is not a numerical feature that we can use for a regression model. For supervised machine learning, we must represent each material by a vector that can be used as an input to the model, e.g.\n", + "\n", + "$$f(\\textrm{material}) \\rightarrow [1.1,0.8,3.5,0.01]$$\n", + "\n", + "would be a four-dimensional representation.\n", + "For now we will use some pre-selected features from `matminer` for this regression task. Materials representations will be covered in Lecture 4." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nMZUwbOoFAP2" + }, + "source": [ + "### Composition-based features\n", + "\n", + "To use the `ElementProperty` featuriser, we first need to add a `pymatgen.core.composition.Composition` object to our DataFrame. There are several ways to do this but we will proceed using the `composition` property of the pymatgen `Structure` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "jyp_jAnXFAP3", + "outputId": "f756a4a9-9e85-40eb-90dc-e29b6214829e" + }, + "outputs": [], + "source": [ + "from matminer.featurizers.composition.composite import ElementProperty\n", + "from matminer.featurizers.structure.order import DensityFeatures\n", + "\n", + "# Add a composition column to df using the composition property of the Structure class and a lambda function\n", + "df['composition'] = df.structure.apply(lambda x: x.composition )\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ew5KbIuiRvEl" + }, + "source": [ + "The new composition column contains both the elements and the amount of each element in the composition. 
Let's use the `ElementProperty` featuriser to add some composition-based features to our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 618, + "referenced_widgets": [ + "e710b8caf6bd4a09a6a118403d80fc3c", + "4203508c83744a5c84670c0bbc06965b", + "e6dbacb36ff641339737d92e45d93b26", + "535ada5b803145178dddce7c4f280a08", + "b9f2dd4af34944e0b98aa9d14d9d437f", + "60a6d250398b4f918303e5214d1ac09f", + "0c124b51f2524de896ffa4410f093dfe", + "425f61fb86954383866c17ba4dfa6e7d", + "599bf9f909614f1c89c2fb8a2de5ec20", + "48470cbdaff145de994fe6d89bcd394d", + "91300eca1cd14325af914e5c65c5e90c" + ] + }, + "id": "gRGbhEXTFAP5", + "outputId": "1c47fad6-6db4-4b9b-c467-ce7ca512f0a0" + }, + "outputs": [], + "source": [ + "# Create the ElementProperty featuriser\n", + "el_prop_featuriser = ElementProperty.from_preset(preset_name='magpie')\n", + "\n", + "# By default multiprocessing is enabled, however this has been known to slow performance on some systems, so we disable it\n", + "el_prop_featuriser.set_n_jobs(1)\n", + "\n", + "# Apply the ElementProperty featuriser\n", + "df = el_prop_featuriser.featurize_dataframe(df, col_id='composition')\n", + "\n", + "# Print the shape of the DataFrame\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WscGGruUJuSW" + }, + "source": [ + "There are now a lot more columns in the DataFrame. We can check the reference for a property featuriser using the `.citations()` method as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PJxQSYPnJ24A", + "outputId": "40a22222-d78d-439b-ffb2-5564df5410b1" + }, + "outputs": [], + "source": [ + "el_prop_featuriser.citations()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j7sL1U_eSk2W" + }, + "source": [ + "### Structure-based features\n", + "\n", + "Within `matminer`, there are many featurisers which operate on crystal structures. We will add some simple features based on the density of the structures using `DensityFeatures`. We will return to these later in the module. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 600, + "referenced_widgets": [ + "1dae799457dc4e64a7ba43cd84610e51", + "a3e03881631a4c3a877bddd40a4b9339", + "2421b0bee5ea47a5b57268a17bf77388", + "14a91f0ac7884937abd7cdbcf640f146", + "a83f1fc752fe461b91768d888eaf6d74", + "9f929bd631c240c593b25abc794292bc", + "f5ba5255dbae4b238b8989248a4c432c", + "fecccd219e104ecd8de37e4cfa833466", + "7362762449f8404d8798a39ecaa2879d", + "0777667d2dce45488b68ca1762e881e4", + "d8b3449a21b845c9bbffb7e3871097f1" + ] + }, + "id": "-KGqcyFKFAP7", + "outputId": "a5ec79dd-eb8e-45cc-8b94-736fa74f72fa" + }, + "outputs": [], + "source": [ + "# Crystal structure to vector\n", + "density_featuriser = DensityFeatures()\n", + "density_featuriser.set_n_jobs(1)\n", + "df=density_featuriser.fit_featurize_dataframe(df, col_id='structure')\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jkFiyddrFAP_" + }, + "source": [ + "## Bulk modulus regression\n", + "\n", + "With regression tasks, we want to fit a model that maps our input feature $x$ to our target variable $y$, i.e. $y=f(x)$. Here, $x$ and $y$ are vectors of dimensions $M$ and $N$, respectively, such that $f: \\mathbb{R}^M\\rightarrow\\mathbb{R}^N$. 
\n", + "\n", + "Supervised machine learning problems generally take the following form:\n", + "* Select a form for the model $f$\n", + "* Determine an error/loss function that is used to evaluate model performance\n", + "* Optimise the parameters of the model to minimise the error\n", + "\n", + "The error, $L(\\hat{y},y)$, is a function of the predicted target variable $\\hat{\\textbf{y}}=f(\\textbf{x})$ and the true target variable, $\\textbf{y}$. We want our model to minimise $L$. \n", + "\n", + "For our problem, the target variable is `log10(K_VRH)`, which we want to predict from knowledge of the composition and structure (represented by the set of chosen features). \n", + "\n", + "We can make extensive use of [scikit-learn](https://scikit-learn.org) for these tasks." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iNvTOfW1FAQB" + }, + "source": [ + "### Data preparation\n", + "\n", + "To start, we need to split our dataset into the target variable `log10(K_VRH)` and the input features. For the input features, we must remove any non-numerical data."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dfGnmTGUFAQC", + "outputId": "971126da-dea3-4714-e6ad-e9a029f4e9a1" + }, + "outputs": [], + "source": [ + "# Define the features we want to keep\n", + "features_to_drop = ['structure','composition','log10(K_VRH)']\n", + "feature_cols = [col for col in list(df.columns) if col not in features_to_drop]\n", + "\n", + "# Get an array of the features\n", + "X = df[feature_cols].values\n", + "\n", + "# Get an array of the target variable\n", + "y = df['log10(K_VRH)'].values\n", + "\n", + "print(f'Shape of X: {X.shape}')\n", + "print(f'Shape of y: {y.shape}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fsi1YQeSFAQE" + }, + "source": [ + "We can also check the names of the features used for our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1NJSASnsFAQF", + "outputId": "44e240a5-5445-414a-b23d-2c53d10d4c7a" + }, + "outputs": [], + "source": [ + "print(f'We have {len(feature_cols)} features in our dataset.')\n", + "print(features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " Code hint \n", + "Check your print statement!\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wXdIBtHFSWJw" + }, + "source": [ + "### Baseline linear regression model\n", + "A simple model is the linear regressor. For a univariate linear regressor represented by $\\hat{y}=mx+c$, the task is to find the best value of $m$ and $c$ that minimise the model error.\n", + "\n", + "If we were to consider multivariate linear regression, then our equation transforms to $\\hat{y}=\\beta_0 + ∑_1^n\\beta_ix_i$, where $\\beta_i$ are the weights of the model and $x_i$ are the input features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import linear regression model\n", + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Fit the model to the data\n", + "lr = LinearRegression()\n", + "lr.fit(X, y)\n", + "\n", + "def make_prediction_plot(X, y, model, label):\n", + " \"\"\"\n", + " Plot and return predictions for the given model and data.\n", + " \n", + " Parameters:\n", + " X : Input features.\n", + " y : Actual target values.\n", + " model : Fitted model.\n", + " label : Descriptor for the axes labels.\n", + " \n", + " Returns:\n", + " Predicted values.\n", + " \"\"\"\n", + " y_pred = model.predict(X)\n", + " fig, ax = plt.subplots(figsize=(5, 3))\n", + " ax.scatter(y, y_pred, c=y, cmap='viridis')\n", + " ax.plot(y, y, 'r-')\n", + " ax.set_xlabel(f'{label} true')\n", + " ax.set_ylabel(f'{label} predicted')\n", + " plt.show()\n", + " \n", + " return y_pred\n", + "\n", + "# Make predictions using the fitted model\n", + "y_pred = make_prediction_plot(X, y, lr, 'log10(K_VRH)')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wqN61jd2itkf", + "outputId": "92fea7dc-b068-4f68-bd8a-316e2ab86454" + }, + "outputs": [], + "source": [ + "from sklearn import metrics\n", + "\n", + "# Mean absolute 
error\n", + "print (f'The training MAE = {metrics.mean_absolute_error(y,y_pred):.3f} log10GPa')\n", + "\n", + "# Root mean squared error\n", + "print(f'The training RMSE = {metrics.root_mean_squared_error(y,y_pred):.3f} log10GPa')\n", + "\n", + "# $r^2$ - coefficient of determination\n", + "print(f'The training r^2 = {lr.score(X,y):.3f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_dqDln7n_szT" + }, + "source": [ + "Based on your analysis, is this a useful model?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ze_nruvgFAQH" + }, + "source": [ + "### Random forest regressor\n", + "\n", + "We can do better with a non-linear model. Let's try a machine learning regressor. [Random forest](https://en.wikipedia.org/wiki/Random_forest) is an ensemble machine learning algorithm that combines multiple [decision trees](https://en.wikipedia.org/wiki/Decision_tree) to improve predictive accuracy.\n", + "\n", + "Random forest can be applied to both classification and regression tasks. The prediction is made by taking a majority vote (for classification) or averaging (for regression) of the predictions from individual trees. Mathematically, it can be represented as:\n", + "\n", + "$\n", + "\\hat{y}_{RF} = \\frac{1}{n_{trees}} \\sum_{i=1}^{n_{trees}} f_i(x)\n", + "$\n", + "\n", + "where:\n", + "- $\\hat{y}_{RF}$ is the random forest prediction.\n", + "- $n_{trees}$ is the number of decision trees in the forest.\n", + "- $f_i(x)$ represents the prediction of the $i$-th tree.\n", + "\n", + "#### 1. Create the regressor\n", + "\n", + "In `sklearn`, the random forest regressor is created by:\n", + "\n", + "```python\n", + "RandomForestRegressor(n_estimators=, criterion=, max_depth=, min_samples_split=, min_samples_leaf=)\n", + "```\n", + "\n", + "The hyperparameters that need to be set are:\n", + "* `n_estimators`: number of decision trees in the random forest model.\n", + "* `criterion`: loss function to be minimised. 
Default value is `'squared_error'` which is the MSE.\n", + "* `max_depth`: maximum depth of the tree.\n", + "* `min_samples_split`: minimum number of samples required to split an internal node.\n", + "* `min_samples_leaf`: minimum number of samples required to be at a leaf node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "wMUSntUxFAQJ", + "outputId": "016083c9-92c9-46e5-b835-9046b932cc28" + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "# Define the model\n", + "rf = RandomForestRegressor(n_estimators=100,criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)\n", + "\n", + "# Fit the model\n", + "rf.fit(X,y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "orMF7755ytgW" + }, + "source": [ + "
\n", + " Why is the random state set to 42?\n", + "Most random number generators start with an initial seed value and then produce a sequence of numbers that appears random. Since the algorithms are deterministic, providing the same seed will result in the same sequence of \"random\" numbers. 42 is simply a science fiction reference. \n", + "
\n", + "\n", + "You just trained a machine learning model 🎉. \n", + "\n", + "We can now make predictions and plot the results. We will use the plotting function `make_prediction_plot()` that we defined earlier to make the plots. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 891 + }, + "id": "tRJQzOd1gDwC", + "outputId": "78e2c4ff-fa56-4a2a-9687-176d3a171f5e" + }, + "outputs": [], + "source": [ + "print(\"Linear regression\")\n", + "y_pred_lr = make_prediction_plot(X,y,lr,'log10(K_VRH)')\n", + "\n", + "print(\"Random Forest model\")\n", + "y_pred = make_prediction_plot(X,y,rf,'log10(K_VRH)')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jooFqet9za6q" + }, + "source": [ + "Now let's quantify the performance of the random forest model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MWLLTpK7FAQP", + "outputId": "cf941a03-0a25-4eed-b677-d60fb903078c" + }, + "outputs": [], + "source": [ + "# Print the metrics\n", + "print(f'The training MAE = {metrics.mean_absolute_error(y,y_pred):.3f} log10GPa')\n", + "print(f'The training RMSE = {metrics.root_mean_squared_error(y,y_pred):.3f} log10GPa')\n", + "print(f'The training r^2 = {rf.score(X,y):.3f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ziXU0z0eFAQR" + }, + "source": [ + "The coefficient of determination, $r^2$, as well as the low RMSE suggest that this model performs well. However, it is also likely that the model is simply overfitted to reproduce the training data. This means that it will not generalise to other materials (unseen data), which is necessary for a meaningful machine learning model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QDt6aZuyFAQS" + }, + "source": [ + "#### 2. 
Cross validation\n", + "To better determine the quality of our model, we can perform cross-validation (CV). CV enables us to evaluate the out-of-sample goodness-of-fit of a regressor. We will use $k$-fold CV, which splits the training set into $k$ subsets. Each subset is used as a validation set to evaluate the performance, with the model being trained on the remaining $k-1$ subsets. Don't worry, we'll cover this in later lectures.\n", + "\n", + "Let's perform 5-fold CV:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YLMZv-2tFAQT", + "outputId": "b6475657-5959-46d3-f254-d1d00621a8ce" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import KFold, cross_val_score, cross_validate\n", + "\n", + "# Define the number of splits for cross-validation\n", + "n_splits = 5 if teaching_mode else 10\n", + "\n", + "# Compute the cross-validation score\n", + "cv = KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=42\n", + " )\n", + "\n", + "scores= cross_val_score(rf, X, y,cv=cv, scoring='neg_mean_absolute_error')\n", + "\n", + "r2_scores = cross_val_score(rf, X, y, cv=cv, scoring='r2')\n", + "\n", + "print('From our cross-validation, we have obtained the following results:')\n", + "print(f'mean MAE = {np.mean(np.abs(scores)):.3f} log10GPa')\n", + "print(f'mean r^2 = {np.mean(np.abs(r2_scores)):.3f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "id": "5cc1JJlMdMuV", + "outputId": "ff397fbd-bd91-4252-e676-89cd049eec7b" + }, + "outputs": [], + "source": [ + "# Show the training scores for each k-fold\n", + "fig, ax = plt.subplots(2, 1, figsize=(5, 4)) \n", + "\n", + "ax[0].scatter([i for i in range(len(scores))], np.abs(scores), c=scores, cmap='viridis')\n", + "ax[1].scatter([i for i in range(len(r2_scores))], np.abs(r2_scores), 
c=r2_scores, cmap='viridis') \n", + "ax[0].set_xlabel('Training fold')\n", + "ax[0].set_ylabel('MAE')\n", + "ax[0].set_ylim(0, 0.14) \n", + "ax[0].set_xticks(range(len(scores))) \n", + "ax[1].set_xticks(range(len(r2_scores))) \n", + "ax[1].set_xlabel('Training fold')\n", + "ax[1].set_ylabel('r$^2$')\n", + "ax[1].set_ylim(0, 1.0) \n", + "\n", + "# Display the plot\n", + "plt.show() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5hpFscUBFAQW" + }, + "source": [ + "There is an increase in the error (decrease in performance) for the CV model. However, the MAE is still reasonable. Let's visualise the result of the final model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 453 + }, + "id": "0UQ1OR1JgDwC", + "outputId": "d593fde2-2602-4535-93ab-e9a3e39d89aa", + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_predict\n", + "\n", + "# Plot the original and predicted data against each other\n", + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "\n", + "# Scatter plot with color\n", + "ax.scatter(y, cross_val_predict(rf, X, y, cv=cv), c=y, cmap='viridis', label='Predicted', alpha=0.6)\n", + "\n", + "# Red line representing a perfect prediction (y = x)\n", + "ax.plot(y, y, 'r-', label='Perfect prediction')\n", + "\n", + "# Set labels and legend\n", + "ax.set_xlabel('K_VRH true')\n", + "ax.set_ylabel('K_VRH predicted')\n", + "ax.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1N6SDr5h0XMM" + }, + "source": [ + "## Feature importance\n", + "\n", + "We fed in many materials features, but which were most useful? Understanding this will increase our understanding (the interpretability) of the model.\n", + "\n", + "We can see how particular features contribute to a Random Forest model by looking at the `RandomForestRegressor().feature_importances_` attribute. 
Some features are significant, whereas others offer very little contribution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 569 + }, + "id": "2Q7t4b4IgDwD", + "outputId": "e33a4dd6-ff5d-4bb7-b573-8884112e5a4a", + "tags": [] + }, + "outputs": [], + "source": [ + "# Get the feature importances\n", + "importances = rf.feature_importances_\n", + "\n", + "# Get the indices that would sort the importances array from largest to smallest\n", + "indices = np.argsort(importances)[::-1]\n", + "\n", + "# Create a figure and axis for the plot\n", + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "\n", + "# Create a bar plot of the feature importance\n", + "ax.bar(range(X.shape[1]), importances[indices], color=\"r\", align=\"center\")\n", + "\n", + "# Set the labels\n", + "ax.set_xlabel(\"Feature index\")\n", + "ax.set_ylabel(\"Importance\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YWdLRxwxB6ju" + }, + "source": [ + "There is a rapid drop off in the feature importance, with few features offering a significant contribution to the model.\n", + "\n", + "Below we will only plot the importance for the top-$N$ features. Try a value of 5. I guess the top feature is `vpa` (volume per atom)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "Lf2VfJfsgDwD", + "outputId": "a77abb2a-f4e6-47f7-a14f-61078643fea5", + "tags": [] + }, + "outputs": [], + "source": [ + "# Visualise the top N features\n", + "N = \n", + "\n", + "# Get the names of the top N important features\n", + "top_feature_names = df[feature_cols].columns.values[np.argsort(importances)[::-1][:N]]\n", + "\n", + "# Set up the figure and axis\n", + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "\n", + "# Create a bar plot of the top N feature importances\n", + "ax.bar(x=top_feature_names, height=importances[np.argsort(importances)[::-1][:N]])\n", + "\n", + "# Set the labels and title\n", + "ax.set_xlabel(\"Feature\")\n", + "ax.set_ylabel(\"Importance\")\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "ax.set_xticklabels(top_feature_names, rotation=45, ha='right', rotation_mode='anchor')\n", + "\n", + "plt.show()\n", + "\n", + "# Print them too\n", + "print(f\"Top {N} Features:\")\n", + "for feat in range(N):\n", + " print(f\" {feat+1}. {feature_cols[indices[feat]]} ({importances[indices[feat]]:.3f})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " Code hint \n", + "Remember to set N (for example, N = 5) before running the cell; leaving it blank raises a SyntaxError.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZNXExVcAgDwD", + "tags": [] + }, + "source": [ + "## 🚨 Exercise 2\n", + "\n", + "
\n", + " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n", + "
\n", + "\n", + "### Your details" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eCUqQGM3gDwD", + "outputId": "bcc02595-32da-4706-b819-086b8e9c182b" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# Insert your values\n", + "Name = \"No Name\" # Replace with your name\n", + "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", + "\n", + "# Set a random seed using the CID value\n", + "CID = int(CID)\n", + "np.random.seed(CID)\n", + "\n", + "# Print the message\n", + "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iKahfCzyMmeG" + }, + "source": [ + "### Problem\n", + "\n", + "In machine learning, reducing the number of features can lead to simpler models, reduce the risk of overfitting, and improve generalisation. Understanding which features are necessary and which can be excluded is crucial for developing efficient and interpretable models. \n", + "\n", + "A task will be given in class focusing on feature selection and model performance analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9obDaZV3gDwD", + "outputId": "7096de18-47b8-46e4-b448-c7d4df2ef668" + }, + "outputs": [], + "source": [ + "#Empty block for your answers\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 453 + }, + "id": "iBCMvIKogDwD", + "outputId": "915c6f5e-1d5d-4460-b70b-73a1a60b66a8" + }, + "outputs": [], + "source": [ + "#Empty block for your answers\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UitRoDjqgDwD", + "tags": [] + }, + "source": [ + "## 🌊 Dive deeper\n", + "\n", + "* _Level 1:_ Tackle Chapter 14 on Tree-Based Learners in [Machine Learning Refined](https://github.com/jermwatt/machine_learning_refined#what-is-new-in-the-second-edition).\n", + " \n", + "* _Level 2:_ A collection of videos from the [Materials Project Workshop](https://www.youtube.com/playlist?list=PLTjFYVNE7LTi6kGvPAF7DDQYj0KDL-vQL) on advanced Python.\n", + "\n", + "* _Level 3:_ Read more about the [scikit-learn](https://scikit-learn.org/stable/auto_examples/index.html) package and what it can do." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "vscode24", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0777667d2dce45488b68ca1762e881e4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0c124b51f2524de896ffa4410f093dfe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "14a91f0ac7884937abd7cdbcf640f146": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0777667d2dce45488b68ca1762e881e4", + "placeholder": "​", + "style": "IPY_MODEL_d8b3449a21b845c9bbffb7e3871097f1", + "value": " 1500/1500 [00:03<00:00, 449.72it/s]" + } + }, + "1dae799457dc4e64a7ba43cd84610e51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", 
+ "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a3e03881631a4c3a877bddd40a4b9339", + "IPY_MODEL_2421b0bee5ea47a5b57268a17bf77388", + "IPY_MODEL_14a91f0ac7884937abd7cdbcf640f146" + ], + "layout": "IPY_MODEL_a83f1fc752fe461b91768d888eaf6d74" + } + }, + "2421b0bee5ea47a5b57268a17bf77388": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fecccd219e104ecd8de37e4cfa833466", + "max": 1500, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7362762449f8404d8798a39ecaa2879d", + "value": 1500 + } + }, + "4203508c83744a5c84670c0bbc06965b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_60a6d250398b4f918303e5214d1ac09f", + "placeholder": "​", + "style": "IPY_MODEL_0c124b51f2524de896ffa4410f093dfe", + "value": "ElementProperty: 100%" + } + }, + "425f61fb86954383866c17ba4dfa6e7d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48470cbdaff145de994fe6d89bcd394d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "535ada5b803145178dddce7c4f280a08": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_48470cbdaff145de994fe6d89bcd394d", + "placeholder": "​", + "style": "IPY_MODEL_91300eca1cd14325af914e5c65c5e90c", + "value": " 1500/1500 [00:05<00:00, 183.22it/s]" + } + }, + "599bf9f909614f1c89c2fb8a2de5ec20": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "60a6d250398b4f918303e5214d1ac09f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7362762449f8404d8798a39ecaa2879d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "91300eca1cd14325af914e5c65c5e90c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9f929bd631c240c593b25abc794292bc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": 
null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a3e03881631a4c3a877bddd40a4b9339": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f929bd631c240c593b25abc794292bc", + "placeholder": "​", + "style": "IPY_MODEL_f5ba5255dbae4b238b8989248a4c432c", + "value": "DensityFeatures: 100%" + } + }, + "a83f1fc752fe461b91768d888eaf6d74": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b9f2dd4af34944e0b98aa9d14d9d437f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8b3449a21b845c9bbffb7e3871097f1": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e6dbacb36ff641339737d92e45d93b26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_425f61fb86954383866c17ba4dfa6e7d", + "max": 1500, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_599bf9f909614f1c89c2fb8a2de5ec20", + "value": 1500 + } + }, + "e710b8caf6bd4a09a6a118403d80fc3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4203508c83744a5c84670c0bbc06965b", + "IPY_MODEL_e6dbacb36ff641339737d92e45d93b26", + "IPY_MODEL_535ada5b803145178dddce7c4f280a08" + ], + "layout": "IPY_MODEL_b9f2dd4af34944e0b98aa9d14d9d437f" + } + }, + "f5ba5255dbae4b238b8989248a4c432c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fecccd219e104ecd8de37e4cfa833466": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/_toc.yml b/_toc.yml index a68a9b4..6c9e705 100644 --- a/_toc.yml +++ b/_toc.yml @@ -14,3 +14,4 @@ parts: numbered: true chapters: - file: Lecture1 + - file: Lecture2 diff --git a/slides/MLforMaterials_Lecture1_Intro_25.pdf b/slides/MLforMaterials_Lecture1_Intro_25.pdf index 9d82e78..4c4e54d 100644 Binary files a/slides/MLforMaterials_Lecture1_Intro_25.pdf 
and b/slides/MLforMaterials_Lecture1_Intro_25.pdf differ diff --git a/slides/MLforMaterials_Lecture2_Basics_25.pdf b/slides/MLforMaterials_Lecture2_Basics_25.pdf new file mode 100644 index 0000000..cb0ff6f Binary files /dev/null and b/slides/MLforMaterials_Lecture2_Basics_25.pdf differ