{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": [
     "remove_input"
    ]
   },
   "outputs": [],
   "source": [
    "path_data = '../../../../data/'\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import stats\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "plt.style.use('fivethirtyeight')\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HIDDEN \n",
    "galton = pd.read_csv(path_data + 'galton.csv')\n",
    "\n",
    "# Keep only the two height columns and give them shorter names.\n",
    "heights = galton[['midparentHeight', 'childHeight']].rename(\n",
    "    columns={'midparentHeight': 'MidParent', 'childHeight': 'Child'})\n",
    "\n",
    "# The dugong file is tab-separated: pass sep so read_csv yields separate\n",
    "# Age and Length columns rather than a single fused 'Age\\tLength' column.\n",
    "dugong = pd.read_csv('http://www.statsci.org/data/oz/dugongs.txt', sep='\\t')\n",
    "\n",
    "# Length first, then Age, both as floats.\n",
    "dugong = dugong[['Length', 'Age']].astype(float)\n",
    "\n",
    "hybrid = pd.read_csv(path_data + 'hybrid.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": [
     "remove_input"
    ]
   },
   "outputs": [],
   "source": [
    "def standard_units(x):\n",
    "    \"\"\"Return x in standard units: (x - mean of x) / SD of x, using np.mean and np.std.\"\"\"\n",
    "    return (x - np.mean(x))/np.std(x)\n",
    "\n",
    "def correlation(table, x, y):\n",
    "    \"\"\"Return the correlation r between columns x and y of table.\n",
    "\n",
    "    r is the mean of the products of the two columns in standard units.\n",
    "    \"\"\"\n",
    "    x_in_standard_units = standard_units(table[x])\n",
    "    y_in_standard_units = standard_units(table[y])\n",
    "    return np.mean(x_in_standard_units * y_in_standard_units)\n",
    "\n",
    "def slope(table, x, y):\n",
    "    \"\"\"Return the slope of the regression line of y on x: r * SD(y) / SD(x).\"\"\"\n",
    "    r = correlation(table, x, y)\n",
    "    return r * np.std(table[y])/np.std(table[x])\n",
    "\n",
    "def intercept(table, x, y):\n",
    "    \"\"\"Return the intercept of the regression line of y on x: mean(y) - slope * mean(x).\"\"\"\n",
    "    a = slope(table, x, y)\n",
    "    return np.mean(table[y]) - a * np.mean(table[x])\n",
    "\n",
    "def fit(table, x, y):\n",
    "    \"\"\"Return the regression line's fitted value of y for every row of table.\"\"\"\n",
    "    a = slope(table, x, y)\n",
    "    # Derive the intercept from the slope already in hand; calling\n",
    "    # intercept() here would compute the same slope a second time.\n",
    "    b = np.mean(table[y]) - a * np.mean(table[x])\n",
    "    return a * table[x] + b\n",
"\n",
    "def residual(table, x, y):\n",
    "    \"\"\"Return the residuals of the regression of y on x: observed y minus fitted y.\"\"\"\n",
    "    return table[y] - fit(table, x, y)\n",
    "\n",
    "def scatter_fit(table, x, y):\n",
    "    \"\"\"Scatter y against x on the current pyplot figure and overlay the regression line.\"\"\"\n",
    "    plt.scatter(table[x],\n",
    "                table[y],\n",
    "                color='darkblue',\n",
    "                s=20)\n",
    "\n",
    "    plt.plot(table[x], fit(table, x, y), lw=2, color='gold')\n",
    "    plt.xlabel(x)\n",
    "    plt.ylabel(y)\n",
    "\n",
    "def residual_plot(table, x, y):\n",
    "    \"\"\"Plot the residuals of the regression of y on x against x, then show the figure.\n",
    "\n",
    "    A new 7x6 figure is created; a horizontal line at 0 spans the observed range of x.\n",
    "    \"\"\"\n",
    "    fig, ax = plt.subplots(figsize=(7,6))\n",
    "    x_array = table[x]\n",
    "    ax.scatter(x_array, residual(table, x, y), color='r')\n",
    "    # Reference line at zero from the smallest to the largest x value.\n",
    "    xlims = np.array([min(x_array), max(x_array)])\n",
    "    ax.plot(xlims, np.array([0, 0]), color='darkblue', lw=4)\n",
    "    # NOTE(review): the vertical axis shows residuals but is labelled with\n",
    "    # the name of column y -- confirm this label is intended.\n",
    "    ax.set_ylabel(y)\n",
    "    ax.set_xlabel(x)\n",
    "    ax.set_title('Residual Plot')\n",
    "    plt.show()\n",
    "\n",
    "def regression_diagnostic_plots(table, x, y):\n",
    "    \"\"\"Draw the scatter plot with regression line, followed by the residual plot.\"\"\"\n",
    "    scatter_fit(table, x, y)\n",
    "    residual_plot(table, x, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": [
     "remove_input"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | MidParent | \n", "Child | \n", "Fitted Value | \n", "Residual | \n", "
---|---|---|---|---|
0 | \n", "75.43 | \n", "73.2 | \n", "70.712373 | \n", "2.487627 | \n", "
1 | \n", "75.43 | \n", "69.2 | \n", "70.712373 | \n", "-1.512373 | \n", "
2 | \n", "75.43 | \n", "69.0 | \n", "70.712373 | \n", "-1.712373 | \n", "
3 | \n", "75.43 | \n", "69.0 | \n", "70.712373 | \n", "-1.712373 | \n", "
4 | \n", "73.66 | \n", "73.5 | \n", "69.584244 | \n", "3.915756 | \n", "
5 | \n", "73.66 | \n", "72.5 | \n", "69.584244 | \n", "2.915756 | \n", "
6 | \n", "73.66 | \n", "65.5 | \n", "69.584244 | \n", "-4.084244 | \n", "
7 | \n", "73.66 | \n", "65.5 | \n", "69.584244 | \n", "-4.084244 | \n", "
8 | \n", "72.06 | \n", "71.0 | \n", "68.564467 | \n", "2.435533 | \n", "
9 | \n", "72.06 | \n", "68.0 | \n", "68.564467 | \n", "-0.564467 | \n", "