{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "remove_input" ] }, "outputs": [], "source": [ "path_data = '../../../../data/'\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from scipy import stats\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "plt.style.use('fivethirtyeight')\n", "\n", "import warnings\n", "# Suppresses every Python warning for the rest of the session.\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The Regression Line ###\n", "The correlation coefficient $r$ doesn't just measure how clustered the points in a scatter plot are about a straight line. It also helps identify the straight line about which the points are clustered. In this section we will retrace the path that Galton and Pearson took to discover that line.\n", "\n", "Galton's data on the heights of parents and their adult children showed a linear association. The linearity was confirmed when our predictions of the children's heights based on the midparent heights roughly followed a straight line." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "galton = pd.read_csv(path_data + 'galton.csv')\n", "\n", "# Two-column working table: midparent height and child height, under shorter names.\n", "heights = pd.DataFrame(\n", "    {'MidParent': galton['midparentHeight'],\n", "     'Child': galton['childHeight']}\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def predict_child(mpht, half_width=0.5):\n", "    \"\"\"Return a prediction of the height of a child\n", "    whose parents have a midparent height of mpht.\n", "\n", "    The prediction is the mean 'Child' height over the rows of the global\n", "    `heights` table whose 'MidParent' value lies in the half-open interval\n", "    [mpht - half_width, mpht + half_width).\n", "\n", "    mpht: midparent height, in inches, to predict for.\n", "    half_width: half the width of the averaging window, in inches; the\n", "        default of 0.5 gives the one-inch window centered on mpht.\n", "\n", "    Returns NaN when no row of `heights` falls inside the window.\n", "    \"\"\"\n", "    midparent = heights['MidParent']\n", "    # Lower edge is inclusive, upper edge exclusive, so adjacent windows never share a row.\n", "    in_window = (midparent >= mpht - half_width) & (midparent < mpht + half_width)\n", "    return heights.loc[in_window, 'Child'].mean()\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | MidParent | \n", "Child | \n", "Prediction | \n", "
---|---|---|---|
0 | \n", "75.43 | \n", "73.2 | \n", "70.100000 | \n", "
1 | \n", "75.43 | \n", "69.2 | \n", "70.100000 | \n", "
2 | \n", "75.43 | \n", "69.0 | \n", "70.100000 | \n", "
3 | \n", "75.43 | \n", "69.0 | \n", "70.100000 | \n", "
4 | \n", "73.66 | \n", "73.5 | \n", "70.415789 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
929 | \n", "66.64 | \n", "64.0 | \n", "65.156579 | \n", "
930 | \n", "66.64 | \n", "62.0 | \n", "65.156579 | \n", "
931 | \n", "66.64 | \n", "61.0 | \n", "65.156579 | \n", "
932 | \n", "65.27 | \n", "66.5 | \n", "64.229630 | \n", "
933 | \n", "65.27 | \n", "57.0 | \n", "64.229630 | \n", "
934 rows × 3 columns
\n", "\n", " | MidParent SU | \n", "Child SU | \n", "
---|---|---|
0 | \n", "3.454652 | \n", "1.804156 | \n", "
1 | \n", "3.454652 | \n", "0.686005 | \n", "
2 | \n", "3.454652 | \n", "0.630097 | \n", "
3 | \n", "3.454652 | \n", "0.630097 | \n", "
4 | \n", "2.472085 | \n", "1.888017 | \n", "
... | \n", "... | \n", "... | \n", "
929 | \n", "-1.424873 | \n", "-0.767591 | \n", "
930 | \n", "-1.424873 | \n", "-1.326667 | \n", "
931 | \n", "-1.424873 | \n", "-1.606205 | \n", "
932 | \n", "-2.185390 | \n", "-0.068747 | \n", "
933 | \n", "-2.185390 | \n", "-2.724356 | \n", "
934 rows × 2 columns
\n", "\n", " | MidParent SU | \n", "Child SU | \n", "Prediction SU | \n", "
---|---|---|---|
0 | \n", "3.454652 | \n", "1.804156 | \n", "0.937589 | \n", "
1 | \n", "3.454652 | \n", "0.686005 | \n", "0.937589 | \n", "
2 | \n", "3.454652 | \n", "0.630097 | \n", "0.937589 | \n", "
3 | \n", "3.454652 | \n", "0.630097 | \n", "0.937589 | \n", "
4 | \n", "2.472085 | \n", "1.888017 | \n", "1.025864 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
929 | \n", "-1.424873 | \n", "-0.767591 | \n", "-0.444284 | \n", "
930 | \n", "-1.424873 | \n", "-1.326667 | \n", "-0.444284 | \n", "
931 | \n", "-1.424873 | \n", "-1.606205 | \n", "-0.444284 | \n", "
932 | \n", "-2.185390 | \n", "-0.068747 | \n", "-0.703401 | \n", "
933 | \n", "-2.185390 | \n", "-2.724356 | \n", "-0.703401 | \n", "
934 rows × 3 columns
\n", "\n", " | MidParent | \n", "Child | \n", "Prediction | \n", "
---|---|---|---|
33 | \n", "70.48 | \n", "74.0 | \n", "67.634239 | \n", "
34 | \n", "70.48 | \n", "70.0 | \n", "67.634239 | \n", "
35 | \n", "70.48 | \n", "68.0 | \n", "67.634239 | \n", "
\n", " | MidParent | \n", "Child | \n", "Prediction | \n", "Regression Prediction | \n", "
---|---|---|---|---|
0 | \n", "75.43 | \n", "73.2 | \n", "70.100000 | \n", "70.712373 | \n", "
1 | \n", "75.43 | \n", "69.2 | \n", "70.100000 | \n", "70.712373 | \n", "
2 | \n", "75.43 | \n", "69.0 | \n", "70.100000 | \n", "70.712373 | \n", "
3 | \n", "75.43 | \n", "69.0 | \n", "70.100000 | \n", "70.712373 | \n", "
4 | \n", "73.66 | \n", "73.5 | \n", "70.415789 | \n", "69.584244 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
929 | \n", "66.64 | \n", "64.0 | \n", "65.156579 | \n", "65.109971 | \n", "
930 | \n", "66.64 | \n", "62.0 | \n", "65.156579 | \n", "65.109971 | \n", "
931 | \n", "66.64 | \n", "61.0 | \n", "65.156579 | \n", "65.109971 | \n", "
932 | \n", "65.27 | \n", "66.5 | \n", "64.229630 | \n", "64.236786 | \n", "
933 | \n", "65.27 | \n", "57.0 | \n", "64.229630 | \n", "64.236786 | \n", "
934 rows × 4 columns
\n", "\n", " | MidParent | \n", "Child | \n", "Fitted | \n", "
---|---|---|---|
0 | \n", "75.43 | \n", "73.2 | \n", "70.712373 | \n", "
1 | \n", "75.43 | \n", "69.2 | \n", "70.712373 | \n", "
2 | \n", "75.43 | \n", "69.0 | \n", "70.712373 | \n", "
3 | \n", "75.43 | \n", "69.0 | \n", "70.712373 | \n", "
4 | \n", "73.66 | \n", "73.5 | \n", "69.584244 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
929 | \n", "66.64 | \n", "64.0 | \n", "65.109971 | \n", "
930 | \n", "66.64 | \n", "62.0 | \n", "65.109971 | \n", "
931 | \n", "66.64 | \n", "61.0 | \n", "65.109971 | \n", "
932 | \n", "65.27 | \n", "66.5 | \n", "64.236786 | \n", "
933 | \n", "65.27 | \n", "57.0 | \n", "64.236786 | \n", "
934 rows × 3 columns
\n", "