{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "#matplotlib.use('Agg')\n",
    "path_data = '../../../../data/'\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "import scipy.stats as stats\n",
    "plt.style.use('fivethirtyeight')\n",
    "\n",
    "import warnings\n",
    "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(x):\n",
    "    \"\"\"Convert an array of numbers to standard units.\n",
    "\n",
    "    Subtracts the mean of `x` from every value and divides the result\n",
    "    by `np.std(x)`.\n",
    "    \"\"\"\n",
    "    return (x - np.mean(x)) / np.std(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step by step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance(point1, point2):\n",
    "    \"\"\"The distance between two arrays of numbers.\"\"\"\n",
    "    return np.sqrt(np.sum((point1 - point2)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.5"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sqrt(np.sum((0 - 1.5)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def all_distances(training, point):\n",
    "    \"\"\"The distance from `point` to every row of `training`.\n",
    "\n",
    "    training: DataFrame of numeric attribute columns plus a 'Class' column.\n",
    "    point: array of numbers, one per attribute column, in column order.\n",
    "    Returns a Series of distances that shares the index of `training`.\n",
    "    \"\"\"\n",
    "    # 'Class' is a label, not a coordinate. `columns=` is required:\n",
    "    # a bare drop('Class') looks for a *row* labelled 'Class' and raises KeyError.\n",
    "    attributes = training.drop(columns='Class')\n",
    "\n",
    "    def distance_from_point(row):\n",
    "        return distance(point, np.array(row))\n",
    "\n",
    "    # axis=1 hands each row (rather than each column or each cell) to the helper.\n",
    "    return attributes.apply(distance_from_point, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def table_with_distances(training, point):\n",
    "    \"\"\"A copy of the training table with a 'Distance' column holding the distance from each row to `point`.\"\"\"\n",
    "    # assign() builds a new DataFrame, so the caller's table is left untouched and\n",
    "    # a repeated call never mistakes an old 'Distance' column for an attribute.\n",
    "    return training.assign(Distance=all_distances(training, point))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def closest(training, point, k):\n",
    "    \"\"\"A table containing the k rows of the training table closest to `point`, nearest first.\"\"\"\n",
    "    with_dists = table_with_distances(training, point)\n",
    "    sorted_by_distance = with_dists.sort_values(by='Distance')\n",
    "    # head(k) returns every row when k exceeds the table size instead of raising.\n",
    "    return sorted_by_distance.head(k)"
   ]
  },
  { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeBlood PressureSpecific GravityAlbuminSugarRed Blood CellsPus CellPus Cell clumpsBacteriaGlucose...Packed Cell VolumeWhite Blood Cell CountRed Blood Cell CountHypertensionDiabetes MellitusCoronary Artery DiseaseAppetitePedal EdemaAnemiaClass
048701.00540normalabnormalpresentnotpresent117...3267003.9yesnonopooryesyes1
153901.02020abnormalabnormalpresentnotpresent70...29121003.7yesyesnopoornoyes1
263701.01030abnormalabnormalpresentnotpresent380...3245003.8yesyesnopooryesno1
368801.01032normalabnormalpresentpresent157...16110002.6yesyesyespooryesno1
461801.01520abnormalabnormalnotpresentnotpresent173...2492003.2yesyesyespooryesyes1
..................................................................
15355801.02000normalnormalnotpresentnotpresent140...4767004.9nononogoodnono0
15442701.02500normalnormalnotpresentnotpresent75...5478006.2nononogoodnono0
15512801.02000normalnormalnotpresentnotpresent100...4966005.4nononogoodnono0
15617601.02500normalnormalnotpresentnotpresent114...5172005.9nononogoodnono0
15758801.02500normalnormalnotpresentnotpresent131...5368006.1nononogoodnono0
\n", "

158 rows × 25 columns

\n", "
" ], "text/plain": [ " Age Blood Pressure Specific Gravity Albumin Sugar Red Blood Cells \\\n", "0 48 70 1.005 4 0 normal \n", "1 53 90 1.020 2 0 abnormal \n", "2 63 70 1.010 3 0 abnormal \n", "3 68 80 1.010 3 2 normal \n", "4 61 80 1.015 2 0 abnormal \n", ".. ... ... ... ... ... ... \n", "153 55 80 1.020 0 0 normal \n", "154 42 70 1.025 0 0 normal \n", "155 12 80 1.020 0 0 normal \n", "156 17 60 1.025 0 0 normal \n", "157 58 80 1.025 0 0 normal \n", "\n", " Pus Cell Pus Cell clumps Bacteria Glucose ... Packed Cell Volume \\\n", "0 abnormal present notpresent 117 ... 32 \n", "1 abnormal present notpresent 70 ... 29 \n", "2 abnormal present notpresent 380 ... 32 \n", "3 abnormal present present 157 ... 16 \n", "4 abnormal notpresent notpresent 173 ... 24 \n", ".. ... ... ... ... ... ... \n", "153 normal notpresent notpresent 140 ... 47 \n", "154 normal notpresent notpresent 75 ... 54 \n", "155 normal notpresent notpresent 100 ... 49 \n", "156 normal notpresent notpresent 114 ... 51 \n", "157 normal notpresent notpresent 131 ... 53 \n", "\n", " White Blood Cell Count Red Blood Cell Count Hypertension \\\n", "0 6700 3.9 yes \n", "1 12100 3.7 yes \n", "2 4500 3.8 yes \n", "3 11000 2.6 yes \n", "4 9200 3.2 yes \n", ".. ... ... ... \n", "153 6700 4.9 no \n", "154 7800 6.2 no \n", "155 6600 5.4 no \n", "156 7200 5.9 no \n", "157 6800 6.1 no \n", "\n", " Diabetes Mellitus Coronary Artery Disease Appetite Pedal Edema Anemia \\\n", "0 no no poor yes yes \n", "1 yes no poor no yes \n", "2 yes no poor yes no \n", "3 yes yes poor yes no \n", "4 yes yes poor yes yes \n", ".. ... ... ... ... ... \n", "153 no no good no no \n", "154 no no good no no \n", "155 no no good no no \n", "156 no no good no no \n", "157 no no good no no \n", "\n", " Class \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", ".. ... 
\n", "153 0 \n", "154 0 \n", "155 0 \n", "156 0 \n", "157 0 \n", "\n", "[158 rows x 25 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "# rename() returns a new frame, so re-running this cell always starts from the file on disk.\n",
    "ckd = pd.read_csv('ckd.csv').rename(columns={'Blood Glucose Random': 'Glucose'})\n",
    "ckd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The three numeric attributes are converted to standard units.\n",
    "# 'Class' is cast to str so it matches the str keys of the color table it is merged with.\n",
    "ckd_su = pd.DataFrame({\n",
    "    'Hemoglobin': standard_units(ckd['Hemoglobin']),\n",
    "    'Glucose': standard_units(ckd['Glucose']),\n",
    "    'White Blood Cell Count': standard_units(ckd['White Blood Cell Count']),\n",
    "    'Class': ckd['Class'].astype(str),\n",
    "})\n",
    "\n",
    "ckd_su"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
   "source": [
    "color_table = pd.DataFrame(\n",
    "    {'Class':np.array([1, 0]),\n",
    "     'Color':np.array(['darkblue', 'gold'])}, index=np.array([1,0]))\n",
    "\n",
    "color_table['Class'] = color_table['Class'].astype(str)\n",
    "\n",
    "ckd_combined = pd.merge(ckd_su, color_table, on='Class')\n",
    "\n",
    "ckd_combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scatter_by_class(ax):\n",
    "    \"\"\"Draw Glucose against Hemoglobin from `ckd_combined` on `ax`, one color per class.\"\"\"\n",
    "    for color in ['darkblue', 'gold']:\n",
    "        group = ckd_combined[ckd_combined['Color'] == color]\n",
    "        ax.scatter(group['Hemoglobin'],\n",
    "                   group['Glucose'],\n",
    "                   label='Color=' + color,\n",
    "                   color=color)\n",
    "    ax.set_xlabel('Hemoglobin')\n",
    "    ax.set_ylabel('Glucose')\n",
    "    ax.legend(loc='upper left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(7,6))\n",
    "scatter_by_class(ax)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_closest(point):\n",
    "    \"\"\"Plot the data with `point` in red, joined to its nearest neighbor by a black line.\n",
    "\n",
    "    point = np.array([x, y]) gives the (Hemoglobin, Glucose) coordinates\n",
    "    of a new point, on the same standardized scale as `ckd_su`.\n",
    "    \"\"\"\n",
    "    # Only Hemoglobin and Glucose are plotted, so only they may enter the distance.\n",
    "    # drop() returns a new frame, so `ckd_su` itself is never modified.\n",
    "    hemo_glucose = ckd_su.drop(columns=['White Blood Cell Count'])\n",
    "\n",
    "    nearest = closest(hemo_glucose, point, 1)\n",
    "    # Look the coordinates up by column name rather than by position.\n",
    "    x_closest = nearest['Hemoglobin'].iloc[0]\n",
    "    y_closest = nearest['Glucose'].iloc[0]\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(7, 6))\n",
    "    scatter_by_class(ax)\n",
    "    ax.scatter(point.item(0), point.item(1), color='red', s=30)\n",
    "    ax.plot(np.array([point.item(0), x_closest]),\n",
    "            np.array([point.item(1), y_closest]),\n",
    "            color='k', lw=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "alice = np.array([0, 1.5])\n",
    "show_closest(alice)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}