{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression using `statsmodels`" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/data/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", " from pandas.core import datetools\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "\n", "%matplotlib inline\n", "\n", "sns.set(rc={\n", " 'figure.figsize': (8, 6),\n", " 'font.size': 14\n", "})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the British Crime Survey 2007-2008 dataset." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "bcs = pd.read_csv('https://github.com/estimand/teaching-datasets/raw/master/british-crime-survey/bcs.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idsexagemarital_statusethnic_origineducationpaid_work_last_wkyears_in_areasame_address_last_yeartenure...worry_rapeworry_assaultworry_insultworry_racismcrime_change_two_yrspersonal_crimeproperty_crimeantisocial_behaviournuisance_neighboursvictim_last_yr
061302140F36.0MarriedWhiteNoneTrue[10, 20)NaNBuying with mortgage/loan...Not very worriedNot very worriedFairly worriedNot very worriedA little more crimeNaNNaN1.1177002.212788False
161384060M44.0SeparatedWhiteApprenticeship or A/AS LevelTrue[0, 1)FalseRent...Not at all worriedNot very worriedNot very worriedNot very worriedNaN-0.3892742.1398111.791787-1.024336False
263684260M43.0MarriedWhiteO Level / GCSETrue[2, 3)NaNBuying with mortgage/loan...Not at all worriedNot very worriedNot very worriedNot very worriedNaN-0.3207770.506092NaNNaNTrue
363790220F27.0SingleBlack or Black BritishApprenticeship or A/AS LevelTrue[0, 1)FalseRent...Fairly worriedFairly worriedNot very worriedNot very worriedNaN0.4646051.6023070.4629700.822929True
463843180M38.0MarriedWhiteDegree or DiplomaTrue[10, 20)NaNBuying with mortgage/loan...Not at all worriedNot very worriedNot very worriedNot very worriedA lot more crime-0.117556-0.6575550.137419-0.780199True
\n", "

5 rows × 33 columns

\n", "
" ], "text/plain": [ " case_id sex age marital_status ethnic_origin \\\n", "0 61302140 F 36.0 Married White \n", "1 61384060 M 44.0 Separated White \n", "2 63684260 M 43.0 Married White \n", "3 63790220 F 27.0 Single Black or Black British \n", "4 63843180 M 38.0 Married White \n", "\n", " education paid_work_last_wk years_in_area \\\n", "0 None True [10, 20) \n", "1 Apprenticeship or A/AS Level True [0, 1) \n", "2 O Level / GCSE True [2, 3) \n", "3 Apprenticeship or A/AS Level True [0, 1) \n", "4 Degree or Diploma True [10, 20) \n", "\n", " same_address_last_year tenure ... \\\n", "0 NaN Buying with mortgage/loan ... \n", "1 False Rent ... \n", "2 NaN Buying with mortgage/loan ... \n", "3 False Rent ... \n", "4 NaN Buying with mortgage/loan ... \n", "\n", " worry_rape worry_assault worry_insult worry_racism \\\n", "0 Not very worried Not very worried Fairly worried Not very worried \n", "1 Not at all worried Not very worried Not very worried Not very worried \n", "2 Not at all worried Not very worried Not very worried Not very worried \n", "3 Fairly worried Fairly worried Not very worried Not very worried \n", "4 Not at all worried Not very worried Not very worried Not very worried \n", "\n", " crime_change_two_yrs personal_crime property_crime antisocial_behaviour \\\n", "0 A little more crime NaN NaN 1.117700 \n", "1 NaN -0.389274 2.139811 1.791787 \n", "2 NaN -0.320777 0.506092 NaN \n", "3 NaN 0.464605 1.602307 0.462970 \n", "4 A lot more crime -0.117556 -0.657555 0.137419 \n", "\n", " nuisance_neighbours victim_last_yr \n", "0 2.212788 False \n", "1 -1.024336 False \n", "2 NaN True \n", "3 0.822929 True \n", "4 -0.780199 True \n", "\n", "[5 rows x 33 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bcs.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define predictors and response." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "predictors = ['sex', 'age', 'safety_walk_night']\n", "response = 'victim_last_yr'\n", "all_vars = predictors + [response]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove missing values." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "bcs.dropna(subset=all_vars, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## EDA" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfQAAAF0CAYAAADVZstSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAG01JREFUeJzt3XtU1HX+x/HXMCMKiAoGgbfNy7El\nW3XPTw9xOrmJK2ppQnk52tEj5rrrWWNNoxJP5nrUjt22NM8p3ZOXTDNNwbX1kmj2x1prZXW6aGu7\nmnZkcHFARBSB+f3BT36ZpiMx82XePB9/DcPwnTfo8JzP9zvM1+X3+/0CAABhLcLpAQAAwM9H0AEA\nMICgAwBgAEEHAMAAgg4AgAEEHQAAAzxOD/BznDpV7vQIAACETEJC7E9+jhU6AAAGEHQAAAwg6AAA\nGEDQAQAwgKADAGAAQQcAwACCDgCAAQQdAAADCDoAAAYQdAAADCDoAAAYQNABADCAoAMAYEBYn20N\ndd566w0dOPCh02PckIqKCklSTEyMw5MErn//VI0Z86DTYwDAVQVthT579mylpaVp+PDh9deVlpYq\nOztbGRkZys7OVllZmSTJ7/drwYIFGjx4sEaMGKEvv/wyWGOhiaiquqCqqgtOjwEAZrj8fr8/GBs+\ncOCAoqOj9fjjj2vbtm2SpGeeeUbt2rXT1KlTtXz5cpWVlSk3N1f79u3T66+/rhUrVuizzz7TwoUL\ntXHjxuveB+dDD1+5uTmSpGefXeLwJAAQPhw5H3r//v3Vtm3by64rLCxUZmamJCkzM1O7d+++7HqX\ny6W+ffvqzJkzKi4uDtZoAACYE9Jj6CUlJUpMTJQkJSYm6vTp05Ikr9erpKSk+tslJSXJ6/XW3/an\nxMVFy+NxB29gBI3bXfdc8lrPNgEAgWsSL4q72l5/l8t13a/z+c4FYxyEQE1NrSQOmwDAjXBkl/vV\ntG/fvn5XenFxseLj4yXVrciLiorqb1dUVHTd1TkAAPh/IQ16enq68vPzJUn5+fkaNGjQZdf7/X59\n+umnio2NJegAANyAoO1ynzlzpv75z3/K5/NpwIABevjhhzV16lTNmDFDmzZtUnJysl566SVJ0m9+\n8xvt27dPgwcPVlRUlBYtWhSssQAAMClof7YWChx/DV/82RoA3LgmcwwdAAAEB0EHAMAAgg4AgAEE\nHQAAAwg6AAAGEHQAAAwg6AAAGEDQAaAZOXToKx069JXTYyAImsTJWQAAoVFQ8LYk6Ze/vM3hSdDY\nWKEDQDNx6NBXOnz4ax0+/DWrdIMIOgA0E5dW5z++DBsIOgAABhB0AGgmRo584KqXYQMvigOAZuKX\nv7xNt96aUn8ZthB0AGhGfv3r/3F6BAQJu9wBoBk5ePBjHTz4sdNjIAgIOgA0E/zZmm0EHQCaCf5s\nzTaCDgCAAQQdAJoJ/mzNNoIOAIABBB0AmgmOodtG0AEAMICgA0Az0aFDx6tehg0EHQCaiQ8++MdV\nL8MGgg4AgAEEHQCaiZEj77/qZdhA0AGgmcjIuEdRUdGKiopWRsY9To+DRsbZ1gCgGWFlbhdBB4Bm\nhJW5XexyBwDAAIIOAIABBB0AAAMIOgAABhB0AAAM4FXuANBAb731hg4c+NDpMW5IRUWFJCkmJsbh\nSW5M//6pGjPmQafHaNJYoQNAM1JVdUFVVRecHgNBwAodABpozJgHw27VmJubI0l69tklDk+CxsYK\nHQAAAwg6AAAGEHQAAAwg6AAAGEDQAQAwgKADAGAAQQcAwACCDgCAAQQdAAADCDoAAAYQdAAADCDo\nAAAYQNABADCAoAMAYABBBwDAAIIOAIABBB0AAAMIOgAABhB0AAAMIOgAABhA0AEAMICgAwBggMeJ\nO121apU2btwol8ulnj176umnn1ZxcbFmzpypsrIy3XbbbXrmmWcUGRnpxHgAAISdkK/QvV6v1qxZ\no7ffflvbtm1TTU2N3nnnHT333HOaNGmSdu3apTZt2mjTpk2hHg0AgLDlyC73mpoanT9/XtXV1Tp/\n/rwSEhL0wQcfaMiQIZKkrKwsFRYWOjEaAABhKeS73G+++WZNnjxZAwcOVMuWLXXnnXeqV69eatOm\njTyeunGSkpLk9XpDPRoAAGEr5EEvKytTYWGhCgsLFRsbqz/96U96//33r7idy+W67rbi4qLl8biD\nMSaCzO2u2zmUkBDr8CRA88Jjz66QB/0f//iHOnXqpPj4eElSRkaGDh48qDNnzqi6uloej0dFRUVK\nTEy87rZ8vnPBHhdBUlNTK0k6darc4UmA5oXHXni71hOxkB9D79Chgz777DNVVlbK7/dr//796tGj\nh1JTU7Vz505J0pYtW5Senh7q0QAACFshX6H36dNHQ4YMUVZWljwej1JSUjR27FjdfffdeuSRR/Ti\niy8qJSVFo0ePDvVoAACELUf+Dj0nJ0c5OTmXXde5c2f+VA0AgAbineIAADCAoAMAYABBBwDAAIIO\nAIABBB0AAAMIOgAABhB0AAAMIOgAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAAAwg6AAAGEHQA\nAAwg6AAAGEDQAQAwgKADAGAAQQcAwACCDgCAAQQdAAADCDoAAAYQdAAADCDoAAAYQNABADCAoAMA\nYABBBwDAAIIOAIABBB0AAAMIOgAABhB0AAAMIOgAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAA\nAwg6AAAGEHQAAAzwOD1AU7No0Tz5fKedHsO8Sz/j3NwchyexLS4uXnl585weA0AIEPQf8flOq6Sk\nRK4WUU6PYpr//3YOnT5zzuFJ7PJfrHR6BAAhRNCvwtUiSq173Of0GMDPcvbIVqdHABBCHEMHAMAA\ngg4AgAEEHQAAAwg6AAAGEHQAAAwg6AAAGEDQAQAwgKADAGAAQQcAwACCDgCAAQQdAAADCDoAAAYQ\ndAAADCDoAAAYQNABADCAoAMAYABBBwDAAIIOAIABBB0AAAMcCfqZM2eUk5OjoUOHatiwYTp48KBK\nS0uVnZ2tjIwMZWdnq6yszInRAAAIS44EfeHChbrrrru0Y8cOFRQUqHv37lq+fLnS0tK0a9cupaWl\nafny5U6MBgBAWAoo6EePHtW4ceOUnp4uSfryyy+1dOnSBt3h2bNndeDAAY0aNUqSFBkZqTZt2qiw\nsFCZmZmSpMzMTO3evbtB2wcAoDkKKOjz5s3TtGnTFBsbK0lKSUnRjh07GnSHx48fV3x8vGbPnq3M\nzEzNmTNH586dU0lJiRITEyVJiYmJOn36dIO2DwBAc+QJ5Ebl5eUaMGCAXnjhBUlSRESEWrRo0aA7\nrK6u1ldffaUnn3xSffr00YIFCxq8ez0uLloej7tBX/tT3G5eJwg73O4IJSTEOj0GmpBLv+P4f2FP\nQEF3u926ePGiXC6XJMnr9SoiomHhS0pKUlJSkvr06SNJGjp0qJYvX6727duruLhYiYmJKi4uVnx8\n/HW35fOda9AM11JTU9vo2wScUlNTq1Onyp0eA03Ipd9x/L8IT9d6IhZQlcePH6/p06fL5/Np6dKl\nGj9+vCZPntzAYRKUlJSkf//735Kk/fv3q3v37kpPT1d+fr4kKT8/X4MGDWrQ9gEAaI4CWqFnZmaq\nU6dO2rt3ryorK7V48WL169evwXf65JNP6tFHH9XFixfVuXNnPf3006qtrdWMGTO0adMmJScn66WX\nXmrw9gEAaG4CCrok9evX72dF/IdSUlK0efPmK65fvXp1o2wfAIDmJqCgP/DAA/XHzy+JjY1V3759\nNWXKFMXExARlOAAAEJiAjqGnpaUpOTlZ06ZN07Rp09ShQwfdfvvt8nq9mjdvXpBHBAAA1xPQCv3A\ngQPasGFD/ccDBw7UpEmTtHr1at1zzz1BGw4AAAQmoBW6z+fThQsX6j+uqqqS1+uVy+VSq1atgjYc\nAAAITEAr9GHDhmns2LEaNmyYXC6Xtm/friFDhqiiokIdO3YM9owAAOA6Agr6I488or59++rDDz+U\nJOXk5GjgwIGSpJdffjl40wEAgIAEtMu9vLxcn3zyib799lt9+eWXWrlypSZOnBjs2QAAQIACCnpe\nXp7cbreOHj2qsWPHyu12q3fv3sGeDQAABCigoB87dkwzZsxQq1atNHz4cL366qv64osvgj0bAAAI\nUEBBj4yMlCS1aNFCpaWlatGihYqKioI6GAAACFxAL4q75ZZbVFpaqhEjRmjs2LGKjY1VSkpKsGcD\nAAABCijozz33nCQpOztbv/rVr+rPjw4AAJqGgE/OckljnaAFAAA0noCOoQMAgKaNoAMAYMAN73IH\ngGBZtGiefL7TTo9h2qWfb25ujsOT2BYXF6+8vHkhvU+CDqDJ8PlOq+T0fxURxa+mYKmN8EuSfJWl\nDk9iV21ltSP3y6MGQJMSEeVR3NAuTo8BNJhvx3eO3C/H0AEAMICgAwBgAEEHAMAAjqH/SEVFhfwX\nz+vska1OjwL8LP6Llaqo8Ds9BoAQYYUOAIABrNB/JCYmRhdqXGrd4z6nRwF+lrNHtiomJtrpMQCE\nCCt0AAAMIOgAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAAAwg6AAAGEHQAAAwg6AAAGEDQAQAw\ngKADAGAAQQcAwACCDgCAAQQdAAADCDoAAAYQdAAADCDoAAAYQNABADCAoAMAYABBBwDAAIIOAIAB\nBB0AAAMIOgAABhB0AAAMIOgAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAAAwg6AAAGEHQAAAwg\n6AAAGEDQAQAwgKADAGCAx+kBAOCSiooK1V6olm/Hd06PAjRYbWW1KmorQn6/jq3Qa2pqlJmZqd//\n/veSpOPHj2v06NHKyMjQjBkzVFVV5dRoAACEHcdW6GvWrFH37t119uxZSdJzzz2nSZMm6d5779Xc\nuXO1adMmjR8/3qnxADggJiZGVREXFTe0i9OjAA3m2/GdYqJiQn6/jqzQi4qK9N5772nUqFGSJL/f\nrw8++EBDhgyRJGVlZamwsNCJ0QAACEuOrNAXLVqk3NxcVVTUHWPw+Xxq06aNPJ66cZKSkuT1eq+7\nnbi4aHk87kadze3mdYKww+2OUEJCrNNjBIzHH6xw4rEX8qDv3btX8fHxuv322/Xhhx/+5O1cLtd1\nt+XznWvM0SRJNTW1jb5NwCk1NbU6darc6TECxuMPVgTrsXetJwkhD/onn3yiPXv26P3339eFCxd0\n9uxZLVy4UGfOnFF1dbU8Ho+KioqUmJgY6tEAAAhbId+/NWvWLL3//vvas2ePXnjhBd1xxx16/vnn\nlZqaqp07d0qStmzZovT09FCPBgBA2GoyB6xyc3O1cuVKDR48WKWlpRo9erTTIwEAEDYcfWOZ1NRU\npaamSpI6d+6sTZs2OTkOAABhq8ms0AEAQMPx1q9X4b9YqbNHtjo9hmn+mrp3AnS5Ix2exC7/xUpJ\n0U6PASBECPqPxMXFOz1Cs+DznZckxbUhOMETzf9noBkh6D+SlzfP6RGahdzcHEnSs88ucXgSALCB\nY+gAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAAAwg6AAAGEHQAAAwg6AAAGEDQAQAwgKADAGAA\nQQcAwACCDgCAAQQdAAADCDoAAAYQdAAADCDoAAAYQNABADCAoAMAYABBBwDAAIIOAIABBB0AAAMI\nOgAABhB0AAAM8Dg9AAD8UG1ltXw7vnN6DLNqq2okSRGRbocnsau2slqKCv39EnQATUZcXLzTI5jn\nO39akhQX1c7hSQyLcub/MkEH0GTk5c1zegTzcnNzJEnPPrvE4UnQ2DiGDgCAAQQdAAADCDoAAAYQ\ndAAADCDoAAAYQNABADCAoAMAYABBBwDAAIIOAIABBB0AAAMIOgAABhB0AAAMIOgAABhA0AEAMICg\nAwBgAEEHAMAAgg4AgAEEHQAAAwg6AAAGEHQAAAwg6AAAGEDQAQAwgKADAGAAQQcAwACCDgCAAQQd\nAAADCDoAAAYQdAAADCDoAAAYQNABADAg5EE/efKkJkyYoGHDhunee+/V6tWrJUmlpaXKzs5WRkaG\nsrOzVVZWFurRAAAIWyEPutvt1hNPPKHt27drw4YNWrdunY4cOaLly5crLS1Nu3btUlpampYvXx7q\n0QAACFshD3piYqJ69eolSWrdurW6desmr9erwsJCZWZmSpIyMzO1e/fuUI8GAEDY8jh55ydOnNDX\nX3+tPn36qKSkRImJiZLqon/69Onrfn1cXLQ8Hnewx0QQuN11zyUTEmIdngRoXnjs2eVY0CsqKpST\nk6O8vDy1bt26Qdvw+c418lQIlZqaWknSqVPlDk8CNC889sLbtZ6IOfIq94sXLyonJ0cjRoxQRkaG\nJKl9+/YqLi6WJBUXFys+Pt6J0QAACEshD7rf79ecOXPUrVs3ZWdn11+fnp6u/Px8SVJ+fr4GDRoU\n6tEAAAhbId/l/vHHH6ugoEA9e/bUyJEjJUkzZ87U1KlTNWPGDG3atEnJycl66aWXQj0aAABhK+RB\n79evnw4fPnzVz136m3QAAHBjeKc4AAAMIOgAABhA0AEAMICgAwBgAEEHAMAAgg4AgAEEHQAAA1x+\nv9/v9BANxXsR13nrrTd04MCHTo9xQ3y+upPvxMWFz1v89u+fqjFjHnR6DDQhPPZCh8dfnWu9l7uj\nZ1tD8xUZ2dLpEYBmiceeXazQAQAIE03ubGsAAKBxEXQAAAwg6AAAGEDQAQAwgKADAGAAQQcAwACC\nDgCAAQQdAAADCDoAAAYQdAAADCDoAAAYQNABADCAoAMAYEBYn20NAADUYYUOAIABBB0AAAMIOgAA\nBhB0AAAMIOgAABhA0AEAMMDj9ACwIyUlRT179qz/eNmyZerUqdNVb3vixAn94Q9/0LZt20I1HmCW\nz+fTpEmTJEn//e9/FRERofj4eEnSxo0bFRkZ6eB0CBWCjkbTqlUrFRQUOD0G0OzExcXVP/aWLl2q\n6OhoPfTQQ5fdxu/3y+/3KyKCHbNW8S+LoDpx4oTGjx+vrKwsZWVl6ZNPPrniNv/61780atQojRw5\nUiNGjNDRo0clSQUFBfXXz507VzU1NSGeHghvx44d0/DhwzV37lxlZWXp5MmT6tevX/3n33nnHc2Z\nM0dS3cp++vTpuv/++zVq1Ch9+umnTo2NBmKFjkZz/vx5jRw5UpLUqVMnLVu2TO3bt9fKlSvVsmVL\nHT16VDNnztTmzZsv+7o333xTEydO1H333aeqqirV1tbq22+/1fbt27V+/Xq1aNFC8+bN09/+9jdl\nZmY68a0BYevIkSNatGiR5s+fr+rq6p+83YIFCzRlyhT17duXQ2JhiqCj0Vxtl3t1dbXmz5+vQ4cO\nKSIion71/UN9+/bVK6+8oqKiImVkZOiWW27R/v379cUXX2jUqFGS6p4stG/fPhTfBmBKly5d1Lt3\n7+vebv/+/frPf/5T/3FZWZnOnz+vVq1aBXM8NCKCjqBatWqVbrrpJhUUFKi2tvaqv1hGjBihPn36\n6L333tNDDz2kBQsWyO/3KysrS7NmzXJgasCOqKio+ssRERH64ek7Lly4UH/Z7/fzArowxzF0BFV5\nebkSEhIUERGhgoKCqx4HP378uDp37qyJEycqPT1dhw8fVlpamnbu3KmSkhJJUmlpqb7//vtQjw+Y\nEhERobZt2+ro0aOqra3Vu+++W/+5tLQ0rVu3rv7jr7/+2okR8TOwQkdQjR8/Xg8//LB27Nih1NRU\nRUdHX3Gbv//979q6das8Ho9uuukm/fGPf1S7du00Y8YMTZ48WbW1tWrRooXmzp2rjh07OvBdAHY8\n+uijmjJlipKTk9WjRw9VVVVJkp566inNmzdPb7/9tmpqapSamqqnnnrK4WlxIzh9KgAABrDLHQAA\nAwg6AAAGEHQAAAwg6AAAGEDQAQAwgKADAGAAQQfC1Pr167Vq1apr3ubEiRPasGHDZdf97ne/03ff\nfdcoMyxdulSLFy9u8NevWrWq/s2DAPw8BB0IU+PGjas/B/ZP+f77768I+ooVK9SlS5cgTha4NWvW\nNFrQa2trxdtqoDnjneKAJm7ZsmUqKytTXl6eJMnn82no0KG6//77JUmPP/64JOnVV1/Vtm3b5HK5\nFB0drXXr1mn+/Pk6ceKERo4cqV/84hdasmSJ0tPT9corr6hnz56aMGGCevXqpc8//1zff/+9Jk6c\nqJtvvllr165VcXGxcnNzNWzYsIDmPHz4sP785z+rsrJSFy5c0JgxY+qfcGzYsEGrVq1SZGSkamtr\n9eKLL2rXrl0qLi5WTk6OWrZsqeeff149evS4YrsrVqzQyZMnNXfuXEl1p/m87777VFhYqL/+9a86\nduyYzp07p+PHj2vt2rVq27btz/2RA2GJoANNXFZWlsaMGaPHHntMHo9H27ZtU3p6uqKjo3Xu3DlJ\n0pYtW7Rnzx6tX79erVu3ls/nU0REhObOnavFixdfccraHyoqKtLatWt16tQpZWRkaNKkSXrzzTf1\n+eefa/r06QEHvWPHjvXRrqio0OjRo3XXXXepe/fueuaZZ7Rt2zYlJyerqqpKNTU1mjZtmjZu3Kgl\nS5aoZ8+eP7ndMWPG6J577tGsWbMUExOjDRs2aPjw4fUnHfnoo4+0efNmxcfH38BPFbCHXe5AE9eh\nQwd1795d+/btk1QX7wceeOCy2+zdu1fjxo1T69atJUlxcXEBb3/o0KGKiIjQzTffrHbt2um3v/2t\nJKlXr17yer2XnZHrWs6fP6+8vDyNGDFC48aNU3FxsQ4dOiRJuuOOOzR79my9/vrr8nq9l50B7Hra\ntm2r9PR0FRQUqLq6Whs3btS4cePqPz9gwABiDoigA2EhKytL+fn5+uabb1ReXq5+/fo12rZbtmxZ\nf9ntdtd/7Ha7JdWd0z4QL7zwghISErRlyxZt3bpVvXv3rn8y8PLLL2vmzJmqrKzUxIkT65+cBGrC\nhAlav369CgsL1b17d3Xt2rX+czExMTe0LcAqgg6EgSFDhujAgQN67bXXlJWVdcXnBw4cqPXr1+vs\n2bOS6o6zS1Lr1q3rrwu28vJyJSUlyePx6JtvvtFHH30kqe4JwfHjx9W7d29NnTpVd955Z/2pOWNi\nYlReXn7dbffs2VPt2rXTokWLNH78+KB+H0C44hg6EAaioqI0aNAgbd68WYWFhVd8PjMzU16vV2PH\njpXb7VZMTIzeeOMN3XrrreratauGDx+ubt26acmSJUGbcdq0aXrssce0detWdenSRf3795dU9+rz\nJ554QuXl5XK5XEpOTtasWbMkSRMnTlReXp5atWr1ky+Ku2T06NH6y1/+orvvvjto3wMQzjh9KoCw\nMGfOHHXt2lVTpkxxehSgSWKXO4Amzev1asiQITp27JgefPBBp8cBmixW6ACuqaSkRJMnT77i+sGD\nB2v69OmNdj8vv/yy3n333Suuf+2119S+fftGux/AKoIOAIAB7HIHAMAAgg4AgAEEHQAAAwg6AAAG\nEHQAAAz4X2IN5wVgupxEAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.boxplot(x='victim_last_yr', y='age', data=bcs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Odds and odds ratios" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Probability of having experienced crime by sex:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "p_men = bcs[bcs['sex'] == 'M']['victim_last_yr'].mean()\n", "p_women = bcs[bcs['sex'] == 'F']['victim_last_yr'].mean()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.20366868381240544" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p_men" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.20199304017715913" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p_women" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Corresponding odds:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "odds_men = p_men / (1 - p_men)\n", "odds_women = p_women / (1 - p_women)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25575872714319636" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_men" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25312190287413283" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "odds_men = bcs[bcs['sex'] == 'M']['victim_last_yr'].sum() / (~bcs[bcs['sex'] == 'M']['victim_last_yr']).sum()\n", "odds_women = bcs[bcs['sex'] == 'F']['victim_last_yr'].sum() / (~bcs[bcs['sex'] == 'F']['victim_last_yr']).sum()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25575872714319636" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_men" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25312190287413283" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Odds ratio of a woman having experienced crime (compared to a man):" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.98969018848929757" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women / odds_men" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logistic regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modelling the probability of having experienced crime by sex.\n", "\n", "*Note*: no intercept means there is no reference category." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "bcs['victim_last_yr'] = bcs['victim_last_yr'].astype(int) # Recode as 0/1\n", "model1 = smf.glm('victim_last_yr ~ -1 + sex', data=bcs, family=sm.families.Binomial()).fit()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Generalized Linear Model Regression Results
Dep. Variable: victim_last_yr No. Observations: 11610
Model: GLM Df Residuals: 11608
Model Family: Binomial Df Model: 1
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -5853.7
Date: Sun, 03 Dec 2017 Deviance: 11707.
Time: 13:23:44 Pearson chi2: 1.16e+04
No. Iterations: 4
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
sex[F] -1.3739 0.031 -43.858 0.000 -1.435 -1.312
sex[M] -1.3635 0.034 -39.932 0.000 -1.430 -1.297
" ], "text/plain": [ "\n", "\"\"\"\n", " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: victim_last_yr No. Observations: 11610\n", "Model: GLM Df Residuals: 11608\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0\n", "Method: IRLS Log-Likelihood: -5853.7\n", "Date: Sun, 03 Dec 2017 Deviance: 11707.\n", "Time: 13:23:44 Pearson chi2: 1.16e+04\n", "No. Iterations: 4 \n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "sex[F] -1.3739 0.031 -43.858 0.000 -1.435 -1.312\n", "sex[M] -1.3635 0.034 -39.932 0.000 -1.430 -1.297\n", "==============================================================================\n", "\"\"\"" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model1.summary()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sex[F] -1.373884\n", "sex[M] -1.363521\n", "dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model1.params" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Taking the exponential of the regression coefficients returns the odds." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sex[F] 0.253122\n", "sex[M] 0.255759\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp(model1.params)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25575872714319636" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_men" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25312190287413283" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Including the intercept means one category (`sex` = 'M') acts as reference." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "model2 = smf.glm(\"victim_last_yr ~ C(sex, Treatment(reference='M'))\", data=bcs, family=sm.families.Binomial()).fit()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Generalized Linear Model Regression Results
Dep. Variable: victim_last_yr No. Observations: 11610
Model: GLM Df Residuals: 11608
Model Family: Binomial Df Model: 1
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -5853.7
Date: Sun, 03 Dec 2017 Deviance: 11707.
Time: 13:23:44 Pearson chi2: 1.16e+04
No. Iterations: 4
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
Intercept -1.3635 0.034 -39.932 0.000 -1.430 -1.297
C(sex, Treatment(reference='M'))[T.F] -0.0104 0.046 -0.224 0.823 -0.101 0.080
" ], "text/plain": [ "\n", "\"\"\"\n", " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: victim_last_yr No. Observations: 11610\n", "Model: GLM Df Residuals: 11608\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0\n", "Method: IRLS Log-Likelihood: -5853.7\n", "Date: Sun, 03 Dec 2017 Deviance: 11707.\n", "Time: 13:23:44 Pearson chi2: 1.16e+04\n", "No. Iterations: 4 \n", "=========================================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "---------------------------------------------------------------------------------------------------------\n", "Intercept -1.3635 0.034 -39.932 0.000 -1.430 -1.297\n", "C(sex, Treatment(reference='M'))[T.F] -0.0104 0.046 -0.224 0.823 -0.101 0.080\n", "=========================================================================================================\n", "\"\"\"" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model2.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Taking the exponential of the regression coefficients returns the odds of the reference category, and the odds ratio of the outcome in the non-reference category." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Intercept 0.255759\n", "C(sex, Treatment(reference='M'))[T.F] 0.989690\n", "dtype: float64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp(model2.params)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25575872714319636" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_men" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.98969018848929757" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women / odds_men" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The odds in the non-reference category can be computed as the odds in the reference category (i.e. the intercept) multiplied by the odds ratio in the non-reference category (i.e. the regression coefficient)." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25312190287419911" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.prod(np.exp(model2.params))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.25312190287413283" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "odds_women" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's try including more variables." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "model3 = smf.glm(\n", " \"victim_last_yr ~ C(sex, Treatment(reference='M')) + age + C(safety_walk_night, Treatment(reference='Very safe'))\",\n", " data=bcs,\n", " family=sm.families.Binomial()\n", ").fit()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Generalized Linear Model Regression Results
Dep. Variable: victim_last_yr No. Observations: 11610
Model: GLM Df Residuals: 11604
Model Family: Binomial Df Model: 5
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -5550.0
Date: Sun, 03 Dec 2017 Deviance: 11100.
Time: 13:23:44 Pearson chi2: 1.15e+04
No. Iterations: 5
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
Intercept -0.1315 0.079 -1.666 0.096 -0.286 0.023
C(sex, Treatment(reference='M'))[T.F] -0.1883 0.051 -3.704 0.000 -0.288 -0.089
C(safety_walk_night, Treatment(reference='Very safe'))[T.A bit unsafe] 0.6191 0.072 8.657 0.000 0.479 0.759
C(safety_walk_night, Treatment(reference='Very safe'))[T.Fairly safe] 0.2184 0.063 3.456 0.001 0.095 0.342
C(safety_walk_night, Treatment(reference='Very safe'))[T.Very unsafe] 0.7749 0.089 8.680 0.000 0.600 0.950
age -0.0308 0.001 -22.432 0.000 -0.033 -0.028
" ], "text/plain": [ "\n", "\"\"\"\n", " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: victim_last_yr No. Observations: 11610\n", "Model: GLM Df Residuals: 11604\n", "Model Family: Binomial Df Model: 5\n", "Link Function: logit Scale: 1.0\n", "Method: IRLS Log-Likelihood: -5550.0\n", "Date: Sun, 03 Dec 2017 Deviance: 11100.\n", "Time: 13:23:44 Pearson chi2: 1.15e+04\n", "No. Iterations: 5 \n", "==========================================================================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------------------------------------------------------------------\n", "Intercept -0.1315 0.079 -1.666 0.096 -0.286 0.023\n", "C(sex, Treatment(reference='M'))[T.F] -0.1883 0.051 -3.704 0.000 -0.288 -0.089\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.A bit unsafe] 0.6191 0.072 8.657 0.000 0.479 0.759\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Fairly safe] 0.2184 0.063 3.456 0.001 0.095 0.342\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Very unsafe] 0.7749 0.089 8.680 0.000 0.600 0.950\n", "age -0.0308 0.001 -22.432 0.000 -0.033 -0.028\n", "==========================================================================================================================================\n", "\"\"\"" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model3.summary()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Intercept 0.876819\n", "C(sex, Treatment(reference='M'))[T.F] 0.828330\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.A bit unsafe] 1.857319\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Fairly safe] 1.244052\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Very unsafe] 2.170337\n", "age 0.969700\n", "dtype: float64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp(model3.params)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
Intercept0.7511611.023499
C(sex, Treatment(reference='M'))[T.F]0.7497610.915132
C(safety_walk_night, Treatment(reference='Very safe'))[T.A bit unsafe]1.6143932.136799
C(safety_walk_night, Treatment(reference='Very safe'))[T.Fairly safe]1.0991511.408054
C(safety_walk_night, Treatment(reference='Very safe'))[T.Very unsafe]1.8219522.585338
age0.9670960.972310
\n", "
" ], "text/plain": [ " 0 1\n", "Intercept 0.751161 1.023499\n", "C(sex, Treatment(reference='M'))[T.F] 0.749761 0.915132\n", "C(safety_walk_night, Treatment(reference='Very ... 1.614393 2.136799\n", "C(safety_walk_night, Treatment(reference='Very ... 1.099151 1.408054\n", "C(safety_walk_night, Treatment(reference='Very ... 1.821952 2.585338\n", "age 0.967096 0.972310" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp(model3.conf_int())" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Intercept 9.578223e-02\n", "C(sex, Treatment(reference='M'))[T.F] 2.120706e-04\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.A bit unsafe] 4.847630e-18\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Fairly safe] 5.477788e-04\n", "C(safety_walk_night, Treatment(reference='Very safe'))[T.Very unsafe] 3.964205e-18\n", "age 1.918834e-111\n", "dtype: float64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model3.pvalues" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }