{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import cvxpy as cvx\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from itertools import product\n", "from sklearn import model_selection as ms\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data pre-processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the *Child Health and Development Studies* data (the `Gestation` data set from the R package [`mosaicData`](https://cran.r-project.org/package=mosaicData)), downloaded as a CSV file from the Rdatasets collection." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "gestation = pd.read_csv(\n", "    'https://vincentarelbundock.github.io/Rdatasets/csv/mosaicData/Gestation.csv',\n", "    index_col=0,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop the columns that are not used in this analysis (`id`, `pluralty`, `outcome` and `date`)." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "gestation = gestation.drop(columns=['id', 'pluralty', 'outcome', 'date'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the fraction of missing values in each column, sorted from most to least missing." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dwt 0.403722\n", "dht 0.398058\n", "inc 0.100324\n", "wt.1 0.029126\n", "drace 0.025081\n", "ht 0.017799\n", "number 0.016990\n", "ded 0.010518\n", "gestation 0.010518\n", "time 0.008091\n", "smoke 0.008091\n", "dage 0.005663\n", "age 0.001618\n", "ed 0.000809\n", "race 0.000809\n", "marital 0.000000\n", "parity 0.000000\n", "wt 0.000000\n", "sex 0.000000\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gestation.isnull().mean().sort_values(ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop the three variables with the largest share of missing values (`dwt`, `dht` and `inc`, each missing in at least 10% of the observations)."
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "gestation = gestation.drop(columns=['dht', 'dwt', 'inc'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop every observation (row) that still has a missing value in any of the remaining columns." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "gestation = gestation.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create dummy (indicator) variables for the categorical columns, dropping the first level of each (`drop_first=True`)." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Request integer 0/1 indicators explicitly: pandas >= 2.0 returns bool columns by default.\n", "gestation = pd.get_dummies(\n", "    gestation,\n", "    columns=['race', 'ed', 'drace', 'ded', 'marital', 'smoke', 'time', 'number'],\n", "    drop_first=True,\n", "    dtype='uint8',\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | gestation | \n", "sex | \n", "wt | \n", "parity | \n", "age | \n", "ht | \n", "wt.1 | \n", "dage | \n", "race_1.0 | \n", "race_2.0 | \n", "... | \n", "time_8.0 | \n", "time_9.0 | \n", "number_1.0 | \n", "number_2.0 | \n", "number_3.0 | \n", "number_4.0 | \n", "number_5.0 | \n", "number_6.0 | \n", "number_7.0 | \n", "number_8.0 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "284.0 | \n", "1 | \n", "120 | \n", "1 | \n", "27.0 | \n", "62.0 | \n", "100.0 | \n", "31.0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "282.0 | \n", "1 | \n", "113 | \n", "2 | \n", "33.0 | \n", "64.0 | \n", "135.0 | \n", "38.0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "279.0 | \n", "1 | \n", "128 | \n", "1 | \n", "28.0 | \n", "64.0 | \n", "115.0 | \n", "32.0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 | \n", "282.0 | \n", "1 | \n", "108 | \n", "1 | \n", "23.0 | \n", "67.0 | \n", "125.0 | \n", "24.0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
6 | \n", "286.0 | \n", "1 | \n", "136 | \n", "4 | \n", "25.0 | \n", "62.0 | \n", "93.0 | \n", "28.0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 65 columns
\n", "\n", " | l1_ratio | \n", "lambda | \n", "mse_test | \n", "mse_train | \n", "
---|---|---|---|---|
0 | \n", "0.00 | \n", "0.01 | \n", "262.517981 | \n", "230.772366 | \n", "
1 | \n", "0.25 | \n", "0.01 | \n", "263.030008 | \n", "230.047452 | \n", "
2 | \n", "0.50 | \n", "0.01 | \n", "263.539374 | \n", "229.085362 | \n", "
3 | \n", "0.75 | \n", "0.01 | \n", "263.961701 | \n", "227.622517 | \n", "
4 | \n", "1.00 | \n", "0.01 | \n", "266.286687 | \n", "224.393952 | \n", "