#In this project we'll work with some data and with the goal of trying to find replacement players for the ones lost at the start of the off-season - During the 2001-02 offseason, the team lost three key free agents to larger market teams: 2000 AL MVP Jason Giambi to the New York Yankees, outfielder Johnny Damon to the Boston Red Sox, and closer Jason Isringhausen to the St. Louis Cardinals.
batting <- read.csv('Batting.csv')
head(batting)
str(batting)
## 'data.frame': 97889 obs. of 24 variables:
## $ playerID : Factor w/ 18107 levels "aardsda01","aaronha01",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ yearID : int 2004 2006 2007 2008 2009 2010 2012 1954 1955 1956 ...
## $ stint : int 1 1 1 1 1 1 1 1 1 1 ...
## $ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 117 35 33 16 116 116 93 80 80 80 ...
## $ lgID : Factor w/ 6 levels "AA","AL","FL",..: 4 4 2 2 2 2 2 4 4 4 ...
## $ G : int 11 45 25 47 73 53 1 122 153 153 ...
## $ G_batting: int 11 43 2 5 3 4 NA 122 153 153 ...
## $ AB : int 0 2 0 1 0 0 NA 468 602 609 ...
## $ R : int 0 0 0 0 0 0 NA 58 105 106 ...
## $ H : int 0 0 0 0 0 0 NA 131 189 200 ...
## $ X2B : int 0 0 0 0 0 0 NA 27 37 34 ...
## $ X3B : int 0 0 0 0 0 0 NA 6 9 14 ...
## $ HR : int 0 0 0 0 0 0 NA 13 27 26 ...
## $ RBI : int 0 0 0 0 0 0 NA 69 106 92 ...
## $ SB : int 0 0 0 0 0 0 NA 2 3 2 ...
## $ CS : int 0 0 0 0 0 0 NA 2 1 4 ...
## $ BB : int 0 0 0 0 0 0 NA 28 49 37 ...
## $ SO : int 0 0 0 1 0 0 NA 39 61 54 ...
## $ IBB : int 0 0 0 0 0 0 NA NA 5 6 ...
## $ HBP : int 0 0 0 0 0 0 NA 3 3 2 ...
## $ SH : int 0 1 0 0 0 0 NA 6 7 5 ...
## $ SF : int 0 0 0 0 0 0 NA 4 4 7 ...
## $ GIDP : int 0 0 0 0 0 0 NA 13 20 21 ...
## $ G_old : int 11 45 2 5 NA NA NA 122 153 153 ...
head(batting$AB)
## [1] 0 2 0 1 0 0
head(batting$X2B)
## [1] 0 0 0 0 0 0
#Getting Batting average
batting$BA <- batting$H / batting$AB
tail(batting$BA,5)
## [1] 0.1230769 0.2746479 0.1470588 0.2745098 0.2138728
# On Base Percentage
batting$OBP <- (batting$H + batting$BB + batting$HBP)/(batting$AB + batting$BB + batting$HBP + batting$SF)
# Creating X1B (Singles)
batting$X1B <- batting$H - batting$X2B - batting$X3B - batting$HR
# Creating Slugging Average (SLG)
batting$SLG <- ((1 * batting$X1B) + (2 * batting$X2B) + (3 * batting$X3B) + (4 * batting$HR) ) / batting$AB
#Checking structure of new columns
str(batting)
## 'data.frame': 97889 obs. of 28 variables:
## $ playerID : Factor w/ 18107 levels "aardsda01","aaronha01",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ yearID : int 2004 2006 2007 2008 2009 2010 2012 1954 1955 1956 ...
## $ stint : int 1 1 1 1 1 1 1 1 1 1 ...
## $ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 117 35 33 16 116 116 93 80 80 80 ...
## $ lgID : Factor w/ 6 levels "AA","AL","FL",..: 4 4 2 2 2 2 2 4 4 4 ...
## $ G : int 11 45 25 47 73 53 1 122 153 153 ...
## $ G_batting: int 11 43 2 5 3 4 NA 122 153 153 ...
## $ AB : int 0 2 0 1 0 0 NA 468 602 609 ...
## $ R : int 0 0 0 0 0 0 NA 58 105 106 ...
## $ H : int 0 0 0 0 0 0 NA 131 189 200 ...
## $ X2B : int 0 0 0 0 0 0 NA 27 37 34 ...
## $ X3B : int 0 0 0 0 0 0 NA 6 9 14 ...
## $ HR : int 0 0 0 0 0 0 NA 13 27 26 ...
## $ RBI : int 0 0 0 0 0 0 NA 69 106 92 ...
## $ SB : int 0 0 0 0 0 0 NA 2 3 2 ...
## $ CS : int 0 0 0 0 0 0 NA 2 1 4 ...
## $ BB : int 0 0 0 0 0 0 NA 28 49 37 ...
## $ SO : int 0 0 0 1 0 0 NA 39 61 54 ...
## $ IBB : int 0 0 0 0 0 0 NA NA 5 6 ...
## $ HBP : int 0 0 0 0 0 0 NA 3 3 2 ...
## $ SH : int 0 1 0 0 0 0 NA 6 7 5 ...
## $ SF : int 0 0 0 0 0 0 NA 4 4 7 ...
## $ GIDP : int 0 0 0 0 0 0 NA 13 20 21 ...
## $ G_old : int 11 45 2 5 NA NA NA 122 153 153 ...
## $ BA : num NaN 0 NaN 0 NaN ...
## $ OBP : num NaN 0 NaN 0 NaN ...
## $ X1B : int 0 0 0 0 0 0 NA 85 116 126 ...
## $ SLG : num NaN 0 NaN 0 NaN ...
sal <- read.csv('Salaries.csv')
#Use summary to get a summary of the batting data frame and notice the minimum year in the yearID column. Our batting data goes back to 1871! Our salary data starts at 1985, meaning we need to remove the batting data that occured before 1985.
#Use subset() to reassign batting to only contain data from 1985 and onwards
batting <- subset(batting,yearID >= 1985)
summary(batting)
## playerID yearID stint teamID
## moyerja01: 27 Min. :1985 Min. :1.00 SDN : 1313
## mulhote01: 26 1st Qu.:1993 1st Qu.:1.00 CLE : 1306
## weathda01: 26 Median :2000 Median :1.00 PIT : 1299
## maddugr01: 25 Mean :2000 Mean :1.08 NYN : 1297
## sierrru01: 25 3rd Qu.:2007 3rd Qu.:1.00 BOS : 1279
## thomeji01: 25 Max. :2013 Max. :4.00 CIN : 1279
## (Other) :35498 (Other):27879
## lgID G G_batting AB
## AA: 0 Min. : 1.0 Min. : 0.00 Min. : 0.0
## AL:17226 1st Qu.: 14.0 1st Qu.: 4.00 1st Qu.: 3.0
## FL: 0 Median : 34.0 Median : 27.00 Median : 47.0
## NL:18426 Mean : 51.7 Mean : 46.28 Mean :144.7
## PL: 0 3rd Qu.: 77.0 3rd Qu.: 77.00 3rd Qu.:241.0
## UA: 0 Max. :163.0 Max. :163.00 Max. :716.0
## NA's :1406 NA's :4377
## R H X2B X3B
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.00 Median : 8.00 Median : 1.000 Median : 0.000
## Mean : 19.44 Mean : 37.95 Mean : 7.293 Mean : 0.824
## 3rd Qu.: 30.00 3rd Qu.: 61.00 3rd Qu.:11.000 3rd Qu.: 1.000
## Max. :152.00 Max. :262.00 Max. :59.000 Max. :23.000
## NA's :4377 NA's :4377 NA's :4377 NA's :4377
## HR RBI SB CS
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 3.00 Median : 0.000 Median : 0.000
## Mean : 4.169 Mean : 18.41 Mean : 2.811 Mean : 1.219
## 3rd Qu.: 5.000 3rd Qu.: 27.00 3rd Qu.: 2.000 3rd Qu.: 1.000
## Max. :73.000 Max. :165.00 Max. :110.000 Max. :29.000
## NA's :4377 NA's :4377 NA's :4377 NA's :4377
## BB SO IBB HBP
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 3.00 Median : 12.00 Median : 0.000 Median : 0.000
## Mean : 14.06 Mean : 27.03 Mean : 1.171 Mean : 1.273
## 3rd Qu.: 21.00 3rd Qu.: 42.00 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :232.00 Max. :223.00 Max. :120.000 Max. :35.000
## NA's :4377 NA's :4377 NA's :4378 NA's :4387
## SH SF GIDP G_old
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 11.0
## Median : 0.000 Median : 0.000 Median : 1.00 Median : 32.0
## Mean : 1.465 Mean : 1.212 Mean : 3.25 Mean : 49.7
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 5.00 3rd Qu.: 77.0
## Max. :39.000 Max. :17.000 Max. :35.00 Max. :163.0
## NA's :4377 NA's :4378 NA's :4377 NA's :5189
## BA OBP X1B SLG
## Min. :0.000 Min. :0.000 Min. : 0.00 Min. :0.000
## 1st Qu.:0.136 1st Qu.:0.188 1st Qu.: 0.00 1st Qu.:0.167
## Median :0.233 Median :0.296 Median : 6.00 Median :0.333
## Mean :0.205 Mean :0.262 Mean : 25.66 Mean :0.304
## 3rd Qu.:0.274 3rd Qu.:0.342 3rd Qu.: 42.00 3rd Qu.:0.423
## Max. :1.000 Max. :1.000 Max. :225.00 Max. :4.000
## NA's :8905 NA's :8821 NA's :4377 NA's :8905
#Using the merge() function to merge the batting and sal data frames by c('playerID','yearID'). Call the new data frame combo
combo <- merge(batting,sal,by=c('playerID','yearID'))
summary(combo)
## playerID yearID stint teamID.x
## moyerja01: 27 Min. :1985 Min. :1.000 LAN : 940
## thomeji01: 25 1st Qu.:1993 1st Qu.:1.000 PHI : 937
## weathda01: 25 Median :1999 Median :1.000 BOS : 935
## vizquom01: 24 Mean :1999 Mean :1.098 NYA : 928
## gaettga01: 23 3rd Qu.:2006 3rd Qu.:1.000 CLE : 920
## griffke02: 23 Max. :2013 Max. :4.000 SDN : 914
## (Other) :25250 (Other):19823
## lgID.x G G_batting AB
## AA: 0 Min. : 1.00 Min. : 0.00 Min. : 0.0
## AL:12292 1st Qu.: 26.00 1st Qu.: 8.00 1st Qu.: 5.0
## FL: 0 Median : 50.00 Median : 42.00 Median : 85.0
## NL:13105 Mean : 64.06 Mean : 57.58 Mean :182.4
## PL: 0 3rd Qu.:101.00 3rd Qu.:101.00 3rd Qu.:336.0
## UA: 0 Max. :163.00 Max. :163.00 Max. :716.0
## NA's :906 NA's :2661
## R H X2B X3B
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 9.00 Median : 19.00 Median : 3.000 Median : 0.000
## Mean : 24.71 Mean : 48.18 Mean : 9.276 Mean : 1.033
## 3rd Qu.: 43.00 3rd Qu.: 87.25 3rd Qu.:16.000 3rd Qu.: 1.000
## Max. :152.00 Max. :262.00 Max. :59.000 Max. :23.000
## NA's :2661 NA's :2661 NA's :2661 NA's :2661
## HR RBI SB CS
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 1.000 Median : 8.00 Median : 0.000 Median : 0.00
## Mean : 5.369 Mean : 23.56 Mean : 3.568 Mean : 1.54
## 3rd Qu.: 7.000 3rd Qu.: 39.00 3rd Qu.: 3.000 3rd Qu.: 2.00
## Max. :73.000 Max. :165.00 Max. :110.000 Max. :29.00
## NA's :2661 NA's :2661 NA's :2661 NA's :2661
## BB SO IBB HBP
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.00 Median : 20.00 Median : 0.000 Median : 0.000
## Mean : 17.98 Mean : 33.52 Mean : 1.533 Mean : 1.614
## 3rd Qu.: 29.00 3rd Qu.: 55.00 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :232.00 Max. :223.00 Max. :120.000 Max. :35.000
## NA's :2661 NA's :2661 NA's :2662 NA's :2670
## SH SF GIDP G_old
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 20.00
## Median : 0.000 Median : 0.000 Median : 2.000 Median : 47.00
## Mean : 1.786 Mean : 1.554 Mean : 4.127 Mean : 61.43
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 7.000 3rd Qu.:101.00
## Max. :39.000 Max. :17.000 Max. :35.000 Max. :163.00
## NA's :2661 NA's :2662 NA's :2661 NA's :3414
## BA OBP X1B SLG
## Min. :0.000 Min. :0.000 Min. : 0.0 Min. :0.000
## 1st Qu.:0.160 1st Qu.:0.208 1st Qu.: 0.0 1st Qu.:0.200
## Median :0.242 Median :0.305 Median : 13.0 Median :0.351
## Mean :0.212 Mean :0.270 Mean : 32.5 Mean :0.317
## 3rd Qu.:0.276 3rd Qu.:0.346 3rd Qu.: 59.0 3rd Qu.:0.432
## Max. :1.000 Max. :1.000 Max. :225.0 Max. :4.000
## NA's :5618 NA's :5562 NA's :2661 NA's :5618
## teamID.y lgID.y salary
## CLE : 935 AL:12304 Min. : 0
## PIT : 932 NL:13093 1st Qu.: 255000
## PHI : 931 Median : 550000
## SDN : 923 Mean : 1879256
## LAN : 921 3rd Qu.: 2150000
## CIN : 912 Max. :33000000
## (Other):19843
#Analyzing the Lost Players
#The Oakland A's lost 3 key players during the off-season. We'll want to get their stats to see what we have to replace. The players lost were: first baseman 2000 AL MVP Jason Giambi (giambja01) to the New York Yankees, outfielder Johnny Damon (damonjo01) to the Boston Red Sox and infielder Rainer Gustavo "Ray" Olmedo ('saenzol01').
lost_players <- subset(combo,playerID %in% c('giambja01','damonjo01','saenzol01') )
lost_players <- subset(lost_players,yearID == 2001)
lost_players
#Reduce the lost_players data frame to the following columns: playerID,H,X2B,X3B,HR,OBP,SLG,BA,AB
lost_players <- lost_players[,c('playerID','H','X2B','X3B','HR','OBP','SLG','BA','AB')]
head(lost_players)
#Replacement Players
#Now we have all the information we need!Now we find Replacement Players for the key three players we lost, we have three constraints:
#The total combined salary of the three players can not exceed 15 million dollars.
#Their combined number of At Bats (AB) needs to be equal to or greater than the lost players.
#Their mean OBP had to equal to or greater than the mean OBP of the lost players
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
avail.players <- filter(combo,yearID==2001)
library(ggplot2)
ggplot(avail.players,aes(x=OBP,y=salary)) + geom_point()
## Warning: Removed 168 rows containing missing values (geom_point).

#Looks like there is no point in paying above 8 million or so (I'm just eyeballing this number). I'll choose that as a cutt off point. There are also a lot of players with OBP==0. Let's get rid of them too.
avail.players <- filter(avail.players,salary<8000000,OBP>0)
avail.players <- filter(avail.players,AB >= 500)
possible <- head(arrange(avail.players,desc(OBP)),10)
possible <- possible[,c('playerID','OBP','AB','salary')]
possible
possible[2:4,] #Can't choose giambja again, but the other ones look good (2-4). I choose them