#In this project we'll work with some data and with the goal of trying to find replacement players for the ones lost at the start of the off-season - During the 2001-02 offseason, the team lost three key free agents to larger market teams: 2000 AL MVP Jason Giambi to the New York Yankees, outfielder Johnny Damon to the Boston Red Sox, and closer Jason Isringhausen to the St. Louis Cardinals.

batting <- read.csv('Batting.csv')
head(batting)
str(batting)
## 'data.frame':    97889 obs. of  24 variables:
##  $ playerID : Factor w/ 18107 levels "aardsda01","aaronha01",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ yearID   : int  2004 2006 2007 2008 2009 2010 2012 1954 1955 1956 ...
##  $ stint    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ teamID   : Factor w/ 149 levels "ALT","ANA","ARI",..: 117 35 33 16 116 116 93 80 80 80 ...
##  $ lgID     : Factor w/ 6 levels "AA","AL","FL",..: 4 4 2 2 2 2 2 4 4 4 ...
##  $ G        : int  11 45 25 47 73 53 1 122 153 153 ...
##  $ G_batting: int  11 43 2 5 3 4 NA 122 153 153 ...
##  $ AB       : int  0 2 0 1 0 0 NA 468 602 609 ...
##  $ R        : int  0 0 0 0 0 0 NA 58 105 106 ...
##  $ H        : int  0 0 0 0 0 0 NA 131 189 200 ...
##  $ X2B      : int  0 0 0 0 0 0 NA 27 37 34 ...
##  $ X3B      : int  0 0 0 0 0 0 NA 6 9 14 ...
##  $ HR       : int  0 0 0 0 0 0 NA 13 27 26 ...
##  $ RBI      : int  0 0 0 0 0 0 NA 69 106 92 ...
##  $ SB       : int  0 0 0 0 0 0 NA 2 3 2 ...
##  $ CS       : int  0 0 0 0 0 0 NA 2 1 4 ...
##  $ BB       : int  0 0 0 0 0 0 NA 28 49 37 ...
##  $ SO       : int  0 0 0 1 0 0 NA 39 61 54 ...
##  $ IBB      : int  0 0 0 0 0 0 NA NA 5 6 ...
##  $ HBP      : int  0 0 0 0 0 0 NA 3 3 2 ...
##  $ SH       : int  0 1 0 0 0 0 NA 6 7 5 ...
##  $ SF       : int  0 0 0 0 0 0 NA 4 4 7 ...
##  $ GIDP     : int  0 0 0 0 0 0 NA 13 20 21 ...
##  $ G_old    : int  11 45 2 5 NA NA NA 122 153 153 ...
head(batting$AB)
## [1] 0 2 0 1 0 0
head(batting$X2B)
## [1] 0 0 0 0 0 0
#Getting Batting average
batting$BA <- batting$H / batting$AB
tail(batting$BA,5)
## [1] 0.1230769 0.2746479 0.1470588 0.2745098 0.2138728
# On Base Percentage
batting$OBP <- (batting$H + batting$BB + batting$HBP)/(batting$AB + batting$BB + batting$HBP + batting$SF)
# Creating X1B (Singles)
batting$X1B <- batting$H - batting$X2B - batting$X3B - batting$HR
# Creating Slugging Average (SLG)
batting$SLG <- ((1 * batting$X1B) + (2 * batting$X2B) + (3 * batting$X3B) + (4 * batting$HR) ) / batting$AB
#Checking structure of new columns
str(batting)
## 'data.frame':    97889 obs. of  28 variables:
##  $ playerID : Factor w/ 18107 levels "aardsda01","aaronha01",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ yearID   : int  2004 2006 2007 2008 2009 2010 2012 1954 1955 1956 ...
##  $ stint    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ teamID   : Factor w/ 149 levels "ALT","ANA","ARI",..: 117 35 33 16 116 116 93 80 80 80 ...
##  $ lgID     : Factor w/ 6 levels "AA","AL","FL",..: 4 4 2 2 2 2 2 4 4 4 ...
##  $ G        : int  11 45 25 47 73 53 1 122 153 153 ...
##  $ G_batting: int  11 43 2 5 3 4 NA 122 153 153 ...
##  $ AB       : int  0 2 0 1 0 0 NA 468 602 609 ...
##  $ R        : int  0 0 0 0 0 0 NA 58 105 106 ...
##  $ H        : int  0 0 0 0 0 0 NA 131 189 200 ...
##  $ X2B      : int  0 0 0 0 0 0 NA 27 37 34 ...
##  $ X3B      : int  0 0 0 0 0 0 NA 6 9 14 ...
##  $ HR       : int  0 0 0 0 0 0 NA 13 27 26 ...
##  $ RBI      : int  0 0 0 0 0 0 NA 69 106 92 ...
##  $ SB       : int  0 0 0 0 0 0 NA 2 3 2 ...
##  $ CS       : int  0 0 0 0 0 0 NA 2 1 4 ...
##  $ BB       : int  0 0 0 0 0 0 NA 28 49 37 ...
##  $ SO       : int  0 0 0 1 0 0 NA 39 61 54 ...
##  $ IBB      : int  0 0 0 0 0 0 NA NA 5 6 ...
##  $ HBP      : int  0 0 0 0 0 0 NA 3 3 2 ...
##  $ SH       : int  0 1 0 0 0 0 NA 6 7 5 ...
##  $ SF       : int  0 0 0 0 0 0 NA 4 4 7 ...
##  $ GIDP     : int  0 0 0 0 0 0 NA 13 20 21 ...
##  $ G_old    : int  11 45 2 5 NA NA NA 122 153 153 ...
##  $ BA       : num  NaN 0 NaN 0 NaN ...
##  $ OBP      : num  NaN 0 NaN 0 NaN ...
##  $ X1B      : int  0 0 0 0 0 0 NA 85 116 126 ...
##  $ SLG      : num  NaN 0 NaN 0 NaN ...
sal <- read.csv('Salaries.csv')
#Use summary to get a summary of the batting data frame and notice the minimum year in the yearID column. Our batting data goes back to 1871! Our salary data starts at 1985, meaning we need to remove the batting data that occured before 1985.
#Use subset() to reassign batting to only contain data from 1985 and onwards
batting <- subset(batting,yearID >= 1985)
summary(batting)
##       playerID         yearID         stint          teamID     
##  moyerja01:   27   Min.   :1985   Min.   :1.00   SDN    : 1313  
##  mulhote01:   26   1st Qu.:1993   1st Qu.:1.00   CLE    : 1306  
##  weathda01:   26   Median :2000   Median :1.00   PIT    : 1299  
##  maddugr01:   25   Mean   :2000   Mean   :1.08   NYN    : 1297  
##  sierrru01:   25   3rd Qu.:2007   3rd Qu.:1.00   BOS    : 1279  
##  thomeji01:   25   Max.   :2013   Max.   :4.00   CIN    : 1279  
##  (Other)  :35498                                 (Other):27879  
##  lgID             G           G_batting            AB       
##  AA:    0   Min.   :  1.0   Min.   :  0.00   Min.   :  0.0  
##  AL:17226   1st Qu.: 14.0   1st Qu.:  4.00   1st Qu.:  3.0  
##  FL:    0   Median : 34.0   Median : 27.00   Median : 47.0  
##  NL:18426   Mean   : 51.7   Mean   : 46.28   Mean   :144.7  
##  PL:    0   3rd Qu.: 77.0   3rd Qu.: 77.00   3rd Qu.:241.0  
##  UA:    0   Max.   :163.0   Max.   :163.00   Max.   :716.0  
##                             NA's   :1406     NA's   :4377   
##        R                H               X2B              X3B        
##  Min.   :  0.00   Min.   :  0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :  4.00   Median :  8.00   Median : 1.000   Median : 0.000  
##  Mean   : 19.44   Mean   : 37.95   Mean   : 7.293   Mean   : 0.824  
##  3rd Qu.: 30.00   3rd Qu.: 61.00   3rd Qu.:11.000   3rd Qu.: 1.000  
##  Max.   :152.00   Max.   :262.00   Max.   :59.000   Max.   :23.000  
##  NA's   :4377     NA's   :4377     NA's   :4377     NA's   :4377    
##        HR              RBI               SB                CS        
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 0.000   Median :  3.00   Median :  0.000   Median : 0.000  
##  Mean   : 4.169   Mean   : 18.41   Mean   :  2.811   Mean   : 1.219  
##  3rd Qu.: 5.000   3rd Qu.: 27.00   3rd Qu.:  2.000   3rd Qu.: 1.000  
##  Max.   :73.000   Max.   :165.00   Max.   :110.000   Max.   :29.000  
##  NA's   :4377     NA's   :4377     NA's   :4377      NA's   :4377    
##        BB               SO              IBB               HBP        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:  0.00   1st Qu.:  1.00   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median :  3.00   Median : 12.00   Median :  0.000   Median : 0.000  
##  Mean   : 14.06   Mean   : 27.03   Mean   :  1.171   Mean   : 1.273  
##  3rd Qu.: 21.00   3rd Qu.: 42.00   3rd Qu.:  1.000   3rd Qu.: 1.000  
##  Max.   :232.00   Max.   :223.00   Max.   :120.000   Max.   :35.000  
##  NA's   :4377     NA's   :4377     NA's   :4378      NA's   :4387    
##        SH               SF              GIDP           G_old      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   :  0.0  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.00   1st Qu.: 11.0  
##  Median : 0.000   Median : 0.000   Median : 1.00   Median : 32.0  
##  Mean   : 1.465   Mean   : 1.212   Mean   : 3.25   Mean   : 49.7  
##  3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.: 5.00   3rd Qu.: 77.0  
##  Max.   :39.000   Max.   :17.000   Max.   :35.00   Max.   :163.0  
##  NA's   :4377     NA's   :4378     NA's   :4377    NA's   :5189   
##        BA             OBP             X1B              SLG       
##  Min.   :0.000   Min.   :0.000   Min.   :  0.00   Min.   :0.000  
##  1st Qu.:0.136   1st Qu.:0.188   1st Qu.:  0.00   1st Qu.:0.167  
##  Median :0.233   Median :0.296   Median :  6.00   Median :0.333  
##  Mean   :0.205   Mean   :0.262   Mean   : 25.66   Mean   :0.304  
##  3rd Qu.:0.274   3rd Qu.:0.342   3rd Qu.: 42.00   3rd Qu.:0.423  
##  Max.   :1.000   Max.   :1.000   Max.   :225.00   Max.   :4.000  
##  NA's   :8905    NA's   :8821    NA's   :4377     NA's   :8905
#Using the merge() function to merge the batting and sal data frames by c('playerID','yearID'). Call the new data frame combo
combo <- merge(batting,sal,by=c('playerID','yearID'))
summary(combo)
##       playerID         yearID         stint          teamID.x    
##  moyerja01:   27   Min.   :1985   Min.   :1.000   LAN    :  940  
##  thomeji01:   25   1st Qu.:1993   1st Qu.:1.000   PHI    :  937  
##  weathda01:   25   Median :1999   Median :1.000   BOS    :  935  
##  vizquom01:   24   Mean   :1999   Mean   :1.098   NYA    :  928  
##  gaettga01:   23   3rd Qu.:2006   3rd Qu.:1.000   CLE    :  920  
##  griffke02:   23   Max.   :2013   Max.   :4.000   SDN    :  914  
##  (Other)  :25250                                  (Other):19823  
##  lgID.x           G            G_batting            AB       
##  AA:    0   Min.   :  1.00   Min.   :  0.00   Min.   :  0.0  
##  AL:12292   1st Qu.: 26.00   1st Qu.:  8.00   1st Qu.:  5.0  
##  FL:    0   Median : 50.00   Median : 42.00   Median : 85.0  
##  NL:13105   Mean   : 64.06   Mean   : 57.58   Mean   :182.4  
##  PL:    0   3rd Qu.:101.00   3rd Qu.:101.00   3rd Qu.:336.0  
##  UA:    0   Max.   :163.00   Max.   :163.00   Max.   :716.0  
##                              NA's   :906      NA's   :2661   
##        R                H               X2B              X3B        
##  Min.   :  0.00   Min.   :  0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:  0.00   1st Qu.:  1.00   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :  9.00   Median : 19.00   Median : 3.000   Median : 0.000  
##  Mean   : 24.71   Mean   : 48.18   Mean   : 9.276   Mean   : 1.033  
##  3rd Qu.: 43.00   3rd Qu.: 87.25   3rd Qu.:16.000   3rd Qu.: 1.000  
##  Max.   :152.00   Max.   :262.00   Max.   :59.000   Max.   :23.000  
##  NA's   :2661     NA's   :2661     NA's   :2661     NA's   :2661    
##        HR              RBI               SB                CS       
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.: 0.00  
##  Median : 1.000   Median :  8.00   Median :  0.000   Median : 0.00  
##  Mean   : 5.369   Mean   : 23.56   Mean   :  3.568   Mean   : 1.54  
##  3rd Qu.: 7.000   3rd Qu.: 39.00   3rd Qu.:  3.000   3rd Qu.: 2.00  
##  Max.   :73.000   Max.   :165.00   Max.   :110.000   Max.   :29.00  
##  NA's   :2661     NA's   :2661     NA's   :2661      NA's   :2661   
##        BB               SO              IBB               HBP        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:  0.00   1st Qu.:  2.00   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median :  6.00   Median : 20.00   Median :  0.000   Median : 0.000  
##  Mean   : 17.98   Mean   : 33.52   Mean   :  1.533   Mean   : 1.614  
##  3rd Qu.: 29.00   3rd Qu.: 55.00   3rd Qu.:  2.000   3rd Qu.: 2.000  
##  Max.   :232.00   Max.   :223.00   Max.   :120.000   Max.   :35.000  
##  NA's   :2661     NA's   :2661     NA's   :2662      NA's   :2670    
##        SH               SF              GIDP            G_old       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 20.00  
##  Median : 0.000   Median : 0.000   Median : 2.000   Median : 47.00  
##  Mean   : 1.786   Mean   : 1.554   Mean   : 4.127   Mean   : 61.43  
##  3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.: 7.000   3rd Qu.:101.00  
##  Max.   :39.000   Max.   :17.000   Max.   :35.000   Max.   :163.00  
##  NA's   :2661     NA's   :2662     NA's   :2661     NA's   :3414    
##        BA             OBP             X1B             SLG       
##  Min.   :0.000   Min.   :0.000   Min.   :  0.0   Min.   :0.000  
##  1st Qu.:0.160   1st Qu.:0.208   1st Qu.:  0.0   1st Qu.:0.200  
##  Median :0.242   Median :0.305   Median : 13.0   Median :0.351  
##  Mean   :0.212   Mean   :0.270   Mean   : 32.5   Mean   :0.317  
##  3rd Qu.:0.276   3rd Qu.:0.346   3rd Qu.: 59.0   3rd Qu.:0.432  
##  Max.   :1.000   Max.   :1.000   Max.   :225.0   Max.   :4.000  
##  NA's   :5618    NA's   :5562    NA's   :2661    NA's   :5618   
##     teamID.y     lgID.y         salary        
##  CLE    :  935   AL:12304   Min.   :       0  
##  PIT    :  932   NL:13093   1st Qu.:  255000  
##  PHI    :  931              Median :  550000  
##  SDN    :  923              Mean   : 1879256  
##  LAN    :  921              3rd Qu.: 2150000  
##  CIN    :  912              Max.   :33000000  
##  (Other):19843
#Analyzing the Lost Players
#The Oakland A's lost 3 key players during the off-season. We'll want to get their stats to see what we have to replace. The players lost were: first baseman 2000 AL MVP Jason Giambi (giambja01) to the New York Yankees, outfielder Johnny Damon (damonjo01) to the Boston Red Sox and infielder Rainer Gustavo "Ray" Olmedo ('saenzol01').
lost_players <- subset(combo,playerID %in% c('giambja01','damonjo01','saenzol01') )
lost_players <- subset(lost_players,yearID == 2001)
lost_players
#Reduce the lost_players data frame to the following columns: playerID,H,X2B,X3B,HR,OBP,SLG,BA,AB

lost_players <- lost_players[,c('playerID','H','X2B','X3B','HR','OBP','SLG','BA','AB')]
head(lost_players)
#Replacement Players
#Now we have all the information we need!Now we find Replacement Players for the key three players we lost, we have three constraints:

#The total combined salary of the three players can not exceed 15 million dollars.
#Their combined number of At Bats (AB) needs to be equal to or greater than the lost players.
#Their mean OBP had to equal to or greater than the mean OBP of the lost players
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
avail.players <- filter(combo,yearID==2001)

library(ggplot2)
ggplot(avail.players,aes(x=OBP,y=salary)) + geom_point()
## Warning: Removed 168 rows containing missing values (geom_point).

#Looks like there is no point in paying above 8 million or so (I'm just eyeballing this number). I'll choose that as a cutt off point. There are also a lot of players with OBP==0. Let's get rid of them too.
avail.players <- filter(avail.players,salary<8000000,OBP>0)
avail.players <- filter(avail.players,AB >= 500)

possible <- head(arrange(avail.players,desc(OBP)),10)

possible <- possible[,c('playerID','OBP','AB','salary')]

possible
possible[2:4,]     #Can't choose giambja again, but the other ones look good (2-4). I choose them