Note:

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code. Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Baseball players batting performance from 2014

In this tutorial, we will look at the batting performance of Major League Baseball (MLB) players in 2014. The data come from baseballguru.com.

First we load the R libraries that we need for this tutorial. Basic libraries of functions are loaded every time R starts. More specialized functions need to be loaded before they can be used.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(DT)
library(RColorBrewer)

Read in the data

Now let’s read in the baseball 2014 batting performance data set.

# First attempt: import the 2014 batting table with readr's default parsing.
mlb2014 <- read_csv(file = "data/mlb2014.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   playerID = col_character(),
##   nameFirst = col_character(),
##   nameLast = col_character(),
##   bats = col_character(),
##   throws = col_character(),
##   teamID = col_character(),
##   lg = col_character(),
##   avg = col_character(),
##   obp = col_character(),
##   slg = col_character(),
##   woba = col_character(),
##   ROTO = col_character(),
##   pos1 = col_character(),
##   pos2 = col_character(),
##   pos3 = col_character(),
##   pos4 = col_character(),
##   pos5 = col_character(),
##   baseball_prospectus_id = col_character()
## )
## See spec(...) for full column specifications.

Some of the numeric variables (avg, obp, slg and woba) are read as characters. This is because “-” was used as an indicator of a missing value. We add “-” to the set of strings recognized as NA and the issue is solved.

# Re-import the file, declaring "-" (alongside "" and "NA") as a
# missing-value marker.
mlb2014 <- read_csv(
  file = "data/mlb2014.csv",
  na = c("", "-", "NA")
)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   playerID = col_character(),
##   nameFirst = col_character(),
##   nameLast = col_character(),
##   bats = col_character(),
##   throws = col_character(),
##   teamID = col_character(),
##   lg = col_character(),
##   avg = col_double(),
##   obp = col_double(),
##   slg = col_double(),
##   woba = col_double(),
##   ROTO = col_character(),
##   pos1 = col_character(),
##   pos2 = col_character(),
##   pos3 = col_character(),
##   pos4 = col_character(),
##   pos5 = col_character(),
##   baseball_prospectus_id = col_character()
## )
## See spec(...) for full column specifications.

Now we use datatable to explore the data a little bit.

# Report the dimensions of the imported table.
dim(mlb2014)
## [1] 649  48
# Browse a random sample of 50 rows in an interactive table, keeping only the
# columns picked by ends_with("G"). TRUE is spelled out because the shorthand
# T is an ordinary variable that can be reassigned.
datatable(
  select(sample_n(mlb2014, 50), ends_with("G")),
  options = list(scrollX = TRUE, pageLength = 5)
)

Association: categorical variables

Positions versus handedness

alt text

alt text

# Frequency of each value of pos1 (presumably the player's primary position -- confirm).
table(mlb2014$pos1)
## 
##  1B  2B  3B   C  DH  OF  SS 
##  88  76  70  98  11 238  68
# Frequency of each batting-side code in bats.
# NOTE(review): both B and S occur; presumably both denote switch hitters -- confirm against the data source.
table(mlb2014$bats)
## 
##   B   L   R   S 
##  68 203 346  28
# Two-way contingency table: pos1 in rows, bats in columns.
table(mlb2014$pos1, mlb2014$bats)
##     
##        B   L   R   S
##   1B   7  43  36   2
##   2B  14  20  36   4
##   3B   6  19  41   4
##   C   10  11  73   3
##   DH   2   7   2   0
##   OF  15  96 115  11
##   SS  14   7  43   4
# Four colours from the "Set2" palette, matching the four batting-side codes
# in the table. The name col.use is kept because later plots reuse it.
col.use <- brewer.pal(n = 4, name = "Set2")
# plot() on a two-way table draws a mosaic plot. The colour argument is
# written in full as `color` (the name mosaicplot() documents) instead of
# relying on partial matching of `col`.
plot(table(mlb2014$pos1, mlb2014$bats), color = col.use)

# Pearson chi-squared test of independence between pos1 and bats.
# The warning printed with the result says the chi-squared approximation may be incorrect.
# NOTE(review): likely caused by small expected counts in sparse cells (e.g. the DH row) -- confirm.
chisq.test(table(mlb2014$pos1, mlb2014$bats))
## Warning in chisq.test(table(mlb2014$pos1, mlb2014$bats)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(mlb2014$pos1, mlb2014$bats)
## X-squared = 77.421, df = 18, p-value = 2.412e-09

Association: categorical versus continuous

Slugging percentage versus positions.

From wikipedia: “In baseball statistics, slugging percentage (SLG) is a popular measure of the power of a hitter.”

# Histogram of slugging percentage over all players.
hist(x = mlb2014$slg)

# Boxplots of slugging percentage, one box per position. col.use holds four
# colours, so they are recycled across the seven positions.
plot(x = as.factor(mlb2014$pos1), y = mlb2014$slg, col = col.use)

# Linear model of slugging percentage on position (a one-way ANOVA). The
# model is fitted once and reused for both the coefficient summary and the
# ANOVA table, rather than refitting the identical model for each call.
# Coefficients are differences from the baseline level, 1B.
slg_fit <- lm(slg ~ as.factor(pos1), data = mlb2014)
summary(slg_fit)
## 
## Call:
## lm(formula = slg ~ as.factor(pos1), data = mlb2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2958 -0.0498  0.0042  0.0592  0.8976 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.389805   0.010937  35.641  < 2e-16 ***
## as.factor(pos1)2B -0.058523   0.016315  -3.587 0.000361 ***
## as.factor(pos1)3B -0.001376   0.016379  -0.084 0.933076    
## as.factor(pos1)C  -0.041127   0.015216  -2.703 0.007064 ** 
## as.factor(pos1)DH -0.124090   0.040078  -3.096 0.002050 ** 
## as.factor(pos1)OF -0.027011   0.012895  -2.095 0.036613 *  
## as.factor(pos1)SS -0.053897   0.016725  -3.223 0.001338 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.102 on 609 degrees of freedom
##   (33 observations deleted due to missingness)
## Multiple R-squared:  0.04606,    Adjusted R-squared:  0.03666 
## F-statistic: 4.901 on 6 and 609 DF,  p-value: 6.604e-05
anova(slg_fit)
## Analysis of Variance Table
## 
## Response: slg
##                  Df Sum Sq  Mean Sq F value    Pr(>F)    
## as.factor(pos1)   6 0.3060 0.051002  4.9009 6.604e-05 ***
## Residuals       609 6.3376 0.010407                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

On base percentage versus position.

Another important baseball statistic is On Base Percentage (OBP).

# Histogram of on-base percentage over all players.
hist(x = mlb2014$obp)

# Boxplots of on-base percentage, one box per position. col.use holds four
# colours, so they are recycled across the seven positions.
plot(x = as.factor(mlb2014$pos1), y = mlb2014$obp, col = col.use)

# Linear model of on-base percentage on position (a one-way ANOVA). The
# model is fitted once and reused for both the coefficient summary and the
# ANOVA table, rather than refitting the identical model for each call.
# Coefficients are differences from the baseline level, 1B.
obp_fit <- lm(obp ~ as.factor(pos1), data = mlb2014)
summary(obp_fit)
## 
## Call:
## lm(formula = obp ~ as.factor(pos1), data = mlb2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24250 -0.02645  0.00650  0.03466  0.31666 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.305287   0.006387  47.795   <2e-16 ***
## as.factor(pos1)2B -0.010676   0.009492  -1.125   0.2611    
## as.factor(pos1)3B  0.003056   0.009566   0.319   0.7495    
## as.factor(pos1)C  -0.014513   0.008886  -1.633   0.1029    
## as.factor(pos1)DH -0.051430   0.023407  -2.197   0.0284 *  
## as.factor(pos1)OF -0.006790   0.007503  -0.905   0.3659    
## as.factor(pos1)SS -0.015410   0.009768  -1.578   0.1151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05958 on 616 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.01604,    Adjusted R-squared:  0.006459 
## F-statistic: 1.674 on 6 and 616 DF,  p-value: 0.1249
anova(obp_fit)
## Analysis of Variance Table
## 
## Response: obp
##                  Df  Sum Sq   Mean Sq F value Pr(>F)
## as.factor(pos1)   6 0.03565 0.0059417   1.674 0.1249
## Residuals       616 2.18648 0.0035495

Association: continuous versus continuous

On base percentage versus slugging percentage

# Pearson correlation of slg and obp; the result is NA because cor() does
# not drop missing values by default.
cor(x = mlb2014$slg, y = mlb2014$obp)
## [1] NA
# Same correlation, restricted to rows where both values are present.
cor(x = mlb2014$slg, y = mlb2014$obp, use = "complete.obs")
## [1] 0.7415133
# Scatter plot of slugging percentage (y) against on-base percentage (x).
plot(slg ~ obp, data = mlb2014)

# Significance test and confidence interval for the correlation.
cor.test(~ slg + obp, data = mlb2014)
## 
##  Pearson's product-moment correlation
## 
## data:  slg and obp
## t = 27.385, df = 614, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7037393 0.7751067
## sample estimates:
##       cor 
## 0.7415133

On base percentage versus age

# Histogram of player age.
hist(x = mlb2014$age)

# Scatter plot of on-base percentage (y) against age (x).
plot(obp ~ age, data = mlb2014)

# Simple linear regression of on-base percentage on age.
summary(lm(obp ~ age, data = mlb2014))
## 
## Call:
## lm(formula = obp ~ age, data = mlb2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23683 -0.02771  0.00653  0.03512  0.32886 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.2788919  0.0169291  16.474   <2e-16 ***
## age         0.0006635  0.0005954   1.114    0.266    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05981 on 620 degrees of freedom
##   (27 observations deleted due to missingness)
## Multiple R-squared:  0.001999,   Adjusted R-squared:  0.0003894 
## F-statistic: 1.242 on 1 and 620 DF,  p-value: 0.2655
# Pearson correlation test between age and on-base percentage.
cor.test(~ age + obp, data = mlb2014)
## 
##  Pearson's product-moment correlation
## 
## data:  age and obp
## t = 1.1144, df = 620, p-value = 0.2655
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03402394  0.12289378
## sample estimates:
##       cor 
## 0.0447107