Tutorial 2: finding associations

Baseball players' batting performance in 2014

In this tutorial, we will look at the performance of Major League Baseball (MLB) players in 2014. The data come from baseballguru.com.

First we load the R libraries that we need for this tutorial. Basic libraries of functions are loaded every time R starts. More specialized functions need to be loaded before they can be used.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(DT)
library(RColorBrewer)

Read in the data

Now let’s read in the baseball 2014 batting performance data set.

# First attempt: read the 2014 batting data with read_csv()'s default
# missing-value settings.
mlb2014 <- read_csv(file = "data/mlb2014.csv")

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   playerID = col_character(),
##   nameFirst = col_character(),
##   nameLast = col_character(),
##   bats = col_character(),
##   throws = col_character(),
##   teamID = col_character(),
##   lg = col_character(),
##   avg = col_character(),
##   obp = col_character(),
##   slg = col_character(),
##   woba = col_character(),
##   ROTO = col_character(),
##   pos1 = col_character(),
##   pos2 = col_character(),
##   pos3 = col_character(),
##   pos4 = col_character(),
##   pos5 = col_character(),
##   baseball_prospectus_id = col_character()
## )

## See spec(...) for full column specifications.

Several numeric variables (avg, obp, slg and woba) are read as characters. This is because “-” was used to indicate a missing value. Once we add “-” to the vector of strings recognized as NA, the issue is solved.

# Re-read the file, additionally treating "-" as a missing-value marker so
# that avg, obp, slg and woba are parsed as doubles.
mlb2014 <- read_csv(file = "data/mlb2014.csv", na = c("", "-", "NA"))

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   playerID = col_character(),
##   nameFirst = col_character(),
##   nameLast = col_character(),
##   bats = col_character(),
##   throws = col_character(),
##   teamID = col_character(),
##   lg = col_character(),
##   avg = col_double(),
##   obp = col_double(),
##   slg = col_double(),
##   woba = col_double(),
##   ROTO = col_character(),
##   pos1 = col_character(),
##   pos2 = col_character(),
##   pos3 = col_character(),
##   pos4 = col_character(),
##   pos5 = col_character(),
##   baseball_prospectus_id = col_character()
## )

## See spec(...) for full column specifications.

Now we use datatable to explore the data a little bit.

# Number of rows and columns in the data set.
dim(mlb2014)

## [1] 649  48

# Preview a random sample of 50 rows, keeping only the columns whose names
# end in "G" (ends_with() ignores case by default), as an interactive table
# that scrolls horizontally and shows five rows per page.
mlb2014 %>%
  sample_n(50) %>%
  select(ends_with("G")) %>%
  datatable(options = list(scrollX = TRUE, pageLength = 5))

Association: categorical variables

Positions versus handedness

alt text

# Count players by the value of pos1.
table(mlb2014$pos1)

## 
##  1B  2B  3B   C  DH  OF  SS 
##  88  76  70  98  11 238  68

# Count players by the value of bats.
# NOTE(review): bats contains both "B" and "S" -- presumably both denote
# switch hitters; confirm against the data source and consider merging them.
table(mlb2014$bats)

## 
##   B   L   R   S 
##  68 203 346  28

# Cross-tabulate pos1 (rows) against bats (columns).
table(mlb2014$pos1, mlb2014$bats)

##     
##        B   L   R   S
##   1B   7  43  36   2
##   2B  14  20  36   4
##   3B   6  19  41   4
##   C   10  11  73   3
##   DH   2   7   2   0
##   OF  15  96 115  11
##   SS  14   7  43   4

# Four colours from the Set2 palette; bats has four levels (B, L, R, S).
col.use <- brewer.pal(4, 'Set2')
# Plot the pos1-by-bats contingency table.
plot(table(mlb2014$pos1, mlb2014$bats), col=col.use)

# Pearson chi-squared test of independence between pos1 and bats.
# Some cells are sparse (the DH row has only 11 players in total), so several
# expected counts are small and R warns that the chi-squared approximation may
# be incorrect; chisq.test(..., simulate.p.value = TRUE) is an alternative.
chisq.test(table(mlb2014$pos1, mlb2014$bats))

## Warning in chisq.test(table(mlb2014$pos1, mlb2014$bats)): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  table(mlb2014$pos1, mlb2014$bats)
## X-squared = 77.421, df = 18, p-value = 2.412e-09

Association: categorical versus continuous

Slugging percentage versus positions.

From wikipedia: “In baseball statistics, slugging percentage (SLG) is a popular measure of the power of a hitter.”

# Histogram of slugging percentage.
hist(mlb2014$slg)

# Box plots of slg by pos1. Request one colour per position level: the
# four-colour col.use palette would be recycled across the seven boxes, giving
# unrelated positions the same colour.
plot(as.factor(mlb2014$pos1), mlb2014$slg,
     col=brewer.pal(nlevels(as.factor(mlb2014$pos1)), 'Set2'))

# Regress slg on pos1. 1B is the reference level, so each coefficient is the
# difference in mean slg relative to 1B.
summary(lm(slg~as.factor(pos1), data=mlb2014))

## 
## Call:
## lm(formula = slg ~ as.factor(pos1), data = mlb2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2958 -0.0498  0.0042  0.0592  0.8976 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.389805   0.010937  35.641  < 2e-16 ***
## as.factor(pos1)2B -0.058523   0.016315  -3.587 0.000361 ***
## as.factor(pos1)3B -0.001376   0.016379  -0.084 0.933076    
## as.factor(pos1)C  -0.041127   0.015216  -2.703 0.007064 ** 
## as.factor(pos1)DH -0.124090   0.040078  -3.096 0.002050 ** 
## as.factor(pos1)OF -0.027011   0.012895  -2.095 0.036613 *  
## as.factor(pos1)SS -0.053897   0.016725  -3.223 0.001338 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.102 on 609 degrees of freedom
##   (33 observations deleted due to missingness)
## Multiple R-squared:  0.04606,    Adjusted R-squared:  0.03666 
## F-statistic: 4.901 on 6 and 609 DF,  p-value: 6.604e-05

# ANOVA table for the same model: overall F test of pos1 as a predictor of slg.
anova(lm(slg~as.factor(pos1), data=mlb2014))

## Analysis of Variance Table
## 
## Response: slg
##                  Df Sum Sq  Mean Sq F value    Pr(>F)    
## as.factor(pos1)   6 0.3060 0.051002  4.9009 6.604e-05 ***
## Residuals       609 6.3376 0.010407                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

On base percentage versus position.

Another important baseball statistic is on-base percentage (OBP).

# Histogram of on-base percentage.
hist(mlb2014$obp)

# Box plots of obp by pos1. Request one colour per position level: the
# four-colour col.use palette would be recycled across the seven boxes, giving
# unrelated positions the same colour.
plot(as.factor(mlb2014$pos1), mlb2014$obp,
     col=brewer.pal(nlevels(as.factor(mlb2014$pos1)), 'Set2'))

# Regress obp on pos1. 1B is the reference level, so each coefficient is the
# difference in mean obp relative to 1B.
summary(lm(obp~as.factor(pos1), data=mlb2014))

## 
## Call:
## lm(formula = obp ~ as.factor(pos1), data = mlb2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24250 -0.02645  0.00650  0.03466  0.31666 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.305287   0.006387  47.795   <2e-16 ***
## as.factor(pos1)2B -0.010676   0.009492  -1.125   0.2611    
## as.factor(pos1)3B  0.003056   0.009566   0.319   0.7495    
## as.factor(pos1)C  -0.014513   0.008886  -1.633   0.1029    
## as.factor(pos1)DH -0.051430   0.023407  -2.197   0.0284 *  
## as.factor(pos1)OF -0.006790   0.007503  -0.905   0.3659    
## as.factor(pos1)SS -0.015410   0.009768  -1.578   0.1151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05958 on 616 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.01604,    Adjusted R-squared:  0.006459 
## F-statistic: 1.674 on 6 and 616 DF,  p-value: 0.1249

# ANOVA table for the same model: overall F test of pos1 as a predictor of obp.
anova(lm(obp~as.factor(pos1), data=mlb2014))

## Analysis of Variance Table
## 
## Response: obp
##                  Df  Sum Sq   Mean Sq F value Pr(>F)
## as.factor(pos1)   6 0.03565 0.0059417   1.674 0.1249
## Residuals       616 2.18648 0.0035495

Association: continuous versus continuous

On base percentage versus slugging percentage

# With the default settings cor() returns NA here, because slg and obp contain
# missing values.
cor(mlb2014$slg, mlb2014$obp)

## [1] NA

# Compute the correlation from only the rows where both slg and obp are
# present.
cor(mlb2014$slg, mlb2014$obp, use="complete.obs")

## [1] 0.7415133

# Scatter plot of slg (vertical axis) against obp (horizontal axis).
plot(slg~obp, data=mlb2014)

# Test whether the Pearson correlation between slg and obp differs from zero.
cor.test(~slg+obp, data=mlb2014)

## 
##  Pearson's product-moment correlation
## 
## data:  slg and obp
## t = 27.385, df = 614, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7037393 0.7751067
## sample estimates:
##       cor 
## 0.7415133

On base percentage versus age

# Histogram of player age.
hist(mlb2014$age)

# Scatter plot of obp (vertical axis) against age (horizontal axis).
plot(obp~age, data=mlb2014)

# Simple linear regression of obp on age; the age coefficient is the estimated
# change in obp per unit of age.
summary(lm(obp~age, data=mlb2014))

## 
## Call:
## lm(formula = obp ~ age, data = mlb2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23683 -0.02771  0.00653  0.03512  0.32886 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.2788919  0.0169291  16.474   <2e-16 ***
## age         0.0006635  0.0005954   1.114    0.266    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05981 on 620 degrees of freedom
##   (27 observations deleted due to missingness)
## Multiple R-squared:  0.001999,   Adjusted R-squared:  0.0003894 
## F-statistic: 1.242 on 1 and 620 DF,  p-value: 0.2655

# Test whether the Pearson correlation between age and obp differs from zero.
cor.test(~age+obp, data=mlb2014)

## 
##  Pearson's product-moment correlation
## 
## data:  age and obp
## t = 1.1144, df = 620, p-value = 0.2655
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03402394  0.12289378
## sample estimates:
##       cor 
## 0.0447107