This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code. Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
In this tutorial, we will look at the performance of Major League Baseball (MLB) players in 2014. The data come from baseballguru.com.
First we load the R libraries that we need for this tutorial. Basic libraries of functions are loaded every time R starts. More specialized functions need to be loaded before they can be used.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(DT)
library(RColorBrewer)
Now let’s read in the baseball 2014 batting performance data set.
# First attempt: read the batting data with readr's default settings.
mlb2014 <- read_csv(file = "data/mlb2014.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## playerID = col_character(),
## nameFirst = col_character(),
## nameLast = col_character(),
## bats = col_character(),
## throws = col_character(),
## teamID = col_character(),
## lg = col_character(),
## avg = col_character(),
## obp = col_character(),
## slg = col_character(),
## woba = col_character(),
## ROTO = col_character(),
## pos1 = col_character(),
## pos2 = col_character(),
## pos3 = col_character(),
## pos4 = col_character(),
## pos5 = col_character(),
## baseball_prospectus_id = col_character()
## )
## See spec(...) for full column specifications.
Several numeric variables (e.g. avg, obp, slg and woba) are read as characters. This is because “-” was used as an indicator of a missing value. We add “-” to the set of strings recognized as NA and the issue is solved.
# Re-read the file, additionally treating "-" as a missing-value marker so that
# avg, obp, slg and woba are parsed as doubles instead of characters.
mlb2014 <- read_csv(file = "data/mlb2014.csv", na = c("", "-", "NA"))
## Parsed with column specification:
## cols(
## .default = col_integer(),
## playerID = col_character(),
## nameFirst = col_character(),
## nameLast = col_character(),
## bats = col_character(),
## throws = col_character(),
## teamID = col_character(),
## lg = col_character(),
## avg = col_double(),
## obp = col_double(),
## slg = col_double(),
## woba = col_double(),
## ROTO = col_character(),
## pos1 = col_character(),
## pos2 = col_character(),
## pos3 = col_character(),
## pos4 = col_character(),
## pos5 = col_character(),
## baseball_prospectus_id = col_character()
## )
## See spec(...) for full column specifications.
Now we use datatable to explore the data a little bit.
# Report the number of rows and columns of mlb2014.
dim(mlb2014)
## [1] 649 48
# Browse a random sample of 50 rows in an interactive table, keeping only the
# columns whose names end in "G" (ends_with() ignores case by default). The
# widget scrolls horizontally and shows 5 rows per page.
datatable(
  select(sample_n(mlb2014, 50), ends_with("G")),
  options = list(scrollX = TRUE, pageLength = 5)
)
alt text
# Frequency table of the pos1 column (presumably each player's primary
# position -- confirm against the data dictionary).
table(mlb2014$pos1)
##
## 1B 2B 3B C DH OF SS
## 88 76 70 98 11 238 68
# Frequency table of the bats column.
# NOTE(review): the output shows both "B" and "S" levels; these may both denote
# switch hitters -- confirm against the data source before interpreting them
# as separate groups.
table(mlb2014$bats)
##
## B L R S
## 68 203 346 28
# Two-way contingency table: rows are pos1 levels, columns are bats levels.
table(mlb2014$pos1, mlb2014$bats)
##
## B L R S
## 1B 7 43 36 2
## 2B 14 20 36 4
## 3B 6 19 41 4
## C 10 11 73 3
## DH 2 7 2 0
## OF 15 96 115 11
## SS 14 7 43 4
# Four-colour palette from RColorBrewer's "Set2" scheme, used to colour plots.
col.use <- brewer.pal(4, "Set2")
# Plot the pos1-by-bats contingency table.
plot(table(mlb2014$pos1, mlb2014$bats), col=col.use)
# Pearson chi-squared test of independence between pos1 and bats. Some cells
# of the table are small (e.g. the DH row), which is why R warns that the
# chi-squared approximation may be incorrect.
chisq.test(table(mlb2014$pos1, mlb2014$bats))
## Warning in chisq.test(table(mlb2014$pos1, mlb2014$bats)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(mlb2014$pos1, mlb2014$bats)
## X-squared = 77.421, df = 18, p-value = 2.412e-09
From Wikipedia: “In baseball statistics, slugging percentage (SLG) is a popular measure of the power of a hitter.”
# Distribution of slugging percentage over all players.
hist(mlb2014$slg)
# Box plots of slg by pos1. pos1 has seven levels, so build a palette with one
# colour per level; the four-colour col.use would be recycled and give
# different positions the same colour.
pos.col <- brewer.pal(nlevels(as.factor(mlb2014$pos1)), "Set2")
plot(as.factor(mlb2014$pos1), mlb2014$slg, col=pos.col)
# Linear model of slg on pos1; the first level (1B) is the baseline, so each
# coefficient is the difference in mean slg from 1B.
summary(lm(slg~as.factor(pos1), data=mlb2014))
##
## Call:
## lm(formula = slg ~ as.factor(pos1), data = mlb2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2958 -0.0498 0.0042 0.0592 0.8976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.389805 0.010937 35.641 < 2e-16 ***
## as.factor(pos1)2B -0.058523 0.016315 -3.587 0.000361 ***
## as.factor(pos1)3B -0.001376 0.016379 -0.084 0.933076
## as.factor(pos1)C -0.041127 0.015216 -2.703 0.007064 **
## as.factor(pos1)DH -0.124090 0.040078 -3.096 0.002050 **
## as.factor(pos1)OF -0.027011 0.012895 -2.095 0.036613 *
## as.factor(pos1)SS -0.053897 0.016725 -3.223 0.001338 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.102 on 609 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.04606, Adjusted R-squared: 0.03666
## F-statistic: 4.901 on 6 and 609 DF, p-value: 6.604e-05
# ANOVA table for the same slg-on-pos1 model: a single F test of whether mean
# slg differs across the levels of pos1.
anova(lm(slg~as.factor(pos1), data=mlb2014))
## Analysis of Variance Table
##
## Response: slg
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(pos1) 6 0.3060 0.051002 4.9009 6.604e-05 ***
## Residuals 609 6.3376 0.010407
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Another important baseball statistic is on-base percentage (OBP).
# Distribution of on-base percentage over all players.
hist(mlb2014$obp)
# Box plots of obp by pos1. pos1 has seven levels, so build a palette with one
# colour per level; the four-colour col.use would be recycled and give
# different positions the same colour.
pos.col <- brewer.pal(nlevels(as.factor(mlb2014$pos1)), "Set2")
plot(as.factor(mlb2014$pos1), mlb2014$obp, col=pos.col)
# Linear model of obp on pos1; the first level (1B) is the baseline, so each
# coefficient is the difference in mean obp from 1B.
summary(lm(obp~as.factor(pos1), data=mlb2014))
##
## Call:
## lm(formula = obp ~ as.factor(pos1), data = mlb2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24250 -0.02645 0.00650 0.03466 0.31666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.305287 0.006387 47.795 <2e-16 ***
## as.factor(pos1)2B -0.010676 0.009492 -1.125 0.2611
## as.factor(pos1)3B 0.003056 0.009566 0.319 0.7495
## as.factor(pos1)C -0.014513 0.008886 -1.633 0.1029
## as.factor(pos1)DH -0.051430 0.023407 -2.197 0.0284 *
## as.factor(pos1)OF -0.006790 0.007503 -0.905 0.3659
## as.factor(pos1)SS -0.015410 0.009768 -1.578 0.1151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05958 on 616 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.01604, Adjusted R-squared: 0.006459
## F-statistic: 1.674 on 6 and 616 DF, p-value: 0.1249
# ANOVA table for the same obp-on-pos1 model: a single F test of whether mean
# obp differs across the levels of pos1.
anova(lm(obp~as.factor(pos1), data=mlb2014))
## Analysis of Variance Table
##
## Response: obp
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(pos1) 6 0.03565 0.0059417 1.674 0.1249
## Residuals 616 2.18648 0.0035495
# With the default handling of missing values, cor() returns NA here because
# slg and obp contain NAs.
cor(mlb2014$slg, mlb2014$obp)
## [1] NA
# Compute the correlation using only the rows where both slg and obp are
# observed.
cor(mlb2014$slg, mlb2014$obp, use="complete.obs")
## [1] 0.7415133
# Scatter plot of slg (vertical axis) against obp (horizontal axis).
plot(slg~obp, data=mlb2014)
# Test whether the Pearson correlation between slg and obp differs from zero;
# also reports a 95% confidence interval for the correlation.
cor.test(~slg+obp, data=mlb2014)
##
## Pearson's product-moment correlation
##
## data: slg and obp
## t = 27.385, df = 614, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7037393 0.7751067
## sample estimates:
## cor
## 0.7415133
# Distribution of the age column.
hist(mlb2014$age)
# Scatter plot of obp (vertical axis) against age (horizontal axis).
plot(obp~age, data=mlb2014)
# Simple linear regression of obp on age.
summary(lm(obp~age, data=mlb2014))
##
## Call:
## lm(formula = obp ~ age, data = mlb2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23683 -0.02771 0.00653 0.03512 0.32886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2788919 0.0169291 16.474 <2e-16 ***
## age 0.0006635 0.0005954 1.114 0.266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05981 on 620 degrees of freedom
## (27 observations deleted due to missingness)
## Multiple R-squared: 0.001999, Adjusted R-squared: 0.0003894
## F-statistic: 1.242 on 1 and 620 DF, p-value: 0.2655
# Test whether the Pearson correlation between age and obp differs from zero.
# The t statistic and p-value match the slope test in the obp-on-age
# regression.
cor.test(~age+obp, data=mlb2014)
##
## Pearson's product-moment correlation
##
## data: age and obp
## t = 1.1144, df = 620, p-value = 0.2655
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03402394 0.12289378
## sample estimates:
## cor
## 0.0447107