This script is a collection of graphs created by the students of Bio5702, Current Approaches in Plant Research, a graduate course at Washington University in St. Louis. These graphs are explorations of the dataset “candy_nutrition.txt”, the nutrition label information from 75 different candy types. The point of this homework assignment was to become acquainted with R, basic statistics, data visualization, and ggplot2.

Student #1

Begin homework here

library(ggplot2)
# Load the candy nutrition table; the first line of the file holds the column
# names. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the file is parsed.
data <- read.table("./candy_nutrition.txt", header = TRUE)

ggplot1, density plot of cholesterol content of different candy

# Filled density plot of cholesterol content, one band per candy class.
# position = "fill" stacks the count-scaled densities and rescales them so the
# bands always fill the panel.
# NOTE(review): `..count..` is the older dot-dot notation for computed
# variables; newer ggplot2 releases prefer after_stat(count) - confirm the
# installed version before switching.
d <- ggplot(data, aes(x = cholesterol_mg, y = ..count.., fill = class))
d +
  geom_density(position = "fill") +
  scale_fill_brewer(name = "class of candy", type = "div", palette = 1) +
  scale_colour_brewer(name = "class of candy", type = "div", palette = 1) +
  labs(
    x = "cholesterol in mg",
    y = "count",
    title = "Density plot of total cholesterol colored by candy class"
  )

calories of candy is normalized to its serving_size_g

# Calories per gram of serving, stored as a new column of `data`.
data$Normalized_cal <- with(data, calories / serving_size_g)

ggplot2, boxplot of normalized calories of different candy class

# Box plot of calories per gram of serving, one box per candy class. Outline
# and fill are both mapped to class and use the same qualitative palette.
d <- ggplot(
  data,
  aes(x = class, y = Normalized_cal, colour = class, fill = class)
)
d +
  geom_boxplot(alpha = 0.9) +
  scale_fill_brewer(name = "Classes of candy", type = "qual", palette = 1) +
  scale_colour_brewer(name = "Classes of candy", type = "qual", palette = 1) +
  labs(
    x = "Class of candy",
    y = "Calories per gm serving size",
    title = "Boxplot of normalized calories colored by candy class"
  ) +
  theme_bw() +
  # Rotate the class names on the x axis by 90 degrees.
  theme(axis.text.x = element_text(angle = 90))

Normalized fat and carbohydrate

# Fat and carbohydrate per gram of serving, stored as new columns of `data`.
data$Normalized_fat <- with(data, total_fat_g / serving_size_g)
data$Normalized_carb <- with(data, total_carb_g / serving_size_g)

ggplot3, scatterplot of total fat vs. total carbohydrate

# Scatterplot of per-gram fat against per-gram carbohydrate, one point per
# candy, coloured by class.
d <- ggplot(
  data,
  aes(x = Normalized_carb, y = Normalized_fat, colour = class)
)
d +
  geom_point(size = 4, alpha = 0.8) +
  # Only `colour` is mapped in aes() above; the fill scale is kept as written.
  scale_fill_brewer(name = "Classes of candy", type = "qual", palette = 1) +
  scale_colour_brewer(name = "Classes of candy", type = "qual", palette = 1) +
  labs(
    x = "Total carbohydrate gm per gm serving size",
    y = "Total fat gm per gm serving size",
    title = "Fat vs. Carbohydrate"
  ) +
  theme_bw()

Student #2

Begin homework here

library(ggplot2)
# Load the candy nutrition table; the first line of the file holds the column
# names. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the file is parsed.
data <- read.table("./candy_nutrition.txt", header = TRUE)

normalized scatterplot of calories vs fat

# Divide calories and total fat by the serving size in grams and store the
# results as new columns of `data`.
data$Calories_per_serving_size <- with(data, calories / serving_size_g)
data$fat_per_serving_size <- with(data, total_fat_g / serving_size_g)

# Show the first six rows, including the two new columns.
head(data, n = 6L)
##     id                     name     company     class serving_size_g
## 1 id_1                mini_eggs     cadbury chocolate             40
## 2 id_2    soft_eating_liquorice darrell_lea liquorice             42
## 3 id_3              raspberries      haribo     sugar             39
## 4 id_4               candy_corn        nice     gummi             41
## 5 id_5           crawlers_minis      trolli      sour             40
## 6 id_6 strawberry_shortcake_mms        mars chocolate             42
##   calories calories_fat total_fat_g saturated_fat_g cholesterol_mg
## 1      190           70           8               5              5
## 2      140           10           1               0              0
## 3      140            0           0               0              0
## 4      160          160           0               0              0
## 5      130            0           0               0              0
## 6      210          100          10               6              5
##   sodium_mg total_carb_g dietary_fiber_g sugars_g protein_g
## 1        30           28             0.5       27         2
## 2        40           30             0.0       16         1
## 3         0           36             0.0       29         1
## 4        75           39             0.0       32         0
## 5        35           31             0.0       24         1
## 6        40           29             0.0       28         2
##   primary_ingredient Calories_per_serving_size fat_per_serving_size
## 1          chocolate                  4.750000           0.20000000
## 2              syrup                  3.333333           0.02380952
## 3              sugar                  3.589744           0.00000000
## 4              sugar                  3.902439           0.00000000
## 5              syrup                  3.250000           0.00000000
## 6          chocolate                  5.000000           0.23809524
# Scatterplot of per-gram fat against per-gram calories, one point per candy,
# coloured by candy class.
p <- ggplot(
  data = data,
  aes(x = Calories_per_serving_size, y = fat_per_serving_size, colour = class)
)
p +
  geom_point(size = 4, alpha = 0.5) + # alpha = transparency of the points
  # Class is mapped to `colour`, so the brewer palette and the legend title
  # have to be set on the colour scale; a fill scale has nothing to act on here.
  scale_colour_brewer(type = "qual", palette = 5, name = "Classes of candy") +
  labs(
    x = "Calories",
    y = "Total fat (g)",
    title = "Fat vs Calories, normalized for serving size"
  )

comparison of saturated fat from different chocolate companies

# Keep only the rows whose class is "chocolate".
just_chocolate <- subset(data, class == "chocolate")

# Box plot of saturated fat per company, one manually chosen fill per company.
# NOTE(review): the manual scale supplies five colours, so this presumably
# assumes at most five chocolate companies - confirm against the data.
p <- ggplot(
  data = just_chocolate,
  aes(x = company, y = saturated_fat_g, fill = company)
)
p +
  geom_boxplot() +
  labs(
    x = "Company",
    y = "Saturated Fat (g)",
    title = "Saturated Fat in the Chocolate from Different Companies"
  ) +
  scale_fill_manual(values = c("green", "red", "orange", "yellow", "blue"))

dotplot histogram of sodium content

# Dot plot (one dot per candy) of sodium content, filled by candy class with
# one manually chosen colour per class.
p <- ggplot(data = data, aes(x = sodium_mg, fill = class))
p +
  geom_dotplot() +
  scale_fill_manual(
    values = c("brown", "green", "red", "black", "orange", "yellow", "blue")
  ) +
  xlab(label = "Sodium (mg)") +
  ylab(label = "Fraction of Total Candy") +
  # The plotted variable is sodium_mg, so the title names sodium (it
  # previously said "Sugar").
  ggtitle("Dotplot of Sodium content")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Student #3

library(ggplot2)
# Load the candy nutrition table; the first line of the file holds the column names.
data <- read.table(file = "./candy_nutrition.txt", header = TRUE)

Let’s look at saturated fat versus total fat to get an idea of what percentage saturated fat is in each candy

# Scatterplot of saturated fat against total fat, one point per candy,
# coloured by candy class.
p <- ggplot(
  data = data,
  aes(x = total_fat_g, y = saturated_fat_g, colour = class)
)

p +
  geom_point(size = 4, alpha = 0.7) +
  # Class is mapped to `colour`, so the palette and legend title belong on the
  # colour scale; a fill scale has nothing to act on in this plot.
  scale_colour_brewer(type = "qual", palette = 1, name = "Classes of candy") +
  xlab(label = "Total fat in grams") +
  ylab(label = "Saturated fat in grams") +
  ggtitle("Saturated versus total fat") +
  theme_bw()

It seems like this mostly follows a straight regression line, so saturated fat is strongly correlated with total fat. (Note that a slope of 1 would mean that all of the fat is saturated; a slope of about 0.5 is what would indicate that, for most of these candies, roughly half of the fat is unsaturated.)

Let’s see if there is any correlation between the primary ingredient and the total amount of sugar per serving size

# Sugar per gram of serving for each primary ingredient, one point per candy,
# coloured by candy class.
p <- ggplot(
  data = data,
  aes(x = primary_ingredient, y = (sugars_g / serving_size_g), colour = class)
)

p +
  geom_point(size = 4, alpha = 0.7) +
  # Class is mapped to `colour`, so the palette and legend title belong on the
  # colour scale; a fill scale has nothing to act on in this plot.
  scale_colour_brewer(type = "qual", palette = 1, name = "Classes of candy") +
  xlab(label = "Primary ingredient") +
  ylab(label = "Grams of sugar per serving") +
  ggtitle("Grams of sugar per serving versus primary ingredient") +
  theme_bw()

DC: Let’s try the same graph with position=“jitter”

# Same sugar-versus-ingredient plot, reusing the existing `p`, with the points
# jittered so that overlapping candies become visible.
p +
  geom_point(size = 4, alpha = 0.7, position = "jitter") +
  # Class is mapped to `colour` in `p`, so the palette and legend title belong
  # on the colour scale; a fill scale has nothing to act on in this plot.
  scale_colour_brewer(type = "qual", palette = 1, name = "Classes of candy") +
  xlab(label = "Primary ingredient") +
  ylab(label = "Grams of sugar per serving") +
  ggtitle("Grams of sugar per serving versus primary ingredient") +
  theme_bw()

Now let’s do the same but testing sodium

# Sodium per gram of serving for each primary ingredient, one jittered point
# per candy, coloured by candy class.
p <- ggplot(
  data = data,
  aes(x = primary_ingredient, y = (sodium_mg / serving_size_g), colour = class)
)

p +
  geom_point(size = 4, alpha = 0.7, position = "jitter") +
  # Class is mapped to `colour`, so the palette and legend title belong on the
  # colour scale; a fill scale has nothing to act on in this plot.
  scale_colour_brewer(type = "qual", palette = 1, name = "Classes of candy") +
  xlab(label = "Primary ingredient") +
  ylab(label = "Milligrams of sodium per serving") +
  ggtitle("Milligrams of sodium per serving versus primary ingredient") +
  theme_bw()

There doesn’t seem to be much clustering of the data so there probably isn’t a very strong correlation between milligrams of sodium per serving and the type of primary ingredient

What about protein?

# Protein per gram of serving for each primary ingredient, one jittered point
# per candy, coloured by candy class.
p <- ggplot(
  data = data,
  aes(x = primary_ingredient, y = (protein_g / serving_size_g), colour = class)
)

p +
  geom_point(size = 4, alpha = 0.7, position = "jitter") +
  # Class is mapped to `colour`, so the palette and legend title belong on the
  # colour scale; a fill scale has nothing to act on in this plot.
  scale_colour_brewer(type = "qual", palette = 1, name = "Classes of candy") +
  xlab(label = "Primary ingredient") +
  ylab(label = "Grams of protein per serving") +
  # The y variable is protein, so the title names protein (it previously said
  # "sodium").
  ggtitle("Grams of protein per serving versus primary ingredient") +
  theme_bw()

Student #4

3 PLOTS ASSIGNMENT, due 5pm on 4-3-16

PLOT NUMBER ONE: a bar graph of mean calories per gram for each candy class

upload text document called candy_nutrition.txt by clicking “import dataset” in the Environment window, and choosing From Local File, and finding candy_nutrition.txt

rename candy_nutrition.txt as “data”

library(ggplot2)
# Load the candy nutrition table; the first line of the file holds the column names.
data <- read.table(file = "./candy_nutrition.txt", header = TRUE)

create a new column in the data table that is calories per gram for every candy: divide calories per serving divided by # of grams in serving

# Calories per gram of serving, stored as a new column of `data`.
data$cals_per_gram <- with(data, calories / serving_size_g)

then reduce the number of decimal places in my new column down to hundredths, aka 2 decimal places

# Round the calories-per-gram column to two decimal places, in place.
data$cals_per_gram <- round(data$cals_per_gram, digits = 2)

install the ggplot2 library, and load the library into my current session

library(ggplot2)

then make a scatterplot where the x-axis is candy name, and the y-axis is the number of calories per gram of each candy

# Scatterplot with one point per candy: candy name on x, calories per gram on y.
p <- ggplot(data, aes(x = name, y = cals_per_gram))
p +
  geom_point()

that’s ugly and the x-axis labels are unreadable, so instead I’ll make a plot where the x-axis is candy class, and the y-axis is the mean calories per gram of that class

simply calculating the mean of the cals_per_gram sorted by candy class using tapply gives me a vector, not a data frame

# Mean calories per gram within each candy class (one value per class).
mean_cals_per_gram_by_class <- with(data, tapply(cals_per_gram, class, mean))

look at the output vector by executing this:

# Display the per-class means.
print(mean_cals_per_gram_by_class)
##     chocolate         gummi    jelly_bean     liquorice peanut_butter 
##      4.751923      3.504444      3.450000      3.431667      5.020000 
##          sour         sugar 
##      3.642857      3.616667

the vector looks like this, and lacks column labels, which I need in order to generate a plot with the axes labeled properly: chocolate = 4.751923, gummi = 3.504444, jelly_bean = 3.450000, liquorice = 3.431667, peanut_butter = 5.020000, sour = 3.642857, sugar = 3.616667

can I use that “vector” to make a decent scatterplot?

# Attempt to plot the tapply() result directly: the input is the per-class
# vector rather than a data frame, and no aes() mapping for x or y is given.
# The notes that follow in this document record that this does not produce a
# plot; the block is kept as a record of that attempt.
p <- ggplot(mean_cals_per_gram_by_class) 
p + geom_point()

NOPE! Because this “vector” lacks column names and my ggplot command lacks definitions for the x and y axes, I can’t make a plot.

Therefore, I need to turn that “vector” into a data.frame

# Wrap the per-class means in a data frame. The argument is passed unnamed, so
# the single column is named after the variable and the classes end up as row
# names (see the printed output that follows in this document).
cal_density_by_class <- data.frame(mean_cals_per_gram_by_class)

look at my new data frame

# Display the one-column data frame of per-class means.
print(cal_density_by_class)
##               mean_cals_per_gram_by_class
## chocolate                        4.751923
## gummi                            3.504444
## jelly_bean                       3.450000
## liquorice                        3.431667
## peanut_butter                    5.020000
## sour                             3.642857
## sugar                            3.616667

that’s nice, but it still doesn’t have a column name for candy class. Instead of using “tapply”, use “aggregate” to create a data table that puts candy class in its own column

# Mean calories per gram for each candy class, returned as a data frame with
# the class in its own column.
aggregate_test <- aggregate(
  x = data$cals_per_gram,
  by = list(data$class),
  FUN = mean
)

look at that new table

# Display the aggregated table.
print(aggregate_test)
##         Group.1        x
## 1     chocolate 4.751923
## 2         gummi 3.504444
## 3    jelly_bean 3.450000
## 4     liquorice 3.431667
## 5 peanut_butter 5.020000
## 6          sour 3.642857
## 7         sugar 3.616667

it looks like this, with the columns “Group.1” and “x”: row 1 = chocolate, 4.751923; row 2 = gummi, 3.504444; row 3 = jelly_bean, 3.450000; row 4 = liquorice, 3.431667; row 5 = peanut_butter, 5.020000; row 6 = sour, 3.642857; row 7 = sugar, 3.616667

now it’s in a nice data table format like an excel file, not just a list of numbers like the “vector” was.

now I need to rename the columns from “Group.1” and “x” to “candy_class” and “avg_cals_per_gram”, respectively

# Give the first two columns descriptive names: the class label and its mean
# calories per gram.
names(aggregate_test)[1:2] <- c("candy_class", "avg_cals_per_gram")

now that I have a dataframe with column names, I’ll try turning it into a scatterplot, which ggplot2 calls geom_point

# Scatterplot of the mean calories per gram for each candy class.
p <- ggplot(aggregate_test, aes(x = candy_class, y = avg_cals_per_gram))
p +
  geom_point() +
  labs(title = "Calories per Gram by Candy Class")

I tried to turn this into a bar plot which would have also made the y-axis range start at zero so the values don’t look so skewed, but I was unsuccessful. Every attempt I made returned some error.

For example, running `p <- ggplot(data=aggregate_test, aes(candy_class, avg_cals_per_gram))` followed by `p + geom_bar()` returned: Error: stat_count() must not be used with a y aesthetic.

That’s probably the simplest of my various attempts at achieving a bar plot, but that error message was the main problem I couldn’t figure out how to fix.

DC: Let me see if I can help out. You can read in the geom_bar help page (?geom_bar), under “Details”, that by default geom_bar counts the rows in each group through stat=“count” (hence the stat_count() error above). Let’s change that to stat=“identity”:

# Bar chart of the per-class means. stat = "identity" makes the bar heights
# come from the y values instead of from row counts.
p <- ggplot(
  data = aggregate_test,
  mapping = aes(x = candy_class, y = avg_cals_per_gram)
)
p +
  geom_bar(stat = "identity")

So I’ll use the tips I found on Quick-R instead of ggplot

# Base-graphics bar chart of the mean calories per gram for each candy class.
# The bar heights come straight from the aggregated means, so no frequency
# table is needed: an extra unnamed argument next to a named `height` is not
# used as data (it is matched positionally to another barplot parameter).
# The class labels are converted to character so they print as text whether
# the column is stored as a factor or as strings.
barplot(
  height = aggregate_test$avg_cals_per_gram,
  names.arg = as.character(aggregate_test$candy_class),
  main = "Calories per Gram by Candy Class",
  xlab = "Candy Class",
  ylab = "Mean Calories Per Gram"
)

It would be nice to put the values at the ends of each bar, but I couldn’t find any tips online for that other than “make a text box and manually enter coordinates until you figure out which coordinates place the text over the desired bar”, which sounds sloppy

DC: Let me see if I can help out with that too! I might try geom_text, let’s give it a shot! I’ll specify x=candy_class and y=avg_cals_per_gram values + 0.1 (so that it dodges the bars). Then, the labels are rounded avg_cals_per_gram values.

# Bar chart of the per-class means with the rounded value printed just above
# each bar (the label is placed 0.1 above the bar's top).
p <- ggplot(
  data = aggregate_test,
  mapping = aes(x = candy_class, y = avg_cals_per_gram)
)
p +
  geom_bar(stat = "identity") +
  geom_text(
    data = aggregate_test,
    mapping = aes(
      x = candy_class,
      y = avg_cals_per_gram + 0.1,
      label = round(avg_cals_per_gram, digits = 2)
    )
  )

PLOT NUMBER TWO: x=primary ingredient, y=calories per gram, color=candy class

# Jittered scatterplot of calories per gram for each primary ingredient,
# coloured by candy class, with the y axis fixed to the range 0 to 6.
p <- ggplot(data, aes(x = primary_ingredient, y = cals_per_gram))
p +
  geom_jitter(mapping = aes(colour = class)) +
  labs(
    title = "Calories per Gram by Primary Ingredient",
    x = "Primary Ingredient",
    y = "Calories Per Gram"
  ) +
  ylim(0, 6)

PLOT NUMBER THREE: stacked bars showing the % by weight of macronutrient types (total fat, total carbs, total protein) in a serving, sorted by candy class

Make new columns in the dataframe “data” for “percent by weight of fat”, “percent by weight of carbs”, and “percent by weight of protein”

# Fat, carbohydrate and protein as a fraction of the serving weight. At this
# point the columns hold fractions, not yet percentages.
data$percent_fat_by_weight <- with(data, total_fat_g / serving_size_g)
data$percent_carbs_by_weight <- with(data, total_carb_g / serving_size_g)
data$percent_protein_by_weight <- with(data, protein_g / serving_size_g)

round each of them to three decimal places (a tenth of a percent), then multiply by 100 to get actual percentage values

# Round each fraction to three decimal places and scale it by 100, so the
# columns now hold percentages with one decimal place.
data$percent_fat_by_weight <- 100 * round(data$percent_fat_by_weight, digits = 3)
data$percent_carbs_by_weight <- 100 * round(data$percent_carbs_by_weight, digits = 3)
data$percent_protein_by_weight <- 100 * round(data$percent_protein_by_weight, digits = 3)

that’s good, but too many bars to plot on a single chart, so now I need to get the average percent macronutrient values for each candy class

# Mean percentage by weight of each macronutrient within every candy class.
mean_fat_percent <- with(data, tapply(percent_fat_by_weight, class, mean))
mean_carbs_percent <- with(data, tapply(percent_carbs_by_weight, class, mean))
mean_protein_percent <- with(data, tapply(percent_protein_by_weight, class, mean))

Check to see if these look reasonable

# Display the three per-class mean vectors; the `##` lines are the recorded output.
print(mean_fat_percent)
##     chocolate         gummi    jelly_bean     liquorice peanut_butter 
##     22.038462      0.000000      0.000000      1.883333     28.828571 
##          sour         sugar 
##      1.085714      0.760000
print(mean_carbs_percent)
##     chocolate         gummi    jelly_bean     liquorice peanut_butter 
##      68.48846      84.02222      84.50000      78.26667      58.41429 
##          sour         sugar 
##      85.00000      89.29333
print(mean_protein_percent)
##     chocolate         gummi    jelly_bean     liquorice peanut_butter 
##      5.142308      3.022222      0.000000      2.083333      9.814286 
##          sour         sugar 
##      1.428571      1.433333

Now I have vectors listing the mean macronutrient percentages for each candy class. Combine those three vectors into a single dataframe

# Combine the three per-class mean vectors into one data frame. The arguments
# are passed unnamed.
# NOTE(review): presumably this yields one row per candy class with columns
# named after the three vectors, by analogy with the single-vector case
# printed earlier in this document - confirm by printing the result.
mean_macronutrient_percents <- data.frame(mean_fat_percent, mean_carbs_percent, mean_protein_percent)

Again, I have the problem where the column listing class doesn’t have a title

# Recorded attempt to aggregate all three percentage columns by class; the
# notes that follow in this document state that it did not work.
# Only the first column is written as data$...; percent_carbs_by_weight and
# percent_protein_by_weight are bare names, and no standalone objects with
# those names are created anywhere in the visible script.
# NOTE(review): a working form is probably
#   aggregate(cbind(percent_fat_by_weight, percent_carbs_by_weight,
#                   percent_protein_by_weight) ~ class, data = data, FUN = mean)
# - confirm before replacing, since the surrounding text discusses the failure.
macronutrient_aggregate = aggregate(data$percent_fat_by_weight,percent_carbs_by_weight,percent_protein_by_weight, list(data$class), mean)

The aggregate command didn’t work this time maybe because I’m trying to create a dataframe with more than two columns??? So I’ll just copy and paste the dataframe into Excel, add the desired column title, copy that into a text file(because apparently Excel files can’t upload into R properly), and upload the text file as a new dataframe.

Now I have a dataframe I can work with, so I’ll try to make a stacked bar chart from those values

# Recorded attempt at a stacked bar chart; the note that follows in this
# document says it does not work. aes() receives x, y and two further unnamed
# columns, and geom_bar() is called with its default stat.
# NOTE(review): presumably this hits the same stat_count()/y-aesthetic error
# quoted earlier in this document - confirm.
p <- ggplot(data = data, aes(x = class, y = percent_fat_by_weight, percent_carbs_by_weight, percent_protein_by_weight))
p + geom_bar()

That ^ doesn’t seem to be working, so I’ll try to make my chart from the percent ___ by weight values I added to the original dataframe “data”

# Second recorded attempt, built from the percent columns of `data`; the note
# that follows in this document says it also does not work. aes() receives
# x, y and two further unnamed columns, and geom_bar() is called with its
# default stat.
p <- ggplot(data = data, aes(x = class, y = percent_fat_by_weight, percent_carbs_by_weight, percent_protein_by_weight))
p + geom_bar()

That ^ also doesn’t work.

# Recorded base-graphics attempt; the note that follows says it did not work.
# NOTE(review): text_mean_macronutrient_percents is not created anywhere in
# the visible script - presumably it is the table re-imported from the text
# file mentioned in the notes; confirm where it is loaded.
barplot(text_mean_macronutrient_percents,  main = "Macronutrient Percents by Candy Class", xlab = "Candy Class", ylab = "Percent")

Nope^

DC: Let’s see what we can do to create this bar graph with stacked percentages

Let’s create a smaller dataset with just the class of candy and the percents by weights of fat, carbs, and protein.

# List the column names of `data` together with their positions.
print(names(data))
##  [1] "id"                        "name"                     
##  [3] "company"                   "class"                    
##  [5] "serving_size_g"            "calories"                 
##  [7] "calories_fat"              "total_fat_g"              
##  [9] "saturated_fat_g"           "cholesterol_mg"           
## [11] "sodium_mg"                 "total_carb_g"             
## [13] "dietary_fiber_g"           "sugars_g"                 
## [15] "protein_g"                 "primary_ingredient"       
## [17] "cals_per_gram"             "percent_fat_by_weight"    
## [19] "percent_carbs_by_weight"   "percent_protein_by_weight"
# Keep only the candy class and the three percent-by-weight columns. Columns
# are selected by name rather than by position (4 and 18:20 in the listing of
# names(data)), so the selection stays correct if columns are added to or
# reordered in `data`.
sub_data <- data[c(
  "class",
  "percent_fat_by_weight",
  "percent_carbs_by_weight",
  "percent_protein_by_weight"
)]

Next, let’s use a function from another package, “reshape2”, also created by Hadley Wickham (creator of ggplot2), which allows us to reformat our data with ease. What we want to do is put the percents by weight of fat, carbs, and protein into a single column and then create a new column that says what they are (fat, carb, or protein). The function is called “melt”, and it collapses your data except for the columns you specify with the argument “id.vars” (which R lets you abbreviate to “id”). Let’s try it!

# List the column names of the reduced data set.
print(names(sub_data))
## [1] "class"                     "percent_fat_by_weight"    
## [3] "percent_carbs_by_weight"   "percent_protein_by_weight"
library(reshape2)
# Reshape to long format: `class` is kept as the identifier column and the
# three percent columns are stacked into `variable` (which macronutrient) and
# `value` (its percentage). The argument is spelled out as `id.vars` rather
# than relying on partial matching of the abbreviation `id`.
melted_data <- melt(sub_data, id.vars = "class")

# List the columns of the long-format table, then preview its first six rows.
print(names(melted_data))
## [1] "class"    "variable" "value"
head(melted_data, n = 6L)
##       class              variable value
## 1 chocolate percent_fat_by_weight  20.0
## 2 liquorice percent_fat_by_weight   2.4
## 3     sugar percent_fat_by_weight   0.0
## 4     gummi percent_fat_by_weight   0.0
## 5      sour percent_fat_by_weight   0.0
## 6 chocolate percent_fat_by_weight  23.8

Great! Now let’s try to create that bar plot

# Stacked bar chart: one bar per candy class, segments filled by macronutrient,
# heights taken directly from `value`.
p <- ggplot(
  data = melted_data,
  mapping = aes(x = class, y = value, fill = variable)
)
p +
  geom_bar(stat = "identity")

The bars appear to be unequal in height because, I believe, we calculated the percents of fat, carbs, and protein for each candy type within each class of candy, and each bar stacks every candy in its class. If so, then let’s try to normalize to get “percents” for each class

# Same bars rescaled with position = "fill", so every class bar spans the full
# height and the segments show each macronutrient's share.
p <- ggplot(
  data = melted_data,
  mapping = aes(x = class, y = value, fill = variable)
)
p +
  geom_bar(stat = "identity", position = "fill")

Also, see position=“dodge”

# Same data with position = "dodge": the macronutrient bars are drawn side by
# side within each candy class instead of stacked.
p <- ggplot(
  data = melted_data,
  mapping = aes(x = class, y = value, fill = variable)
)
p +
  geom_bar(stat = "identity", position = "dodge")