Intro to Data Analysis in R

There are two downloads for this workshop. Use the links below to download the dataset and R script used:

If you do not already have R and R Studio installed, you can download them both here

Use the site below to navigate the R code we will be using in chunks. After the workshop, this site will be converted into multiple pages, each covering a different topic in an offline-workshop format.

Setup

    setwd("C:/Users/Kari/OneDrive - Kansas State University/NI Core")
    data = read.csv("ratCoffeeData.csv", stringsAsFactors = T)
    attach(data)

Check out the data

    View(data)
    
    # Simple plots
    hist(buttonpresses)
    plot(coffee)
    plot(sex)
    hist(pelletsconsumed)

Descriptive statistics

    mean(buttonpresses)  
    #mean(buttonpresses, na.rm = T) # If there is missing data, na.rm = T will remove it
    sd(buttonpresses)
    #sd(buttonpresses, na.rm = T)
    summary(buttonpresses)
    #summary(buttonpresses,  na.rm = T)

Correlations

    install.packages("ggplot2") # install ggplot2 if you do not already have it
    library(ggplot2) # load in the ggplot2 library for plotting
    
    # Simple plot of raw data points
    ggplot(aes(x = pelletsconsumed, y = buttonpresses), data = data) + geom_point()
    
    # Different types of correlations
    cor(buttonpresses, pelletsconsumed) # note: <use = "complete.obs"> for NAs
    cor(buttonpresses, pelletsconsumed, method='pearson') # identical to above
    cor(buttonpresses, pelletsconsumed, method='spearman') # rank-ordered correlation and nonlinear correlations
    
    # Plot of raw data with a line through it to visualize the trend
    ggplot(aes(x = pelletsconsumed, y = buttonpresses), data = data) + geom_point() + geom_smooth(method=lm, se=TRUE)

t-tests

    # Visualize the two means we want to compare
    ggplot(aes(x = sex, y = buttonpresses), data = data) + geom_bar(stat = "summary", color = "purple", fill = "purple")+
      geom_errorbar(stat = "summary", width=0.25) 
    
    # Look at the individual means for Males and Females
    males <- data [which(sex == "Male"),]
    summary(males$buttonpresses)
    females <- data [which(sex == "Female"),]
    summary(females$buttonpresses)
    
    # Run t-test
    t.test(buttonpresses~ sex)

ANOVA

    # Look at the outcome data. Does it look normal?
    hist(data$buttonpresses)
    
    # Run a normality test (these can be misleading if you have a lot of data -- eyeballing it is okay!)
    shapiro.test(data$buttonpresses)
    
    # Visualize the groups we want to compare
    ggplot(aes(x = coffee, y = buttonpresses), data = data) + geom_bar(stat = "summary", color = "purple", fill = "purple")+
      geom_errorbar(stat = "summary", width=0.25)
    
    # Run the ANOVA and save it to an object
    anova1 <- aov(buttonpresses ~ coffee)
    
    # Get a summary of that anova object
    summary(anova1)
    
    # Post-hoc tests (can use different adjustments)
    pairwise.t.test(buttonpresses, coffee, p.adj = "none") # aka 'LSD' adjustment
    pairwise.t.test(buttonpresses, coffee, p.adj = "bonf")
    pairwise.t.test(buttonpresses, coffee, p.adj = "fdr") 
    
    # or just default to Tukey
    TukeyHSD(anova1)

Linear Regression

    # Run regression and save it to an object
    regression <- lm(buttonpresses ~ coffee)
    
    # Get a summary of the regression object
    summary(regression)
    
    # Get an ANOVA table for the regression output
    anova(regression)
    
    # Test a more complex model, adding in sex as a predictor
    regression2 <- lm(buttonpresses ~ coffee + sex)
    
    # Summary of more complex model
    summary(regression2)
    
    # Test an even more complex model, 
    # introducing an interaction between coffee and sex
    regression3 <- lm(buttonpresses ~ coffee * sex)
    
    # Summary of most complex model
    summary(regression3)
    
    # model comparisons
    anova(regression, regression2, regression3)
    # any significant differences? Should there be? Additional predictors useful or no?
    
    # Get specific result values for regression 3
    summary(regression3)$fstatistic
    summary(regression3)$r.squared
    summary(regression3)$adj.r.squared
    summary(regression3)$sigma # Residual Standard Error
    
    # Post-hoc tests
    library(emmeans)
    emmeans(regression3, ~coffee*sex) # all interactions
    emmeans(regression3, ~coffee|sex) # coffee means grouped by sex
    emmeans(regression3, ~sex|coffee) # sex means grouped by coffee
    
    emmeans(regression3, pairwise ~coffee, adjust = "tukey") # compare coffee means
    emmeans(regression3, pairwise ~coffee*sex, adjust = "tukey") # compare coffee and sex combination means
    
    # Note: if you compare ALL combinations, the Tukey adjustment might be too strong.
    emmeans(regression3, pairwise ~coffee|sex, adjust = "tukey") # compare coffee means grouped by sex
    ```
    emmeans(regression3, pairwise ~sex|coffee, adjust = "tukey") # compare sex means grouped by coffee

Visualizing results from linear regression

    # Get the estimated marginal means in a dataframe of its own for plotting
    toplot <- as.data.frame(emmeans(regression3, ~coffee*sex))
    
    # Check out the first couple lines of this dataframe
    head(toplot)
    
    # Create the plot
    ggplot(data = toplot, aes(x = coffee, y = emmean, col = sex, fill = sex)) + geom_bar(stat = "identity") + facet_wrap(~sex) +
      geom_errorbar(aes(ymin = emmean - SE, ymax = emmean + SE), col = "black", width=0.25) + ylab("Button Presses") + xlab("Coffee")

Here are some handy resources to check out!

t-tests - Datacamp Note: this article is free, but we also get free Datacamp access through the Psych department - email me at karipayne@ksu.edu if you would like access!
Correlations in R - STHDA
One-way ANOVA in R - STHDA
Linear Regressions in R - Datacamp
Multilevel Modeling - R Bloggers