# Download the R Script for this workshop here:
# Setup ----
# Install tidyverse only when it is missing, so that re-running this script
# does not re-download and re-install the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
# tidyverse loads in a handful of different libraries that will be helpful to us
# We will be using the msleep dataset from the "ggplot2" library (which is
# loaded with tidyverse). To learn more about this dataset, use...
?msleep

# First look at the data ----
View(msleep)    # open the whole dataset in the data viewer
str(msleep)     # look at the structure (type) of each variable
glimpse(msleep) # similar to str(), from the tidyverse library
summary(msleep) # get summary statistics for each variable
# Selecting columns ----
# Suppose we only need a handful of the variables. Start with a reminder of
# which variables exist.
names(msleep)

# Keep the first 3 columns, every column whose name starts with "sleep", and
# every column whose name ends with "wt".
slp <- select(msleep, 1:3, starts_with("sleep"), ends_with("wt"))
names(slp)

# On second thought, drop "sleep_rem" and "sleep_cycle": keep only sleep_total
# from the sleep columns.
slp <- select(slp, 1:3, sleep_total, ends_with("wt"))
names(slp)

# Removing columns instead of keeping them: negate the selection with `!`.
# For a single column this would be: select(msleep, !genus)
slp_not_genus_or_vore <- select(msleep, !c(genus, vore))
names(slp_not_genus_or_vore)
# Converting variable types ----
# Check the current data types.
str(slp)
glimpse(slp)

# "name" and "genus" are character columns. Turn them into factors, i.e.
# treat them as categorical variables.
slp <- slp %>%
  mutate(across(c(name, genus), as.factor))

# Check the types again.
str(slp)
glimpse(slp)

# Numeric values can also be turned into factors (valuable for a
# "participant" variable) or characters, and then back into numeric values.
slp <- slp %>%
  mutate(bodywt = as.factor(bodywt))
glimpse(slp)

slp <- slp %>%
  mutate(bodywt = as.character(bodywt))
glimpse(slp)

slp <- slp %>%
  mutate(bodywt = as.numeric(bodywt))
glimpse(slp)
# Reordering columns ----
# select() also controls column order: the columns come out in the order
# they are listed.
slp <- slp %>%
  select(1:3, ends_with("wt"), sleep_total)
names(slp)

# Sorting rows ----
# Lowest to highest amount of sleep.
slp <- slp %>%
  arrange(sleep_total)
View(slp)

# Highest to lowest amount of sleep. desc() is used instead of negating the
# column because it also works for non-numeric columns (e.g. characters).
slp <- slp %>%
  arrange(desc(sleep_total))
View(slp)

# By name, in alphabetical order.
slp <- slp %>%
  arrange(name)
View(slp)

# By several columns: sort on sleep_total first, then break ties on bodywt.
slp <- slp %>%
  arrange(sleep_total, bodywt)
View(slp)

# Renaming columns ----
# rename() takes new_name = old_name.
slp <- slp %>%
  rename(diet = vore)
View(slp)
# Missing values ----
# Our data has some missing values. summary() reports the number of NAs for
# numeric columns (such as brainwt), but NOT for character columns (such as
# diet), so we also count the NAs in every column explicitly.
summary(slp)
colSums(is.na(slp))
# so there are 7 missing entries for diet and 27 missing for brain weight

# Some functions will not work if there are NAs in a column, like mean().
# Running this gives "NA" as output.
mean(slp$brainwt)
# However, we can tell mean() to ignore the NA values.
mean(slp$brainwt, na.rm = TRUE)

# We could get rid of every row that has an NA in any column like so.
slp_no_NA <- slp %>%
  na.omit()
View(slp_no_NA)

# But chances are some NAs in our data are okay to keep. Instead, we only
# take out the rows that do not have a brain weight value.
slp <- slp %>%
  drop_na(brainwt)
View(slp)

# We still have NAs in our diet column. Let's change those to be "unknown"
# instead. replace_na() cannot insert a new value into a factor, so the
# column must be a character vector first. diet has not been converted to a
# factor earlier in this script, so as.character() changes nothing here; it
# is kept as a guard in case the column was turned into a factor beforehand.
slp$diet <- as.character(slp$diet)
slp <- slp %>%
  mutate(diet = replace_na(diet, "unknown"))
# Now that there are no NAs left, make diet a factor (categorical variable).
slp$diet <- as.factor(slp$diet)
View(slp)
# Filtering rows ----
# Whittle the data down a bit more before analysis: keep only herbivores,
# carnivores, and omnivores, and of those only the animals that sleep 8 hours
# or more (living the dream).
keep_diets <- c("carni", "herbi", "omni")
slp_filtered <- slp %>%
  filter(diet %in% keep_diets, sleep_total >= 8)
View(slp_filtered)
# Looks good! Make this "slp" going forward.
slp <- slp_filtered

# Recoding values ----
# Values can be recoded and renamed, and the new values can either overwrite
# the old ones or go into new columns.
# Here "carni" becomes "meat", "herbi" becomes "veggie", and "omni" becomes
# "both", saved back into the same diet column.
slp <- slp %>%
  mutate(
    diet = recode(
      diet,
      "carni" = "meat",
      "herbi" = "veggie",
      "omni" = "both"
    )
  )
View(slp)

# New columns ----
# Brain weight and body weight in grams instead of kilograms.
slp <- slp %>%
  mutate(
    brainwt_in_grams = brainwt * 1000,
    bodywt_in_grams = bodywt * 1000
  )
View(slp)

# Conditional recoding ----
# A new column can depend on whether a condition is met. Here any creature
# with a brain heavier than 0.01 kg is considered big-brained.
slp <- slp %>%
  mutate(brain_size = if_else(brainwt > 0.01, "big", "small"))
View(slp)
# Duplicates ----
# We can also check for duplicates in our data. Right now, there are not any.
# For a data frame, duplicated() returns one TRUE/FALSE per row: TRUE when the
# row is identical to an earlier row.
duplicated(slp)
# We can also pull out the duplicated rows themselves. This says "give me the
# rows of slp for which duplicated(slp) is TRUE, and all columns (the blank
# space after the comma)".
slp[duplicated(slp), ]
# Right now, this is an empty dataset.

# But, what if we add some duplicates...
# NOTE: the row numbers below assume slp has exactly 33 rows at this point,
# so that rows 34-36 are new rows appended at the end.
slp[34, ] <- slp[33, ] # row 34 (a new row) is equal to row 33
slp[35, ] <- slp[22, ] # row 35 (a new row) is equal to row 22
slp[36, ] <- slp[11, ] # row 36 (a new row) is equal to row 11
# Now let's check for duplicates again.
slp[duplicated(slp), ]
# There they are!

# And, let's get rid of them (sorry, duplicates). There are two ways.
# Option 1: keep the non-duplicated rows and assign them to a data frame.
slp_dupefree <- slp[!duplicated(slp), ]
# and check for duplicates
slp_dupefree[duplicated(slp_dupefree), ]

# Option 2: keep the "distinct" rows, so each unique row is kept once.
# .keep_all = TRUE keeps all of the variables in the result.
slp_dupefree2 <- slp %>%
  distinct(.keep_all = TRUE)
# and check for duplicates
slp_dupefree2[duplicated(slp_dupefree2), ]

# Good to go! Let's make this our main "slp" dataset again.
slp <- slp_dupefree2
# Summarizing ----
# We can also make a new, smaller dataset based on a summary of our current
# dataset. summarize() makes a dataset with one row for each combination of
# the grouping variables you give it.
# Group by diet and brain size, then make columns summarizing the sleep total.
# .groups = "drop" returns an ungrouped result; without it the output would
# stay grouped by diet (and summarize() would print a message saying so).
slp_summary <- slp %>%
  group_by(diet, brain_size) %>%
  summarize(
    mean_sleep = mean(sleep_total),
    sd_sleep = sd(sleep_total),
    .groups = "drop"
  )
View(slp_summary)
# Note: a group with a single entry has no standard deviation, so sd_sleep is
# NA for it. The workshop notes say this is the case for the "big" brained
# veggie-only eaters, whose only entry is the rabbit.
# Long and wide data ----
# We can also turn a long dataset into a wide one, and vice versa. Our current
# "slp" dataset is not too great for this because it has few categorical
# variables with many entries, so let's load in another example.
# Install gapminder only when it is missing, so that re-running this script
# does not re-install it every time.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(gapminder)
# This is a dataset with life expectancy, GDP, and population information for
# different countries. More info can be found...
?gapminder
# let's take a look
View(gapminder)

# Grab country, year, and life expectancy.
gm <- gapminder %>%
  select(country, year, lifeExp)
View(gm)

# Long -> wide: one column per year, filled with the life expectancy values.
wide_gm <- gm %>%
  pivot_wider(names_from = year, values_from = lifeExp)
View(wide_gm)
# Now each row represents one country, and each column a different year.

# Wide -> long: what if we have wide data that we want to turn into long data?
long_gm <- wide_gm %>%
  pivot_longer(
    # Combine every column except country (the year columns). Selecting by
    # name rather than by position (2:13) keeps this correct if the number of
    # year columns changes.
    !country,
    names_to = "year",
    # Column names are text, so convert year back to integer to match the
    # type it has in the original long data.
    names_transform = list(year = as.integer),
    values_to = "lifeExp"
  )
View(long_gm)
# and our data is long again!