Data Cleaning in R :: K-State CS Hugo Framework

Download the R Script for this workshop here:

Getting Started

# install tidyverse only if it is not already available, so that re-running
# this script does not download and re-install the package every time
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
# tidyverse loads in a handful of different libraries that will be helpful to us

# we will be using the msleep dataset from the "ggplot2" library (which is
# loaded with tidyverse). To learn more about this dataset, use...
?msleep

# let's get a look at that data
View(msleep) # view the whole dataset
str(msleep) # look at the structure of each variable
glimpse(msleep) # similar to str(), from the tidyverse library
summary(msleep) # get summary statistics for each variable

Select Data

# suppose we only need a handful of these variables
# start by listing the column names so we know what is available
names(msleep)

# keep the first 3 columns, every column whose name starts with "sleep", and
# every column whose name ends with "wt"
slp <- select(msleep, 1:3, starts_with("sleep"), ends_with("wt"))
names(slp)

# on reflection, leave out "sleep_rem" and "sleep_cycle" by re-selecting only
# the columns we still want
slp <- select(slp, 1:3, sleep_total, ends_with("wt"))
names(slp)

# to remove one or more columns instead, negate the selection
# (for a single column this would be select(msleep, !genus))
slp_not_genus_or_vore <- select(msleep, !c(genus, vore))
names(slp_not_genus_or_vore)

Changing Data Types

# let's look at our data types again
str(slp)
glimpse(slp)
# we have three variables, "name", "genus", and "vore", that are characters.
# Let's make those factors, or in other words, let's treat those as
# categorical variables. (Once "vore" is a factor, summary() also reports how
# many NAs it has, which the Missing Data section relies on.)
slp$name <- as.factor(slp$name)
slp$genus <- as.factor(slp$genus)
slp$vore <- as.factor(slp$vore)
# and let's look at them again
str(slp)
glimpse(slp)

# you can also turn numeric values into factors (valuable for a "participant"
# variable) or characters, and back into numeric values
slp$bodywt <- as.factor(slp$bodywt)
glimpse(slp)
# go through as.character() on the way back: calling as.numeric() directly on
# a factor returns the integer level codes, not the original numbers
slp$bodywt <- as.character(slp$bodywt)
glimpse(slp)
slp$bodywt <- as.numeric(slp$bodywt)
glimpse(slp)

Reorder Data

# we can change up the order of our columns if we want
slp <- slp %>%
  select(1:3, ends_with("wt"), sleep_total)
names(slp)

# and we can sort our data

# sorting from lowest to highest amount of sleep
slp <- slp %>%
  arrange(sleep_total)
View(slp)

# sorting from highest to lowest amount of sleep
# desc() states the intent explicitly and, unlike a leading minus sign, also
# works for non-numeric columns
slp <- slp %>%
  arrange(desc(sleep_total))
View(slp)

# sorting by name in alphabetical order
slp <- slp %>%
  arrange(name)
View(slp)

# if you want multiple columns dictating the sorting: rows are ordered by
# sleep_total first, and ties are then ordered by bodywt
slp <- slp %>%
  arrange(sleep_total, bodywt)
View(slp)

# we can also rename our columns (new name on the left, old name on the right)
slp <- slp %>%
  rename(diet = vore)
View(slp)

Missing Data

# our data has some missing values. We can see how many NAs there are using
# the summary() function
summary(slp)
# so there are 27 missing entries for brain weight, and 7 for diet (the
# column we renamed from "vore"). Note that summary() only lists an NA count
# for numeric and factor columns, not for character columns

# some functions will not work if there are NAs in a column, like mean()
mean(slp$brainwt)
# running this will give us "NA" as output. However, we can tell the mean()
# function to ignore our NA values
mean(slp$brainwt, na.rm = TRUE)

# we could get rid of all rows with any NAs like so
slp_no_NA <- slp %>%
  na.omit()
View(slp_no_NA)
# but chances are some NAs in our data are okay to keep. Instead, we just want
# to take out any rows that do not have brain weight values
slp <- slp %>%
  drop_na(brainwt)
View(slp)

# we still have NAs in our diet column. Let's change those to be "unknown"
# instead. To use replace_na with a new label, our variable type cannot be a
# factor, so let's make sure it is a character first (this changes nothing if
# it already is one)
slp$diet <- as.character(slp$diet)
slp <- slp %>%
  mutate(diet = replace_na(diet, "unknown"))
# and now we can make it a factor, with "unknown" as one of its levels
slp$diet <- as.factor(slp$diet)
View(slp)

Filter Data

# say we want to whittle down our data a bit more before we analyze it
# in this example, we will filter our dataset down to just herbivores,
# carnivores, and omnivores and we will filter down to those who sleep 8
# hours or more (living the dream)

# %in% tests membership in a set of values in one step; conditions separated
# by a comma inside filter() must all be true for a row to be kept
slp_filtered <- slp %>%
  filter(diet %in% c("carni", "herbi", "omni"), sleep_total >= 8)
View(slp_filtered)

# looks good! Let's make this "slp" going forward
slp <- slp_filtered

Recoding Values and Add Columns

# values can be recoded and renamed, and the new values can either overwrite
# the old ones or go into new columns
# here "carni" becomes "meat", "herbi" becomes "veggie", and "omni" becomes
# "both", written back into the same diet column

slp <- mutate(
  slp,
  diet = recode(diet, "carni" = "meat", "herbi" = "veggie", "omni" = "both")
)
View(slp)

# new columns can be derived from existing ones too: brain weight and body
# weight expressed in grams instead of kilograms
slp <- mutate(
  slp,
  brainwt_in_grams = brainwt * 1000,
  bodywt_in_grams = bodywt * 1000
)
View(slp)

Conditional Recoding

# variables can also be recoded according to a condition. Here, any creature
# with a brain heavier than 0.01 kg is labelled "big", and every other
# creature is labelled "small"
slp <- mutate(
  slp,
  brain_size = if_else(
    condition = brainwt > 0.01,
    true = "big",
    false = "small"
  )
)
View(slp)

Duplicates

# we can also check for duplicates in our data. Right now, there are not any.
# for a data frame, duplicated() returns one TRUE/FALSE per row: TRUE when the
# row is an exact repeat of an earlier row
duplicated(slp)

# we can also go looking more specifically for duplicated rows
# this is saying "give me the rows of slp that return
# TRUE for duplicated(slp), and all columns (the blank space after the comma)"
slp[duplicated(slp), ]
# right now, this is an empty dataset

# but, what if we add some duplicates...
# append copies of rows 33, 22, and 11 to the end of the data (this assumes
# slp has at least 33 rows). Appending with bind_rows() can never overwrite
# an existing row, whereas assigning to fixed row numbers could
slp <- bind_rows(slp, slp[c(33, 22, 11), ])

# now let's check for duplicates again
slp[duplicated(slp), ]
# there they are!

# and, let's get rid of them (sorry, duplicates)
# we can do this in two different ways

# we can grab the non-duplicated rows and assign that to a data frame
slp_dupefree <- slp[!duplicated(slp), ]
# and check for duplicates
slp_dupefree[duplicated(slp_dupefree), ]

# or we can grab the "distinct" rows, which keeps one copy of every unique
# row. For this, we will use "distinct(.keep_all = TRUE)", to keep all of the
# variables and only grab the distinct rows
slp_dupefree2 <- slp %>%
  distinct(.keep_all = TRUE)
# and check for duplicates
slp_dupefree2[duplicated(slp_dupefree2), ]

# good to go! Let's make this our main "slp" dataset again
slp <- slp_dupefree2

Summarize

# we can also make a new, smaller dataset based on a summary of our current
# dataset. The summarize() function makes a dataset with one row for each
# combination of the grouping variables you give it

# let's group by diet and brain size, and then make columns summarizing the
# sleep total
# .groups = "drop" returns an ungrouped result; without it the summary stays
# grouped by diet, which would silently affect any later operations on it
slp_summary <- slp %>%
  group_by(diet, brain_size) %>%
  summarize(
    mean_sleep = mean(sleep_total),
    sd_sleep = sd(sleep_total),
    .groups = "drop"
  )
View(slp_summary)
# note, we only have one entry for "big" brained veggie-only eaters, the rabbit
# so, no SD for the rabbit (sd() of a single value is NA).

Reshape

# we can also turn a long dataset into a wide one, and vice versa. Our current
# "slp" dataset is not too great for this because it has few categorical
# variables with many entries, so let's load in another example
# install gapminder only if it is not already available, so that re-running
# this script does not re-install the package every time
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(gapminder)
# this is a dataset with life expectancy, GDP, and population information for
# different countries.
# more info can be found...
?gapminder
# let's take a look
View(gapminder)

# let's grab country, year, and life expectancy
gm <- gapminder %>%
  select(country, year, lifeExp)
View(gm)
# this dataset can be made wider
wide_gm <- gm %>%
  pivot_wider(names_from = year, values_from = lifeExp)

View(wide_gm)
# now we have each row representing one country, and each column representing
# a different year

# but what if we have wide data that we want to turn into long data?
long_gm <- wide_gm %>%
  pivot_longer(
    # here, we select the columns we want to combine: every column except
    # country holds the values for one year
    cols = -country,
    names_to = "year",
    values_to = "lifeExp",
    # column names are text, so convert year back to a whole number to match
    # the long data we started from
    names_transform = list(year = as.integer)
  )

View(long_gm)
# and our data is long again!