## Section: 01-preparing-for-the-workshop.R

### Notice ###
#
# This is an automatically generated script based on the code chunks from the
# book for this workshop.
#
# It is minimally annotated to allow participants to provide their comments:
# a practice that we highly encourage.
#
# Note that the solutions to the challenges are also included in this script.
# When solving the challenges by yourself, attempt to not scroll and peek at
# the solutions.
#
# Happy coding!

## Section: 02-r-project.R

## Section: 03-writing-a-script.R

# This is a comment, not a command

## QCBS R Workshop
## Workshop 2 - Loading and manipulating data
## Author: Quebec Center for Biodiversity Science
## Date: Fall 2014
## R version 2.15.0

## Housekeeping ####

rm(list = ls()) # Clears R workspace
?rm
?ls

# The three assignments below are equivalent; they differ only in style.
A<-"Test"   # Put some data in workspace
A <- "Test" # Add some spaces to organize your data!
A = "Test"  # You can do this, but it does not mean you should

# Check objects in the workspace
ls()
# [1] "A"

A
# [1] "Test"

# Clean Workspace
rm(list = ls())

# The workspace is now empty, so the next line raises
# "Error: object 'A' not found". If the file is run with source(), execution
# stops here; run the script line by line instead.
A

# R is case sensitive: "a" and "A" are two different objects
a <- 10
A <- 5
a
A

rm(list = ls()) # Clears R workspace again

## Section: 04-load-explore-save.R

getwd() # This command shows the directory you are currently working in
dir()   # This command shows the content of that directory

# Creates an object called CO2 by loading data from a file called
# "co2_good.csv"
CO2 <- read.csv("co2_good.csv")

?read.csv # Use the question mark to pull up the help page for a command

# With header = FALSE the first row of the file is read as data, so the
# columns do not get their names from the file.
CO2 <- read.csv("co2_good.csv", header = FALSE)

# With header = TRUE the first row supplies the column names. This call is
# kept last so that CO2 has named columns (e.g. CO2$conc, CO2$uptake) for the
# rest of the script.
CO2 <- read.csv("co2_good.csv", header = TRUE)

# Generic indexing syntax. "mydata" and "Variable1" are placeholders for any
# data frame and any of its columns; they are not objects in this script.
# mydata[2, 3]     # extracts the content of row 2 / column 3
# mydata[1, ]      # extracts the content of the first row
# mydata[, 1][2]   # extracts the second element of the first column
# mydata$Variable1 # extracts a specific column by its name ("Variable1")

# First let's make a copy of the dataset to play with!
CO2copy <- CO2

# names() gives you the names of the variables present in the data frame
names(CO2copy)

# Changing from English to French names (make sure you have the same levels!)
names(CO2copy) <- c("Plante", "Categorie", "Traitement", "conc", "absortion")

# Let's create a unique id for our samples using the function paste()
# see ?paste and ?paste0
# Don't forget to use "" for strings
CO2copy$uniqueID <- paste0(
  CO2copy$Plante, "_", CO2copy$Categorie, "_", CO2copy$Traitement
)

# Observe the results
head(CO2copy$uniqueID)

# Let's standardize our variable "absortion" to relative values
# (each value divided by the column maximum)
CO2copy$absortionRel <- CO2copy$absortion / max(CO2copy$absortion)

# Observe the results
head(CO2copy$absortionRel)

# Let's keep working with our CO2copy data frame

# Subsetting by variable name
# Selects only "Plante" and "absortionRel" columns. (Don't forget the ","!)
CO2copy[, c("Plante", "absortionRel")]

# Subsetting by row
CO2copy[1:50, ] # Subset data frame from rows 1 to 50

# Subsetting by matching with a factor level
# Select observations matching only the nonchilled Traitement.
CO2copy[CO2copy$Traitement == "nonchilled", ]

# Subsetting according to a numeric condition
# Select observations with absortion higher than or equal to 20
CO2copy[CO2copy$absortion >= 20, ]

# Conditions can be complementary - the & (and) operator
CO2copy[CO2copy$Traitement == "nonchilled" & CO2copy$absortion >= 20, ]

# We are done playing with the dataset copy. Let's erase it.
rm(CO2copy)

summary(CO2) # Get summary statistics of your dataset

# Calculate mean and standard deviation of the concentration, and assign them
# to new variables
meanConc <- mean(CO2$conc)
sdConc <- sd(CO2$conc)

# print() prints any given value to the R console
print(paste("the mean of concentration is:", meanConc))
print(paste("the standard deviation of concentration is:", sdConc))

# Let's plot a histogram to explore the distribution of "uptake"
hist(CO2$uptake)

# Increasing the number of bins to observe better the pattern
hist(CO2$uptake, breaks = 40)

?apply

# Calculate mean of the two columns in the data frame that contain continuous
# data (MARGIN = 2 applies FUN over columns)
apply(CO2[, 4:5], MARGIN = 2, FUN = mean)

save.image(file = "co2_project_Data.RData") # Save workspace

rm(list = ls()) # Clears R workspace

load("co2_project_Data.RData") # Reload everything that was in your workspace

head(CO2) # Looking good! :)

# Save object CO2 to a file named co2_new.csv
write.csv(CO2, file = "co2_new.csv")

## Section: 05-repair.R

# Overwrite CO2 object with broken CO2 data
CO2 <- read.csv("co2_broken.csv")

head(CO2) # Looks messy
CO2       # Indeed!
# sep = "" tells read.csv() that fields are separated by white space
CO2 <- read.csv("co2_broken.csv", sep = "")
?read.csv

# The head() command allows you to see that the data has not been read in with
# the proper headings
head(CO2)

# By adding the skip argument into the read.csv function, R knows to skip the
# first two rows.
# stringsAsFactors = TRUE is set explicitly: it was the default before
# R 4.0.0, and the rest of this section (levels(), droplevels()) relies on
# text columns being read as factors.
CO2 <- read.csv(
  "co2_broken.csv",
  sep = "",
  skip = 2,
  stringsAsFactors = TRUE
)

head(CO2) # You can now see that the CO2 object has the appropriate headings

# The str() command shows you that both 'conc' and 'uptake' are labelled as
# factors
str(CO2)
class(CO2$conc)

# By looking at the unique values in these columns, you see that both columns
# contain "cannot_read_notes"
unique(CO2$conc)
unique(CO2$uptake)

?unique
?read.csv

# Declare the text placeholders as missing values so the columns are read as
# numbers
CO2 <- read.csv(
  "co2_broken.csv",
  sep = "",
  skip = 2,
  na.strings = c("NA", "na", "cannot_read_notes"),
  stringsAsFactors = TRUE
)

head(CO2)

# You can see that conc variable is now an integer and the uptake variable is
# now treated as numeric
str(CO2)

# You can see that 4 levels are listed for Treatment
str(CO2)
levels(CO2$Treatment)

# The 4 different treatments are "nonchilled", "nnchilled", "chilled", and
# "chiled"
unique(CO2$Treatment)

# You can use which() to find rows with the typo "nnchilled"
which(CO2$Treatment == "nnchilled") # Row number ten

# You can then correct the error using indexing:
CO2$Treatment[10] <- "nonchilled"

# Alternatively, doing it with a single command:
CO2$Treatment[which(CO2$Treatment == "nnchilled")] <- "nonchilled"

# Now doing the same for "chiled":
CO2$Treatment[which(CO2$Treatment == "chiled")] <- "chilled"

str(CO2)              # Structure still identifies 4 levels of the factor
unique(CO2$Treatment) # But, unique says that only two are used

# This command drops the unused levels from all factors in the data frame
CO2 <- droplevels(CO2)

str(CO2) # Fixed!
## Section: 06-tidyverse.R

# Lines starting with "# " directly below a command show the console output
# printed in the workshop book. The exact print format (and any random values)
# may differ with your package and R versions.

if (!require(tidyr)) {
  install.packages("tidyr")
}
library(tidyr)

wide <- data.frame(
  Species = c("Oak", "Elm", "Ash"),
  DBH = c(12, 20, 13),
  Height = c(56, 85, 55)
)
wide
#   Species DBH Height
# 1     Oak  12     56
# 2     Elm  20     85
# 3     Ash  13     55

?pivot_longer

# Gathering columns into rows
long <- pivot_longer(
  data = wide,
  cols = c("DBH", "Height"),
  names_to = "dimension",
  values_to = "cm"
)
long
#   Species dimension cm
# 1     Oak       DBH 12
# 2     Oak    Height 56
# 3     Elm       DBH 20
# 4     Elm    Height 85
# 5     Ash       DBH 13
# 6     Ash    Height 55

# names_to / values_to must be passed by name: unnamed extra arguments to
# pivot_longer() are rejected.
CO2.long <- pivot_longer(
  CO2,
  cols = c("conc", "uptake"),
  names_to = "response",
  values_to = "value"
)
head(CO2)
head(CO2.long)
tail(CO2.long)

# Spreading rows into columns
wide2 <- pivot_wider(
  data = long,
  names_from = "dimension",
  values_from = "cm"
)
wide2
#   Species DBH Height
# 1     Oak  12     56
# 2     Elm  20     85
# 3     Ash  13     55

# Tibbles: columns can be atomic vectors or lists
tibble(x = 1:3, y = c("a", "b", "c"))
tibble(x = 1:3, y = list(1:5, 1:10, 1:20))

set.seed(8)
messy <- data.frame(
  id = 1:4,
  trt = sample(rep(c("control", "farm"), each = 2)),
  zooplankton.T1 = runif(4),
  fish.T1 = runif(4),
  zooplankton.T2 = runif(4),
  fish.T2 = runif(4)
)
messy

messy.long <- pivot_longer(
  messy,
  names_to = "taxa",
  cols = c("zooplankton.T1", "fish.T1", "zooplankton.T2", "fish.T2")
)
head(messy.long)
#   id  trt           taxa value
# 1  1 farm zooplankton.T1 0.719
# 2  1 farm        fish.T1 0.644
# 3  1 farm zooplankton.T2 0.545
# 4  1 farm        fish.T2 0.264
# 5  2 farm zooplankton.T1 0.291
# 6  2 farm        fish.T1 0.457

# Split "taxa" at the literal dot into a species column and a time column
messy.long.sep <- separate(
  messy.long,
  taxa,
  into = c("species", "time"),
  sep = "\\."
)
head(messy.long.sep)
#   id  trt     species time value
# 1  1 farm zooplankton   T1 0.719
# 2  1 farm        fish   T1 0.644
# 3  1 farm zooplankton   T2 0.545
# 4  1 farm        fish   T2 0.264
# 5  2 farm zooplankton   T1 0.291
# 6  2 farm        fish   T1 0.457

?airquality
data(airquality)

# Use gather() to convert the dataset to long format
# Note that the syntax used here indicates we wish to gather ALL the columns
# except "Month" and "Day"
# (gather() and spread() are superseded in tidyr by pivot_longer() and
# pivot_wider(); the pivot_wider() equivalent is shown further down.)
air.long <- gather(airquality, variable, value, -Month, -Day)
head(air.long)

# Then, use spread() to convert the dataset back to wide format
air.wide <- spread(air.long, variable, value)
head(air.wide)

# The same reshaping with pivot_wider()
air.wide <- pivot_wider(
  air.long,
  values_from = "value",
  names_from = "variable"
)
head(air.wide)

if (!require(dplyr)) {
  install.packages("dplyr")
}
library(dplyr)

# Generic select() syntax. "data", "column1", "column2" and "x." are
# placeholders, not objects in this script.
# select(data, column1, column2)    # select columns 1 and 2
# select(data, c(2:4, 6))           # select columns 2 to 4 and 6
# select(data, -column1)            # select all columns except column 1
# select(data, starts_with("x."))   # select all columns that start with "x."

head(airquality)
#   Ozone Solar.R Wind Temp Month Day
# 1    41     190  7.4   67     5   1
# 2    36     118  8.0   72     5   2
# 3    12     149 12.6   74     5   3
# 4    18     313 11.5   62     5   4
# 5    NA      NA 14.3   56     5   5
# 6    28      NA 14.9   66     5   6

ozone <- select(airquality, Ozone, Month, Day)
head(ozone)
#   Ozone Month Day
# 1    41     5   1
# 2    36     5   2
# 3    12     5   3
# 4    18     5   4
# 5    NA     5   5
# 6    28     5   6

august <- filter(airquality, Month == 8, Temp >= 90)
head(august)
#   Ozone Solar.R Wind Temp Month Day
# 1    89     229 10.3   90     8   8
# 2   110     207  8.0   90     8   9
# 3    NA     222  8.6   92     8  10
# 4    76     203  9.7   97     8  28
# 5   118     225  2.3   94     8  29
# 6    84     237  6.3   96     8  30

# Shuffle the rows (a random sample of 100% of the rows); the rows you see
# will differ from the example output below
air_mess <- sample_frac(airquality, 1)
head(air_mess)
#     Ozone Solar.R Wind Temp Month Day
# 21      1       8  9.7   59     5  21
# 42     NA     259 10.9   93     6  11
# 151    14     191 14.3   75     9  28
# 108    22      71 10.3   77     8  16
# 8      19      99 13.8   59     5   8
# 104    44     192 11.5   86     8  12

# Sort by Month first, then by Day within each month
air_chron <- arrange(air_mess, Month, Day)
head(air_chron)
#   Ozone Solar.R Wind Temp Month Day
# 1    41     190  7.4   67     5   1
# 2    36     118  8.0   72     5   2
# 3    12     149 12.6   74     5   3
# 4    18     313 11.5   62     5   4
# 5    NA      NA 14.3   56     5   5
# 6    28      NA 14.9   66     5   6

# The order of the sorting columns matters: here Day is sorted first
arrange(air_mess, Day, Month)

# Add a column with the temperature converted from Fahrenheit to Celsius
airquality_C <- mutate(airquality, Temp_C = (Temp - 32) * (5 / 9))
head(airquality_C)
#   Ozone Solar.R Wind Temp Month Day   Temp_C
# 1    41     190  7.4   67     5   1 19.44444
# 2    36     118  8.0   72     5   2 22.22222
# 3    12     149 12.6   74     5   3 23.33333
# 4    18     313 11.5   62     5   4 16.66667
# 5    NA      NA 14.3   56     5   5 13.33333
# 6    28      NA 14.9   66     5   6 18.88889

if (!require(magrittr)) {
  install.packages("magrittr")
}
library(magrittr)

# Nested function calls ...
june_C <- mutate(
  filter(airquality, Month == 6),
  Temp_C = (Temp - 32) * (5 / 9)
)

# ... and the same operation written with the pipe
june_C <- airquality %>%
  filter(Month == 6) %>%
  mutate(Temp_C = (Temp - 32) * (5 / 9))

month_sum <- airquality %>%
  group_by(Month) %>%
  summarise(
    mean_temp = mean(Temp),
    sd_temp = sd(Temp)
  )
month_sum
# Source: local data frame [5 x 3]
#
#   Month mean_temp  sd_temp
#   (int)     (dbl)    (dbl)
# 1     5  65.54839 6.854870
# 2     6  79.10000 6.598589
# 3     7  83.90323 4.315513
# 4     8  83.96774 6.585256
# 5     9  76.90000 8.355671

?ChickWeight
data(ChickWeight)

# Use group_by() to divide the dataset by "Chick"
# Use summarise() to calculate the weight gain within each group
weight_diff <- ChickWeight %>%
  group_by(Chick) %>%
  summarise(weight_diff = max(weight) - min(weight))
weight_diff
# Source: local data frame [50 x 2]
#
#     Chick weight_diff
#    (fctr)       (dbl)
# 1      18           4
# 2      16          16
# 3      15          27
# 4      13          55
# 5       9          58
# 6      20          76
# 7      10          83
# 8       8          92
# 9      17         100
# 10     19         114
# ..    ...         ...

chick_18 <- ChickWeight %>%
  filter(Chick == 18)
chick_18
#   weight Time Chick Diet
# 1     39    0    18    1
# 2     35    2    18    1

# First compute the gain of each chick (last minus first weight), then
# average those gains within each diet
diet_summ <- ChickWeight %>%
  group_by(Diet, Chick) %>%
  summarise(weight_gain = last(weight) - first(weight)) %>%
  group_by(Diet) %>%
  summarise(mean_gain = mean(weight_gain))
diet_summ
# # A tibble: 4 × 2
#   Diet  mean_gain
# 1 1         114.9
# 2 2         174.0
# 3 3         229.5
# 4 4         188.3

## Section: 07-final-considerations.R

## Section: 08-references.R