###################################################################### # R commands STAT588/BIOL588 # Lab 3 2023 University of South Carolina ###################################################################### # This file contains the R commands for the lab. # # Lines beginning with the symbol '#' are comments in R. All other # lines contain code. # # In R for Windows, you may wish to open this file from the menu bar # (File:Display file); you can then copy commands into the command # window. (Use the mouse to highlight one or more lines; then # right-click and select "Paste to console".) ###################################################################### ################## # data wrangling ################## library(dplyr) library(dslabs) library(matrixStats) library(tidyverse) url<-"https://people.stat.sc.edu/hoyen/STAT588/Data/ALLpheno.csv" ALLpheno<-read.csv(file=url) d1<-as.Date(ALLpheno$diagnosis, "%m/%d/%y") # mutate dim(ALLpheno) pheno<-mutate(ALLpheno, crtime=difftime(as.Date(ALLpheno$date.cr, format="%m/%d/%Y"), as.Date(ALLpheno$diagnosis, format="%m/%d/%Y"), units="days")) sum(!is.na(pheno$crtime)) pheno<-mutate(pheno, crtime=as.numeric(crtime)) %>% filter(!is.na(crtime) & crtime >=0) dim(pheno) str(pheno) plot(density(pheno$crtime)) newdata<-select(pheno, cod, sex, age, CR, relapse, crtime) str(newdata) ############################## # For loops ############################## for (i in 1:10){ d<-Sys.time() print(paste("Now is", d, sep=" ")) print(i*i) } ############################## # for loops & ifelse ############################## for (i in 1:10){ if (i > 5){ print(i) } } ## if .. else x<-rnorm(10, mean=5, sd=1) for (i in x){ if (i < 5){ msg<-paste(i, " is equal or less than 5", sep="") print(msg) } else { msg<-paste(i, " is larger than 5", sep="") print(msg) } } ### ifelse: ALL data example library(ALL) data(ALL) emat<-exprs(ALL) str(emat) emat[1:5, 1:5] colnames(emat) rownames(emat) emat<-as.data.frame(emat) newid<-ifelse(nchar(newdata$cod) <5, paste0("0", newdata$cod), newdata$cod) ### ################################################ ## Exercise 1 ## Identity the gene expression measurement for participants in the newid we just created use mutate or filter or select ################################################ ############################## # Generate random number ############################## ## from uniform distribution x<-runif(100) hist(x) index<-1:length(x) plot(index, x, type="l") abline(h=0.5, lty=2) ## from normal distribution y<-rnorm(100) plot(density(y)) ########################################################### # Functions ########################################################### # # function_name <- function(parameters){ # function body # } ############################################################ x<-rnorm(100) mysd<-function(x){ ans<-sqrt(var(x)) return(ans) } mysd(x) sd(x) myfun<-function(x){ ans<-x+3 return(ans) } myfun(5) myfun2<-function(x,y){ ans<-x+y return(ans) } myfun2(3,7) qsol<-function(a, b, c){ ans<-c((-b+sqrt(b^2-4*a*c))/(2*a), (-b-sqrt(b^2-4*a*c))/(2*a)) return(ans) } qsol(1,1,-1) ############################## # apply, sapply ############################## m<-matrix(1:20, nrow=10, ncol=2) ## create a matrix of 10 rows x 2 columns #### ans<-rep(NA, length=10) for(i in 1:10){ ans[i]<-mean(m[i,]) } ans ## apply(m, 1, mean) ## mean of the rows: rowMeans apply(m,2, mean) ## mean of the columns: colMeans x<-1:10 sapply(x, myfun) x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE)) sapply(x, quantile) ################################################################# # ALLexpr data ################################################################## # Exercise: identify genes with mean expression level less than 5 # Is mean expression level less than 5 very extreme? ### plot the density of the mean expression level from all genes ################################################################ ########################### # Exercise: strsplit, sapply ########################### myid<-paste("ID-", round(runif(100),2)*100, sep="") myid ## Use strsplit and sapply to create a numerical vector that contains only the numerical part of myid object (for exampe: ID-89 -> 89) ########################### # plotting ########################### library(MASS) data("Animals") head(Animals,7) help(Animals) plot(Animals$body, Animals$brain, xlab="Body Weight", ylab="Brain Weight", main="Plot of Body Wt. to Brain Wt.", pch=16) ############################# ###### Exercise ############################## #1. What is the animal that weighs more than 8000 kg #2. create a dataset with animals weigh less than 2000 kg ############################################ pdf("/Users/yen-yiho/Desktop/animal.pdf") plot(animal$body, animal$brain, xlab="Body Weight", ylab="Brain Weight", main="Plot of Body Wt. to Brain Wt.", pch=16) dev.off() ############################ # ggplot 2 ############################# library(ggplot2) url2<-"https://people.stat.sc.edu/hoyen/STAT588/Data/meta.csv" meta<-read.csv(file=url2) str(meta) # We will get an blank plot, because we need to specify additional layers using the + operator. ggplot(meta) ####################################################################################### # The geom (geometric) object is the layer that specifies what kind of plot we want to draw. # A plot must have at least one geom; there is no upper limit. Examples include: # # 1. points (geom_point, geom_jitter for scatter plots, dot plots, etc) # 2. lines (geom_line, for time series, trend lines, etc) # 3. boxplot (geom_boxplot for box plots) ####################################################################################### ### We will get an error, asking for aesthetics using the aes() function ggplot(meta) + geom_point() ######################################################################################## ## The aes() function has many different arguments, ## and all of those arguments take columns from the original data frame as input. ## It can be used to specify many plot elements including the following: # 1.position (i.e., on the x and y axes) # 2. color (“outside” color) # 3. fill (“inside” color) # 4. shape (of points) # 5. linetype # 6. size ######################################################################################## ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans)) ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans, color = genotype)) ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans, color = genotype, shape=celltype)) ################################################################################ #### To change the size of the points to a specfic value (instead of a column in the dataset), we use the size argument outside of the aes function ################################################################################ ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans, color = genotype, shape=celltype), size=6) ################################################################################### # The theme layer # The ggplot2 theme system handle non-data plot elements such as # 1. Axis label aesthetics # 2. plot background # 3. Facet label background # 4. Legend appearance # Built-in themes: themem_bw(), #################################################################################### ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans, color = genotype, shape=celltype), size=3.0) + theme_bw() # change the size of axes titles to 1.5 time the default size ggplot(meta) + geom_point(aes(x = age_in_days, y= samplemeans, color = genotype, shape=celltype), size=2.25) + theme_bw() + theme(axis.title = element_text(size=rel(1.5))) ########### example("geom_point") ################## # End of lab3.R ##################