######################################################################
# R commands      STAT588/BIOL588
# Lab 2  2023     University of South Carolina
######################################################################
# This file contains the R commands for the lab.
#
# Lines beginning with the symbol '#' are comments in R.  All other
# lines contain code.
#
# In R for Windows, you may wish to open this file from the menu bar
# (File:Display file); you can then copy commands into the command
# window.  (Use the mouse to highlight one or more lines; then
# right-click and select "Paste to console".)
######################################################################


################
### Data types 
################

# A bare number is stored as a double, so its class is "numeric".
a <- 1
class(a)
# Functions are objects as well: `ls` has class "function".
class(ls)


##############################
# Creating simple vectors
##############################

# Build vectors with c(); all elements of a vector share one type.
x <- c(1, 3.5, -28.4, 10) # numeric vector
x
y <- c("cat", "dog", "mouse", "monkey") # character vector
z <- c(TRUE, TRUE, TRUE, FALSE, FALSE) # logical vector

# Regular sequences and repetition.
x <- 1:10
seq(1, 10)
seq(3, 9, by = 3)
rep(2, 10)
rep(c(1, 2, 3), 5)
log(seq(1, 2, by = 0.1))

# Missing values: NA propagates through sum() unless na.rm = TRUE is given.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
x <- c(1, 5, 10, NA, 15)
sum(x)
sum(x, na.rm = TRUE)
prod(x, na.rm = TRUE)
mean(x, na.rm = TRUE)

# Cumulative summaries.
x <- 1:10
cumsum(x) ## running sum
cummax(x) ## running maximum
cummin(x) ## running minimum
######################
###################################
# Accessing Elements in a Vector
###################################
# Subsetting a vector by position, by exclusion, and by logical condition.
y <- c(18, 32, 15, -7, 12, 19)
length(y)
y[3:5]           ## positive integers keep the listed positions
y[-c(1, 5, 6)]   ## negative integers drop the listed positions
y < 15           ## elementwise comparison gives a logical vector
y[y < 15]        ## a logical vector keeps the TRUE positions
which(y == 32)   ## position(s) where the condition holds

# match() gives, for each element of y, its first position in x (NA if absent).
y <- seq(1, 10, by = 2)
x <- seq(2, 15)
match(y, x)

# which() returns an empty integer vector when nothing matches.
colors <- c("red", "blue", "pink")
which(colors == "yellow")

# Locating missing and non-missing entries.
x <- c(1, 5, 10, NA, 15)
which(is.na(x))
which(!is.na(x))


##############################
# Factors
# vector with categories
##############################
# Turn numeric codes into a factor; `labels` names the sorted codes 1, 2, 3.
# The argument name is written in full instead of relying on partial
# matching of `label`.
colors <- c(1, 1, 2, 3)
colors <- factor(colors, labels = c("red", "green", "blue"))
table(colors) ## counts per category


##############################
# Matrices
##############################
help(cbind)
y <- c(8, 32, 15, -7, 2, 19)
x <- 1:6
mat <- cbind(x, y) ## bind the two vectors as columns: 6 rows, 2 columns
help(rbind)
dim(mat)  ## check dimension
ncol(mat) ## the number of columns of a matrix
nrow(mat) ## the number of rows of a matrix
# mat has only two columns, so mat[2, 3] would stop with
# "subscript out of bounds"; index an existing column instead.
mat[2, 2]  ## the value in the 2nd row and the 2nd column
mat[1:3, ] ## the first three rows of mat
mat[, 2]   ## the 2nd column of mat
mat[-1, ]  ## exclude the first row
newmat <- matrix(1:9, nrow = 3) ## create new matrix (filled column by column)
newmat
rowMeans(newmat)
colMeans(newmat)
m <- matrix(1:9, nrow = 3, byrow = TRUE) ## fill row first
colnames(m) <- c("a", "b", "c")
rownames(m) <- c("r1", "r2", "r3")
vect <- as.vector(newmat) ## flatten the matrix back to a vector

##############################
# Matrix Multiplication
##############################
# `^` works element by element; `%*%` is the matrix product.
mat <- matrix(1:9, nrow = 3)
mat^2
mat %*% mat

##############################
# Arrays
##############################

# A 4 x 4 x 4 array holding 1..64; the first index varies fastest.
myarray <- array(1:64, dim = c(4, 4, 4))
myarray
myarray[1, 2, 3] ## row 1, column 2 of the third 4 x 4 slice


##############################
# Data Frames
##############################
# Simulate 10 subjects: a muscle measurement, sex (6 "M" then 4 "F"), and a
# speed whose mean depends on muscle with a different intercept per group.
muscle <- rnorm(n = 10, mean = 3, sd = 1)
sex <- factor(rep(c("M", "F"), c(6, 4)))
speed <- numeric(10)
speed[1:6] <- rnorm(6, 30 - 2 * muscle[1:6], 2)    ## first six subjects
speed[7:10] <- rnorm(4, 40 - 2 * muscle[7:10], 2)  ## last four subjects

# Collect the variables in a data frame and inspect it.
mydata <- data.frame(y = speed, x1 = muscle, x2 = sex)
summary(mydata)
str(mydata)

# Linear model of speed on muscle and sex.
temp <- lm(y ~ x1 + x2, data = mydata)
summary(temp)

##############################
# Lists
##############################
# A list can hold components of different types and shapes.
# Logical constants are written as TRUE/FALSE because T and F can be reassigned.
x <- list(
  one = c(18:36),
  two = c("AK", "AL", "AZ"),
  three = c(TRUE, TRUE, FALSE, TRUE),
  four = matrix(1:12, 3, 4)
)
x
x[[1]][3:6] ## component by position, then elements 3 to 6
x$one[3:6]  ## the same component by name
y <- unlist(x) ## flatten to one vector; everything is coerced to character
str(y)

############################
# Operator     Description
############################
# >            greater than
# >=           greater than or equal to
# <            less than
# <=           less than or equal to
# ==           equal to
# !=           not equal to
# &            and
# |            or

#################################

##############################
## &, |
##############################
# Truth table for the elementwise operators & (and) and | (or).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
x <- c(TRUE, TRUE, FALSE, FALSE)
y <- c(TRUE, FALSE, TRUE, FALSE)
mat <- cbind(x, y)
mat
and <- x & y ## TRUE only where both are TRUE
or <- x | y  ## TRUE where at least one is TRUE
and
or


###################
# order
####################
# order() returns the positions that would sort x; indexing x with them
# yields the sorted vector. `decreasing = TRUE` sorts from largest to smallest.
x <- c(1, 3, 6, 2, -1, -2)
o <- order(x, decreasing = TRUE)
o
orderedx <- x[o]
orderedx


############################# 
# Reading and writing data 
#############################
# Show the current working directory, then move to the lab folder.
getwd()
# The path below is specific to one machine; change it to your own folder.
# The guard keeps the script running on machines where that folder is absent
# (files are then written to the current working directory instead).
lab_dir <- "/Users/yen-yiho/Desktop/STAT588"
if (dir.exists(lab_dir)) {
  setwd(lab_dir)
}
url <- "https://people.stat.sc.edu/hoyen/STAT588/Data/ALLpheno.csv"

# Read the CSV from the web, then store it both as .RData and as a CSV file.
ALLpheno <- read.csv(file = url, header = TRUE)
save(ALLpheno, file = "mydata.RData")
write.csv(ALLpheno, file = "ALLpheno.csv", row.names = FALSE)

# Remove the object and restore it from the saved .RData file.
rm(ALLpheno)
# Calling str(ALLpheno) at this point would stop the script with
# "object 'ALLpheno' not found"; exists() checks for the object without
# raising an error (expected result here: FALSE).
exists("ALLpheno")
load("mydata.RData")
str(ALLpheno)


############## 
# Indexing 
##############
# ALL dataset 
# molecular abnormality
# The column name is written in full (`mol.biol`, as used with %in% further
# down in this script) instead of relying on `$` partial matching of `mol.bio`.
table(ALLpheno$mol.biol)
index1 <- ALLpheno$mol.biol == "BCR/ABL" ## logical vector, one entry per row
str(index1)
sum(index1) ## number of TRUE entries = number of BCR/ABL rows
mut1 <- ALLpheno[index1, ]
head(mut1)

#### BCR/ABL or ALL1/AF4

index2 <- ALLpheno$mol.biol == "BCR/ABL" | ALLpheno$mol.biol == "ALL1/AF4"
mut2 <- ALLpheno[index2, ]
str(mut2)

###### BCR/ABL and female 


#### indexing functions 
#### which, match, %in% 
#### which
# which() converts a logical condition into integer row positions.
# `mol.biol` is written in full rather than relying on `$` partial matching.
imut1 <- which(ALLpheno$mol.biol == "BCR/ABL")
str(imut1)
mut1 <- ALLpheno[imut1, ]

#### match
# For each id, match() returns its first position in ALLpheno$cod (NA if absent).
id <- c("1005", "16002")
iid <- match(id, ALLpheno$cod)
iid
ALLpheno[iid, ]

##### %in%
# %in% tests membership in a set; the set is defined once in `pattern` and
# reused instead of repeating the literal values.
pattern <- c("BCR/ABL", "ALL1/AF4")
imut3 <- which(ALLpheno$mol.biol %in% pattern)
mut3 <- ALLpheno[imut3, ]
dim(mut3)
str(mut3)

##### BCR/ABL and B cell
table(ALLpheno$BT)
# grep() returns the positions of the BT entries that match the regular
# expression; "^B" matches entries that start with "B". Argument names are
# written in full rather than relying on partial matching of `patter`.
iBcell <- grep(pattern = "^B", x = ALLpheno$BT)
# Rows that are both B-cell and BCR/ABL (imut1 holds the BCR/ABL row positions).
iBmut1 <- intersect(iBcell, imut1)
length(iBmut1)
############ What kind of mutations from T-cell leukemia
iTcell <- grep(pattern = "^T", x = ALLpheno$BT)
length(iTcell)
Tcell <- ALLpheno[iTcell, ]
table(Tcell$mol.biol)

################
# Indexing  exercise 
#################


getwd() ## change it using setwd()
# Read the tab-delimited FMS data from the course site. HTTPS is used here,
# consistent with the ALLpheno download from the same host.
fmsURL <- "https://people.stat.sc.edu/hoyen/STAT588/Data/FMS_data.txt"
fms <- read.delim(file = fmsURL, header = TRUE, sep = "\t")
colnames(fms)
dim(fms)          ## check the dimension of the data
str(fms[, 1:10])  ## check the structure of the first 10 columns
fms$id[1:10]      ## first 10 ids
fms[1, 1:10]      ## first row, first 10 columns
fms$pre.BMI
fms$actn3_rs540874

################
# Indexing  exercise 
#################

#### Exercise 1: identify the gene "actn3_rs540874" and the pre.BMI variable in the fms data
#### Exercise 2: create a smaller data set with only id, the actn3_rs540874 gene, and pre.BMI
#### Exercise 3: remove any rows with NA values in these three columns


# Exercises 1-2: locate the three requested columns by name. `id` is added
# after the two original columns so their positions in `dat` stay the same.
index <- match(c("actn3_rs540874", "pre.BMI", "id"), colnames(fms))
# Exercise 3: keep only the rows with no NA in any of the three columns.
dat <- fms[, index]
dat <- dat[complete.cases(dat), ] ## observations without NA
# attach() is kept so that bare column names remain available to any later
# code, but the column is referenced explicitly here so that the result does
# not depend on the search path.
attach(fms)
mean(fms$pre.BMI, na.rm = TRUE)