########################################################
########################################################
################## Introduction to R ###################
##################### Stat 345 #########################
########################################################
########################################################

#### R as a calculator ####
1 + 1
2 * 2
50 + 2 * 4
(50 + 2) * 4
10^2
exp(1)

#### Vectors ####
x <- c(1, 2, 3) # the c is short for combine
x
y <- 1:3
y
z <- 1:50
z

# Sequences #
seq(from = 1, to = 10)
1:10 # another way to do it
# length.out tells us how many values are generated in this sequence
seq(from = 1, to = 10, length.out = 10)
# by generates a sequence that starts at your desired value, increases by the
# desired amount and stops before it would go past the "to" value
seq(from = 1, to = 100, by = 20)
# by is the default 3rd input, so length.out needs to be named explicitly
seq(1, 100, 20)
seq(1, 100, length.out = 20)

# Replicates #
rep(7, 3)
rep(c(1, 2, 3), 3) # replicating vectors can be very useful for generating data

# Normal Random Variable #
rnorm(n = 10, mean = 0, sd = 1)

# Character Vectors #
x <- "Jared"
x
# Without the quotation marks R looks for an object called Jared and errors.
# try() prints that error but lets the rest of the script keep running.
try(y <- Jared)
y # still holds the earlier value because the assignment above failed
# Note that you cannot have spaces in the vector name
The_Office <- c("Michael", "Jim", "Dwight", "Pam", "Angela")
# Note that names are also case sensitive: The_office does not exist, so this
# errors (again wrapped in try() so the script can continue)
try(The_office)
The_Office

# Vector calculator #
x <- 1:5
y <- rep(2, 5)
z <- 1:4
a <- c(0, 0, 0, 0, 0)
x + 1
x + y
x * y
# this performs matrix multiplication; for two vectors of equal length the
# result is their inner product, returned as a 1 x 1 matrix
x %*% y
# because z is not the same length as a, z is recycled (repeated from its
# start) during the addition; R warns since 5 is not a multiple of 4
a + z

# Making a Matrix #
x <- 1:5
y <- 6:10
z <- c(x, y)
z
z <- cbind(x, y) # creates matrix using the vectors as its columns
z
z <- rbind(x, y) # creates matrix using the vectors as its rows
z

#### Functions ####
data <- 1:10
mean(data) # returns the arithmetic average of a vector
var(data) # returns the variance of a vector
sd(data) # returns the standard deviation of a vector
sum(data) # returns the sum of the vector
prod(data) # returns the product of the vector
summary(data) # gives the five number summary plus the mean

#### Indexing and Logic ####
x <- seq(1, 10, by = 2)
x
x[2] # finds the value in the 2nd position
x[c(2, 4)] # finds the values in the 2nd and 4th positions
x[-c(2, 4)] # finds the values that are not in the 2nd and 4th positions

# Matrix Indexing #
# notice it fills the matrix column by column
z <- matrix(data = 1:16, nrow = 4, ncol = 4)
z
t(z) # transposes a matrix
z[1, 1]
z[1, 2]
z[2, 1]
z[, 1] # leaving the row value blank returns all the values in column 1
z[1, ] # leaving the column value blank returns all the values in row 1

# Operators and Boolean Operators #
x <- seq(1, 10, by = 2)
x
x[x > 3]
x[x < 0]
x[x == 3]
x[x <= 3]
x[x >= 3]
x[x != 3] # the ! is used to mean "not"
x <- seq(1, 10)
x[x > 5 & x <= 7] # The "&" works as an "and".
x[x > 5 | x <= 7] # The "|" works as an "or"
# This sets a seed from which a random sample is generated. VERY USEFUL when
# you're generating data for a simulation and want to keep generating the
# same sample.
set.seed(123)
x <- rnorm(100, 0, 1)
x[1:10] # See the first 10 values of the vector
x > 2 # not helpful! (prints 100 TRUE/FALSE values)
# x here is the vector of 100 normal draws created earlier in the script.
which(x > 2) # this output is a vector of positions so we can use it as such
x[which(x > 2)]
x[x > 2] # also works for searching for specific values

#### NA's ####
x <- c(1, 2, 3, 4, NA)
sum(x) # the calculation cannot be performed, so the result is NA
sum(x, na.rm = TRUE) # na.rm = TRUE removes the NA before summing

#### Data Sets ####
cars[1:10, ] # examines the first 10 rows in the dataset
head(cars) # works as well, but shows the first 6 rows by default
cars$speed[1:10] # the "$" extracts vectors out of the dataset
cars$dist[1:10]
cor(cars$speed, cars$dist, method = "pearson")
# confint() needs a fitted model, not a bare vector. An intercept-only linear
# model estimates the mean of speed, so this gives a 95% confidence interval
# for the mean speed.
confint(lm(speed ~ 1, data = cars), level = 0.95)

rock <- 14
rap <- 19
country <- 5
pop <- 12
musicdataset1 <- data.frame(rock, rap, country, pop)
musicdataset1

## OR ##
rock <- rep("rock", 14)
rap <- rep("rap", 19)
country <- rep("country", 5)
pop <- rep("pop", 12)
musicdataset2 <- data.frame("music_type" = c(rock, rap, country, pop))
# drop = FALSE keeps the result a data frame; with a single column the rows
# would otherwise be collapsed into a plain character vector
musicdataset2[c(1:3, 20:22, 34:37, 39:41), , drop = FALSE]

#### Plotting ####

# Scatterplots #
plot(cars$speed, cars$dist)
# note the "" around the labels.
plot(cars$speed, cars$dist,
  xlab = "Speed", ylab = "Distance", main = "Cars Scatterplot"
)
plot(cars$speed, cars$dist,
  xlab = "Speed", ylab = "Distance", main = "Cars Scatterplot", pch = 16
)

# Boxplots #
head(iris)
boxplot(iris$Sepal.Length)
# this "~" allows us to examine the Sepal length across the different species.
boxplot(iris$Sepal.Length ~ iris$Species,
  xlab = "Species", ylab = "Sepal Length", main = "Iris Dataset Boxplots"
)
# Same species boxplot as before, now with one fill colour per species.
boxplot(iris$Sepal.Length ~ iris$Species,
  xlab = "Species", ylab = "Sepal Length", main = "Iris Dataset Boxplots",
  col = c("red", "blue", "green")
)

# Histograms #
variable1 <- rnorm(100, mean = 0, sd = 1)
variable2 <- rnorm(1000, mean = 2, sd = 1)
variable3 <- rnorm(100000, mean = 4, sd = 1)

par(mfrow = c(1, 3)) # 1 row by 3 columns
hist(variable1, main = "n = 100")
hist(variable2, main = "n = 1000")
hist(variable3, main = "n = 100000")

par(mfrow = c(1, 3))
hist(variable3, main = "Default # of Breaks")
hist(variable3, breaks = 20, main = "More Breaks")
hist(variable3, breaks = 100, main = "Lots of Breaks")

par(mfrow = c(1, 1)) # back to a single plot per page
# freq = FALSE puts density on the y axis instead of the frequency, so the
# bars have total area 1 and can be compared with a density curve
hist(variable1, main = "Overlay of histogram and actual curve", freq = FALSE)
# gives the x values of the function I want to overlay
x <- seq(-4, 4, length.out = 10000)
# The "d" in "dnorm" means the density; dnorm(x, ...) gives the y values of
# the function. With small enough increments of x we can draw a nice curve
# over the histogram.
lines(x, dnorm(x, mean = 0, sd = 1), col = "black")
##