########################################################
########################################################
##################Introduction to R ####################
##################### Stat 345 #########################
########################################################
########################################################

#### R as a calculator ####
# Arithmetic typed at the prompt is evaluated and its result printed.
1 + 1
2 * 2
50 + 2 * 4 # multiplication is done before addition
(50 + 2) * 4 # parentheses force the addition to happen first
10^2 # ^ raises to a power
exp(1) # e raised to the power 1

#### Vectors ####
# c() ("combine") joins individual values into a single vector.
x <- c(1, 2, 3)
x
# The colon operator builds a run of consecutive integers.
y <- 1:3
y
z <- 1:50 # a longer run, built the same way
z

# Sequences #
seq(from = 1, to = 10)
1:10 # shorthand for the same sequence
# length.out sets how many evenly spaced values are generated.
seq(from = 1, to = 10, length.out = 10)
# by sets the step size: the sequence starts at "from" and ends at the last
# value that does not go past "to" (here 1, 21, 41, 61, 81).
seq(from = 1, to = 100, by = 20)
# by is the third positional argument, so this is the same call as above.
seq(1, 100, 20)
# length.out therefore has to be named. It is written in full here rather
# than relying on partial matching of the abbreviation "length".
seq(1, 100, length.out = 20)

# Replicates #
rep(7, times = 3)
# Repeating a whole vector is a handy way to generate data.
rep(c(1, 2, 3), times = 3)

# Normal Random Variable #
# Ten random draws from a normal distribution with mean 0 and sd 1.
rnorm(n = 10, mean = 0, sd = 1)

# Character Vectors #
x <- "Jared" # the quotation marks make this a character string
x
# Deliberate error: without the quotation marks R looks for an object named
# Jared, which does not exist. try() still shows the error message but lets
# the rest of the script run when the file is sourced.
try(y <- Jared)
y <- "Jared" # with the quotation marks the assignment succeeds
y
# Note that you cannot have spaces in the vector name, hence the underscore.
The_Office <- c("Michael", "Jim", "Dwight", "Pam", "Angela")
# Deliberate error: names are case sensitive, so The_office (lower-case o)
# is a different object, and it has not been defined.
try(The_office)
The_Office

# Vector calculator #
x <- 1:5
y <- rep(2, times = 5)
z <- 1:4
a <- numeric(5) # five zeros
x + 1 # the single value is added to every element
x + y # element-by-element sum
x * y # element-by-element product
x %*% y # this performs matrix multiplication
# z is shorter than a, so R recycles z from its start to fill the gap. It
# also warns, because 5 is not a multiple of 4.
a + z

# Making a Matrix #
x <- 1:5
y <- 6:10
z <- c(x, y) # c() joins them end to end: still a vector, not a matrix
z
z <- cbind(x, y) # binds the vectors side by side as the columns of a matrix
z
z <- rbind(x, y) # stacks the vectors as the rows of a matrix
z

#### Functions ####
# Built-in functions take a vector and return a summary of it.
data <- 1:10
mean(data) # arithmetic average
var(data) # variance
sd(data) # standard deviation
sum(data) # sum of the elements
prod(data) # product of the elements
summary(data) # minimum, quartiles, median, mean and maximum

#### Indexing and Logic ####
x <- seq(from = 1, to = 10, by = 2)
x
x[2] # the value in position 2
x[c(2, 4)] # the values in positions 2 and 4
x[-c(2, 4)] # a negative index drops positions 2 and 4 and keeps the rest

# Matrix Indexing #
# matrix() fills the values in column by column.
z <- matrix(data = 1:16, nrow = 4, ncol = 4)
z
t(z) # transposes the matrix
z[1, 1] # row 1, column 1
z[1, 2] # row 1, column 2
z[2, 1] # row 2, column 1
z[, 1] # leaving the row index blank returns all the values in column 1
z[1, ] # leaving the column index blank returns all the values in row 1

# Operators and Boolean Operators #
# A comparison inside [ ] keeps only the elements for which it is TRUE.
x <- seq(from = 1, to = 10, by = 2)
x
x[x > 3] # greater than
x[x < 0] # less than; no element qualifies, so the result is empty
x[x == 3] # equal to
x[x <= 3] # less than or equal to
x[x >= 3] # greater than or equal to
x[x != 3] # the ! is used to mean "not"

x <- seq(from = 1, to = 10)
x[x > 5 & x <= 7] # & works as "and": both conditions must hold
# | works as "or": one condition is enough. Every value here meets at least
# one of the two conditions, so all of x is returned.
x[x > 5 | x <= 7]

# Setting a seed fixes the starting point of the random number generator.
# VERY USEFUL when generating data for a simulation, because re-running the
# code then produces the same sample.
set.seed(123)
x <- rnorm(n = 100, mean = 0, sd = 1)
x[1:10] # see the first 10 values of the vector
x > 2 # one TRUE/FALSE per element: not helpful on its own!
which(x > 2) # the positions of the TRUE values, returned as a vector
x[which(x > 2)] # so it can be used as an index
x[x > 2] # indexing with the logical vector directly returns the same values

#### NA's ####
x <- c(1, 2, 3, 4, NA)
sum(x) # returns NA: the missing value makes the total unknown
# na.rm = TRUE removes the NA before summing. TRUE is written in full because
# T is an ordinary variable that can be reassigned.
sum(x, na.rm = TRUE)

#### Data Sets ####
cars[1:10, ] # examines the first 10 rows of the dataset
head(cars) # works as well (shows the first 6 rows)
cars$speed[1:10] # the "$" extracts a column out of the dataset as a vector
cars$dist[1:10]
cor(cars$speed, cars$dist, method = "pearson")
# confint() needs a fitted model; given a plain numeric vector it stops with
# an error. An intercept-only regression estimates the mean of speed, so this
# gives the 95% confidence interval for the mean speed.
confint(lm(speed ~ 1, data = cars), level = 0.95)

# Option 1: store the count for each music type, one column per type.
rock <- 14
rap <- 19
country <- 5
pop <- 12
musicdataset1 <- data.frame(rock, rap, country, pop)
musicdataset1
## OR ##

# Option 2: store one row per observation, repeating each label as many
# times as it was counted.
rock <- rep("rock", times = 14)
rap <- rep("rap", times = 19)
country <- rep("country", times = 5)
pop <- rep("pop", times = 12)
musicdataset2 <- data.frame(music_type = c(rock, rap, country, pop))
# Look at a few rows from each part of the dataset.
musicdataset2[c(1:3, 20:22, 34:37, 39:41), ]

#### Plotting ###
# Scatterplots #
# The first argument goes on the x-axis, the second on the y-axis.
plot(cars$speed, cars$dist)
# Axis labels and the title are text, so note the "" around them.
plot(
  cars$speed, cars$dist,
  xlab = "Speed", ylab = "Distance", main = "Cars Scatterplot"
)
# pch chooses the plotting symbol.
plot(
  cars$speed, cars$dist,
  xlab = "Speed", ylab = "Distance", main = "Cars Scatterplot",
  pch = 16
)

# Boxplots #
head(iris)
boxplot(iris$Sepal.Length)
# The "~" draws one box of Sepal length for each of the different species.
boxplot(
  iris$Sepal.Length ~ iris$Species,
  xlab = "Species", ylab = "Sepal Length", main = "Iris Dataset Boxplots"
)
# col takes one colour per box.
boxplot(
  iris$Sepal.Length ~ iris$Species,
  xlab = "Species", ylab = "Sepal Length", main = "Iris Dataset Boxplots",
  col = c("red", "blue", "green")
)

# Histograms #
# Three normal samples of increasing size, each with a different mean.
variable1 <- rnorm(100, mean = 0, sd = 1)
variable2 <- rnorm(1000, mean = 2, sd = 1)
variable3 <- rnorm(100000, mean = 4, sd = 1)
par(mfrow = c(1, 3)) # lay the plots out in 1 row by 3 columns
hist(variable1, main = "n = 100")
hist(variable2, main = "n = 1000")
hist(variable3, main = "n = 100000")

# The same sample drawn with different numbers of bins.
par(mfrow = c(1, 3))
hist(variable3, main = "Default # of Breaks")
hist(variable3, breaks = 20, main = "More Breaks")
hist(variable3, breaks = 100, main = "Lots of Breaks")

par(mfrow = c(1, 1)) # back to a single plot
# freq = FALSE puts density on the y-axis instead of the frequency, so the
# histogram is on the same scale as the density curve drawn over it. FALSE is
# written in full because F is an ordinary variable that can be reassigned.
hist(variable1, main = "Overlay of histogram and actual curve", freq = FALSE)
# The x values of the function to overlay. length.out is written in full
# rather than relying on partial matching of "length".
x <- seq(-4, 4, length.out = 10000)
# The "d" in "dnorm" means the density: this gives the y values of the
# function. With small enough increments of x the line segments draw a nice
# curve over the histogram.
lines(x, dnorm(x, mean = 0, sd = 1), col = "black")
##