# This is a comment. The '#' character and anything after it is ignored. # Read in dataset. 'header=TRUE' means the 1st line contains variable names geyser <- read.table("faithful.txt",header=TRUE) # See what kind of object geyser is. class(geyser) # What are its dimensions? dim(geyser) # We could also get the row and column dimensions seperately. nrow(geyser) ncol(geyser) # What are the names of the variables? names(geyser) # Writing out 'geyser' is all the time is cumbersome... Copy it to 'd' d <- geyser # List of waiting time between eruptions d$waiting # Show a histogram hist(d$waiting) # Find out more about the 'xxx' function using 'help(xxx)', # where 'xxx' is 'hist' for example # Make the labels look better hist(d$waiting,main="Histogram of Waiting Time Between Eruptions",xlab="Waiting Time (minutes)") # Show a histogram of duration of eruptions hist(d$waiting,main="Histogram of Duration of Eruptions",xlab="Duration (minutes)") # See what objects are available ls() # The 'system' function executes a command as if at the shell prompt # See what files are in the current working directory. system("ls") # How are duration and waiting correlated? cor(d$eruptions,d$waiting) plot(d$eruptions,d$waiting) # This plot is better labeled plot(d$eruptions,d$waiting,ylab="Duration of Eruptions (minutes)",xlab="Waiting Time Between Eruptions (minutes)",main="Old Faithful Data",pch=c(16,17)[1*(d$eruptions>3)+1]) # Do a simple linear regression fm <- lm( waiting ~ eruptions, data=d) summary(fm) anova(fm) # Look inside the fitted model. All of these things can be extracted from the object. str(fm) str(anova(fm)) # Get the mean squared error. anova(fm)$"Mean Sq"[2] # Get the OLS estimates of the regression coefficients fm$coef # Put least squares fit on the plot abline(fm,col="red",lty=2,lwd=3) # Save to plot to a file postscript("faithful.ps",width=5.5,height=5) plot(d$eruptions,d$waiting,ylab="Duration of Eruptions (minutes)",xlab="Waiting Time Between Eruptions (minutes)",main="Old Faithful Data",pch=c(16,17)[1*(d$eruptions>3)+1]) abline(fm,col="red",lty=2,lwd=3) dev.off() # Convert it for inclusion by 'pdflatex' # First, convert to 'encapsulated postscript', thus reducing the 'bounding box' system("ps2epsi faithful.ps") # Second, convert it to a PDF file system("epstopdf faithful.epsi") # Remove the encapsulated postscript and postscript files system("rm faithful.epsi faithful.ps") # Get pdf file directly pdf("faithful.pdf",width=5.5,height=5) plot(d$eruptions,d$waiting,ylab="Duration of Eruptions (minutes)",xlab="Waiting Time Between Eruptions (minutes)",main="Old Faithful Data",pch=c(16,17)[1*(d$eruptions>3)+1]) abline(fm,col="red",lty=2,lwd=3) dev.off() # 'd$' before each variable is not necessary when using the 'attach' function waiting # Notice the last command gave an error. Now use the 'attach' function attach(d) waiting # No error this time! # There are many build in functions... sd(d$eruptions) var(d$waiting) # But, sometimes you need to do things "by hand" using vector operations sd.by.hand <- sqrt( sum( (d$eruptions-mean(d$eruptions))^2 ) / ( length(d$eruptions) - 1 ) ) sd.by.hand # Writing code like you would in a low-level language (such as Fortan, C, C++, etc.) should be avoided x <- 0 x2 <- 0 n <- length(d$eruptions) for ( i in 1:n ) { x <- x + d$eruptions[i] x2 <- x2 + d$eruptions[i]^2 } sd.bad.way <- sqrt( ( x2 - x^2/n ) / ( n - 1 ) ) sd.bad.way