# Rcode for the class # Chapter 1 # Requires MASS library("MASS") ############################ # Section 1.2 # Picture for projections onto Euclidean space # PROJECTIONS ON EUCLIDEAN SPACE y = c(0,2) x = c(0,1) y1 = c(1/4) x1 = c(1/2) y2 = c(1/sqrt(5)) x2 = c(2/sqrt(5)) y3 = c(0,3*sqrt(4/5)/2) x3 = c(0,3*sqrt(4/5)) y4 = c(4/5) x4 = c(8/5) plot(x,y, type='o', lwd=4, main="",xlab="",ylab="", ylim=c(0,2.25),xlim =c(0,2.25), col="blue") points(x1,y1,lwd=3,col="blue") lines(c(0,x1),c(0,y1),lwd=2,col="blue") points(x2,y2,lwd=4,col="orange") lines(x3,y3,lwd=3,col="orange",lty=2) points(x4,y4,lwd=4,col="orange") lines(c(1,x4),c(2,y4),col="red",lty=2,lwd = 3) grid(lwd=3) ########################### # Section 1.2 Dust clouds ## IID Data Sigma = matrix(c(1,0,0,1),ncol=2) xtemp = mvrnorm(500,c(0,0),Sigma) # Plotting the bivariate data plot(xtemp, main="",xlab="X",ylab="Y", col="blue", lwd=4, ylim=c(-3,3),xlim =c(-3,3)) grid(lwd=3) # New basis E2 = matrix(c(1/sqrt(2),1/sqrt(2),1/sqrt(2),-1/sqrt(2)),ncol=2) transform2 = xtemp%*%E2 plot(transform2, main="",xlab="X",ylab="Y", col="blue", lwd=4, ylim=c(-3,3),xlim =c(-3,3)) grid(lwd=3) # stretched basis E2 = matrix(c(1/sqrt(2),1/sqrt(2),1/sqrt(2),-1/sqrt(2)),ncol=2) xtemp3 = cbind(3*xtemp[,1],xtemp[,2]) transform3 = xtemp3%*%E2 plot(transform3, main="",xlab="X",ylab="Y", col="blue", lwd=4)# ylim=c(-3,3),xlim =c(-3,3)) grid(lwd=3) ############################ # Section 1.3 # UNDERSTANDING COVARIANCE D=2 # Load the library MASS # This allows us to draw samples from # a multivariate normal distribution. library(MASS) # We will use the function mvrnorm # this requires us to specify a vector mean # and matrix variance. ############################# library("MASS") # Drawing # Correlated normal data # Here we define the variance matrix. Sigma = matrix(c(5,4,4,6),ncol = 2) # The vector mean in this example is (1,2) # Below we draw 200 times a bivariate normal # distribution with mean (1,2) and variance Sigma xtemp = mvrnorm(200,c(1,2),Sigma) # Plotting the bivariate data plot(xtemp, main="",xlab="X",ylab="Y", ylim=c(-6,10),xlim =c(-6,10), col="blue") ######################### # Drawing # Independent normal data Sigma = matrix(c(5,0,0,6),ncol = 2) xtemp = mvrnorm(200,c(1,2),Sigma) # Plotting the bivariate data plot(xtemp, main="",xlab="X",ylab="Y", ylim=c(-6,10),xlim =c(-6,10), col="blue") ############################ # Section 1.3 more dust clouds A = matrix(c(3,-1.5,-2,2),nrow=2) Sigma = matrix(c(1,0,0,1),ncol = 2) xtemp = t(mvrnorm(200,c(0,0),Sigma)) ytemp = A%*%xtemp + c(7,8) plot(xtemp[1,],xtemp[2,],pch=16,lwd=3,col="blue",xlab="X",ylab="Y",xlim=c(-5,20),ylim=c(-5,20),asp=1) points(ytemp[1,],ytemp[2,],col="red",lwd=3,pch=16) # Since the xtemp are iid the variance (X_{1},X_{2}) is A%*%t(A) > A%*%t(A) [,1] [,2] [1,] 13.0 -8.50 [2,] -8.5 6.25 # The eigenvectors are > eigen(A%*%t(A)) eigen() decomposition $values [1] 18.7705249 0.4794751 $vectors [,1] [,2] [1,] -0.8273551 -0.5616792 [2,] 0.5616792 -0.8273551 ############################ # Projections by demeaning Dimension 2 # Suppose that (X_{1},X_{2}) are iid with mean 5 and variance one. library("MASS") Sigma2 = matrix(c(1,0,0,1),ncol = 2) xtemp2 = t(mvrnorm(200,c(5,5),Sigma2)) ytemp2 = xtemp2 n = length(xtemp2[1,]) for(i in c(1:n)){ ytemp2[,i] = xtemp2[,i] - mean(xtemp2[,i]) } plot(xtemp2[1,],xtemp2[2,],pch=16,lwd=3,col="blue",xlab="X",ylab="Y",xlim=c(-3,10),ylim=c(-3,10),asp=1) points(ytemp2[1,],ytemp2[2,],col="red",lwd=3,pch=16) points(c(xtemp2[1,1]),c(xtemp2[2,1]),lwd=2,col="green",pch=16) lines(c(xtemp2[1,1],ytemp2[1,1]),c(xtemp2[2,1],ytemp2[2,1]),lwd=2,col="green") points(c(xtemp2[1,2]),c(xtemp2[2,2]),lwd=2,col="green",pch=16) lines(c(xtemp2[1,2],ytemp2[1,2]),c(xtemp2[2,2],ytemp2[2,2]),lwd=2,col="green") ################################ # Dimension 3 # Suppose that (X_{1},X_{2},X_{3}) are iid with mean 5 and variance one. library("MASS") library("scatterplot3d") Sigma3 = matrix(c(1,0,0,0,1,0,0,0,1),ncol = 3) xtemp3 = t(mvrnorm(100,c(5,5,5),Sigma3)) ytemp3 = xtemp3 n = length(xtemp3[1,]) for(i in c(1:n)){ ytemp3[,i] = xtemp3[,i] - mean(xtemp3[,i]) } xD = xtemp3[1,] yD = xtemp3[2,] zD = xtemp3[3,] xM = ytemp3[1,] yM = ytemp3[2,] zM = ytemp3[3,] #scatterplot3d(xM[c(1:2)],yM[c(1:2)],zM[c(1:2)],pch = 16, color="blue",lwd=3,xlab = "X1",ylab = "Y1", zlab="Z1",xlim=c(-3,8),ylim = c(-3,8),zlim=c(-3,8)) #scatterplot3d(2,1,1,pch = 16, color="blue",lwd=3,xlab = "X1",ylab = "Y1", zlab="Z1",xlim=c(-3,8),ylim = c(-3,8),zlim=c(-3,8)) sd3 = scatterplot3d(xM,yM,zM,pch = 16, color="red",lwd=3,xlab = "X1",ylab = "Y1", zlab="Z1",xlim=c(-3,8),ylim = c(-3,8),zlim=c(-3,8),angle = 200) sd3$points3d(xD,yD,zD,col="blue",pch=16,lwd=3) plot(xtemp2[1,],xtemp2[2,],pch=16,lwd=3,col="blue",xlab="X",ylab="Y",xlim=c(-3,10),ylim=c(-3,10)) points(ytemp2[1,],ytemp2[2,],col="red",lwd=3,pch=16) ############################ # Section 1.4 # Types of convergence ############################# # DEMONSTRATING DIFFERENT FORMS OF CONVERGENCE, USING THE SAMPLE MEAN # AT DIFFERENT SAMPLE SIZES (n=1...100 or 500). We start with n = 500, then focus # on the case n = 100. # 100 replications are done (it would have been better to do 1000 replications # but I was a bit lazy to wait). ############################ # make a matrix of averages run1 = matrix(rep(0,1000*500),ncol = 500) # simulate from a chi-square 1 #sim1 = matrix(rchisq(100*500,df=1),ncol = 500) sim1 = matrix(rbinom(1000*500,size=5,p=0.1),ncol = 500) for(j in c(1:1000)){ for(i in c(1:500)){ test = sim1[j,c(1:i)] test1 = mean(test) run1[j,i] = test1 } } ############################# # Make plot of first 5 m1 = max(run1[c(1,2,3,4,5),]) m2 = min(run1[c(1,2,3,4,5),]) # Each row of run1 contains averages evaluated from n = 1 to n = 500. # Make a plot of the first 5 plot(c(1:500),run1[1,c(1:500)], type='l', lwd=3, main="",xlab="sample size",ylab="sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8, ylim=c(m2,m1),col="blue") grid(lwd=3) lines(c(1:500),run1[2,c(1:500)],lwd=3,col="red") lines(c(1:500),run1[3,c(1:500)],lwd=3,col="green") lines(c(1:500),run1[4,c(1:500)],lwd=3,col="orange") lines(c(1:500),run1[5,c(1:500)],lwd=3,col="purple") lines(c(1:500),rep(1,500),lty=3,col="pink") ############################# # Make plot of first 100 # To make the plots less cluttered we only use a max sample size of 100. m1 = max(run1[c(1:100),]) m2 = min(run1[c(1:100),]) # Each row of run1 contains averages evaluated from n = 1 to n = 100. # Make a plot of the first 5 plot(c(1:100),run1[1,c(1:100)], type='l', lwd=1, main="",xlab="sample size",ylab="sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8, ylim=c(m2,m1)) grid(lwd=3) for(j in c(2:100)){ lines(c(1:100),run1[j,c(1:100)],lwd=1) } # Also include the standard error. lines(c(1:100),(c(1:100)/2)**(-1/2),col="red",lwd=2) ################################### # calculate and plot the average squared error. msqerr = rep(0,100) for(j in c(1:100)){ temp = sum((run1[,j]-1)**2)/length(run1[,j]) msqerr[j] = temp } # Calculate the bias meanbias = rep(0,100) for(j in c(1:100)){ tempbias = (mean(run1[,j])-1) meanbias[j] = tempbias } ################## #MSE plot finer = seq(5,100,by=5) SigmaTrue = (2/finer) plot(finer,msqerr[finer], type='o', lwd=2, main="",xlab="sample size",ylab="mean square error", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8) lines(finer,msqerr[finer],lwd = 2,lty=2,col="red") #points(seq(2,100,by=1),msqerr[seq(2,100,by=1)],col="red") lines(finer,SigmaTrue,lwd=2,col="blue") ################### #Bias plot plot(c(1:100),meanbias, type='l', lwd=2, main="",xlab="sample size",ylab="mean square error", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8,col="blue") lines(c(1:100),meanbias,lty=2,lwd = 2,col="red") ################################### # Making QQplots, as described in Section 1.5.3, Definition 1.7 # Obtaining normal quantiles (for example, when size n ) # Let x be the input data set. Then n = length(x) # Define the initial vector nq, this will give the quantiles of # the normal distribution (or whatever distribution you want) nq = rep(0,n) # use p=(i-0.5)/n rather than p=i/n to avoid the case p=1 (when i=n) for(i in c(1:n)){ nq[i] = qnorm((i-0.5)/n) # For a t-distribution with df=2 # nq[i] = qt((i-0.5)/n,df=2) } # sort data from smallest to largest xsort = sort(x) # remove the last observation # Plot the quantiles of a normal against the empirical quantiles of # the data plot(nq,xsort, type='o', lwd=2, main="",xlab="normal quantiles",ylab="quantiles of data") ##################################### # plot a histogram of the averages at several time points # n = 1 hist(run1[,1],xlab="sample size",ylab="",main="n=1",xlim=c(0,6)) transform = sort((run1[,1]-1)/sqrt(2)) hist(transform,xlab="sample size",ylab="",main="n=1") # qqplot plot(nq,transform, type='o', lwd=2, main="",xlab="normal quantiles",ylab="quantiles of sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8) lines(nq,nq) #################################### # plot a histogram of the averages at several time points # n = 5 hist(run1[,5],xlab="sample size",ylab="",main="n=5",xlim = c(0,6)) transform = sort((run1[,5]-1)/sqrt(2/5)) hist(transform,xlab="sample size",ylab="",main="n=5") # qqplot plot(nq,transform, type='o', lwd=2, main="",xlab="normal quantiles",ylab="quantiles of sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8) lines(nq,nq) #################################### # plot a histogram of the averages at several time points # n = 10 hist(run1[,10],xlab="sample size",ylab="",main="n=10",xlim = c(0,6)) transform = sort((run1[,10]-1)/sqrt(2/10)) hist(transform,xlab="sample size",ylab="",main="n=10") # qqplot plot(nq,transform, type='o', lwd=2, main="",xlab="normal quantiles",ylab="quantiles of sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8) lines(nq,nq) ######################################### # plot a histogram of the averages at several time points # n = 100 hist(run1[,100],xlab="sample size",ylab="",main="n=100",xlim=c(0,6)) transform = sort((run1[,100]-1)/sqrt(2/100)) hist(transform,xlab="sample size",ylab="",main="n=100") # qqplot plot(nq,transform, type='o', lwd=2, main="",xlab="normal quantiles",ylab="quantiles of sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8) lines(nq,nq) ############################################################ ############################# # Make plot of first 5 # To make the plots less cluttered we only use a max sample size of 100. all = run1[c(1:5),] m1 = max(run1[c(1:5),]) m2 = min(run1[c(1:5),]) all[1,c(15,67)] = rep(2,2) all[2,c(10,78)]=c(2.5,0) #all[3,c(12,37,76)]=rep(2.5,3) all[3,c(50,100)] = c(1.5,2.2) all[4,c(58,92)]=c(1.8,2.4) all[5,c(23,90)]=rep(0.2,2) #all[5,c(34,44,63,84)]=rep(1.8,4) # Each row of run1 contains averages evaluated from n = 1 to n = 200. # Make a plot of the first 5 plot(c(1:100),all[1,c(1:100)], type='l', lwd=2, main="",xlab="sample size",ylab="sample mean", cex.lab=1.3, cex.axis = 1.3, cex.main=1.8, ylim=c(m2,m1),col="blue") points(c(1:100),all[1,c(1:100)]) grid(lwd=3) lines(c(1:100),all[2,c(1:100)],lwd=2,col="red") points(c(1:100),all[2,c(1:100)],lwd=1,col="red") lines(c(1:100),all[3,c(1:100)],lwd=2,col="green") points(c(1:100),all[3,c(1:100)],lwd=1,col="green") lines(c(1:100),all[4,c(1:100)],lwd=2,col="orange") points(c(1:100),all[4,c(1:100)],lwd=1,col="orange") lines(c(1:100),all[5,c(1:100)],lwd=2,col="purple") points(c(1:100),all[5,c(1:100)],lwd=1,col="purple") #lines(c(1:100),rep(1,100),lty=3,col="pink") ############################# # Section 1.5.4 (transformations of the sample means): consider two cases # when the derivative condition is satisfied and not satisfied. ############################## # Sample size n = 200 ############################## test0 = matrix(rnorm(n=200*1000,mean = 0,sd=1),ncol=1000) test05 = matrix(rnorm(n=200*1000,mean = 0.5,sd=1),ncol=1000) mean0 = rep(0,1000) mean05 = rep(0,1000) for(i in c(1:1000)){ mean0[i] = mean(test0[,i]) mean05[i] = mean(test05[,i]) } mean0sq = mean0**2 mean05sq = mean05**2 hist(mean0sq,xlab="sample mean squared",ylab="",main="mean = 0, n = 200") hist(mean05sq,xlab="sample mean squared",ylab="",main="mean = 0.5, n = 200") ############################## # Sample size n = 1000 ############################### test0 = matrix(rnorm(n=1000*1000,mean = 0,sd=1),ncol=1000) test05 = matrix(rnorm(n=1000*1000,mean = 0.5,sd=1),ncol=1000) mean0 = rep(0,1000) mean05 = rep(0,1000) for(i in c(1:1000)){ mean0[i] = mean(test0[,i]) mean05[i] = mean(test05[,i]) } mean0sq = mean0**2 mean05sq = mean05**2 hist(mean0sq,xlab="sample mean squared",ylab="",main="mean = 0, n = 1000") hist(mean05sq,xlab="sample mean squared",ylab="",main="mean = 0.5, n = 1000")