################################################################################################### # File-Name: session1.r # Date: 17/07/15 # Author: DD # Machine: DD's MacBook Pro ################################################################################################### # A. Load libraries and set macros ################################################################################################### rm() sapply(list.files(pattern="[.]R$", path="R/", full.names=TRUE), source) #install.packages("ggplot2") #install.packages("survey") #install.packages("ri") library(foreign) library(lattice) library(graphics) library(ggplot2) library(survey) library(ri) ######################################################## # Set working directory to the folder containing this file wd <- "/your/working/directory" setwd(wd) ######################################################## ################################################################################################### # B. Session 1 - Part 2 ################################################################################################### # Simulations in R ################################################################################################### # (Pseudo) Random number generator set.seed(010101) x = runif(1000) x1 = x[-1000] x2 = x[-1] par(mfrow=c(1,3)) hist(x) plot(x1,x2) acf(x) ################################################ # Simulation of normal distribution # Using a loop set.seed(010101) par(mfrow=c(2,2)) for (s in c(5, 50, 500, 5000)) { nSims = s mu = rep(NA,nSims) # sets the vector to be filled nSample=15 for(i in 1:nSims){ x = rnorm(n=nSample,mean=0,sd=1) mu[i] = mean(x) } hist(mu,main=paste("Histogram of mu for S =",nSims),col="blue",cex.main=1.5) } # Using replicate set.seed(010101) normalDistr.sim <- function(x){ var <- rnorm(x) return(mean(var)) } numObs <- 15 par(mfrow = c(2,2)) for(s in c(5,50,500,5000)) { sim <- replicate(s, normalDistr.sim(numObs)) hist(sim, main = paste("Histogram of mu for S =", s), ylab="Density", xlab="var", col="blue") } ################################################################################################### # Randomization inference ################################################################################################### library(ri) # Check out package by Aronow/Samii: https://cran.r-project.org/web/packages/ri/ri.pdf data <- read.dta("data/fakeData.dta") data <- matrix(data[1:6,1:2]) y <- data[[1]] t <- data[[2]] cluster <- seq(1,6) # we do not cluster in this example block <- c(rep(1,6)) # we do not block in this example permutations <- genperms(t,block,cluster) probability <- genprobexact(t,block,cluster) ate <- estate(y,t,prob=probability) permutations probability ate potentialOutcomes <- genouts(y,t,ate=0) distributionUnderH0 <- gendist(potentialOutcomes,permutations,prob=probability) dispdist(distributionUnderH0, ate, quantiles=c(.025,0.975)) ################################################################################################### # Statistical power evaluation ################################################################################################### Check out EGAP: http://egap.org/content/power-analysis-simulations-r possible.ns <- seq(from=5, to=100, by=5) power <- rep(NA, length(possible.ns)) alpha <- 0.05 sims <- 500 H_a <- .5 for (j in 1:length(possible.ns)){ N <- possible.ns[j] significant.experiments <- rep(NA, sims) for (i in 1:sims){ p.value <- t.test(rnorm(N/2),rnorm(N/2,H_a))$p.value significant.experiments[i] <- (p.value <= alpha) } power[j] <- mean(significant.experiments) } plot(possible.ns, power, ylim=c(0,1)) ################################################################################################### # Performance of standard estimators in small samples ################################################################################################### # Robustness ########################################################### # Task 1 Extract t-statistic when assumptions for t-test about underlying population are met; set.seed(010101) # Run simulation of t-test for N=4 for sample that meets assumptions ts = replicate(1000,t.test(rnorm(2),rnorm(2))$statistic) # assess whether it fits the t-distribution with df=2 range(ts) pts = seq(-4.5,4.5,length=100) plot(pts,dt(pts,df=2),col='red',type='l') lines(density(ts)) qqplot(ts,rt(1000,df=2)) abline(0,1) probs = c(.9,.95,.99) quantile(ts,probs) qt(probs,df=2) # now simulate 1000 runs of a t-test with N=4, 10, 20, 40 par(mfrow = c(2,2)) for(n in c(2,5,10,20)) { x = replicate(1000,t.test(rnorm(n),rnorm(n))$statistic) hist(x, main = paste("N =", 2*n), ylab="Density", col="red", xlab="T",freq=FALSE) curve(dt(x,2*n-1), add=TRUE, col="blue") } # Task 2 # Extract t-statistic when assumptions about underlying population are violated; par(mfrow = c(2,2)) for(n in c(2,5,10,20)) {u x = replicate(1000,t.test( rchisq(n, n-1) + ifelse(runif(n)<=.2,5,0), rchisq(n, n-1) + ifelse(runif(n)<=.2,5,0) )$statistic) hist(x, main = paste("N =", 2*n), ylab="Density", col="red", xlab="T",freq=FALSE) curve(dt(x,2*n-1), add=TRUE, col="blue") } ################################################################################################### # High statistical power ################################################################################################### possible.ns <- seq(from=4, to=16, by=2) # gets very slow with more observations in ri power.ttest <- rep(NA, length(possible.ns)) power.ranksum <- rep(NA, length(possible.ns)) power.ri <- rep(NA, length(possible.ns)) alpha <- 0.05 sims <- 1000 H_a <- 2 distortion <- 2 for (j in 1:length(possible.ns)){ N <- possible.ns[j] print(N) significant.experiments.ttest <- rep(NA, sims) significant.experiments.ranksum <- rep(NA, sims) significant.experiments.ri <- rep(NA, sims) for (i in 1:sims){ p.value.ttest <- t.test(rnorm(N/2),rnorm(N/2,H_a,distortion))$p.value significant.experiments.ttest[i] <- (p.value.ttest <= alpha) p.value.ranksum <- wilcox.test(rnorm(N/2),rnorm(N/2,H_a,distortion))$p.value significant.experiments.ranksum[i] <- (p.value.ranksum <= alpha) p.value.ri <- dispdist( gendist( genouts(c(rnorm(N/2),rnorm(N/2,H_a,distortion)),c(rep(0,N/2),rep(1,N/2)),ate=0), genperms(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N),maxiter=200000), prob=genprobexact(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N)) ), estate( c(rnorm(N/2),rnorm(N/2,H_a,distortion)), c(rep(0,N/2),rep(1,N/2)), prob=genprobexact(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N)) ), display.plot=FALSE, )$two.tailed.p.value significant.experiments.ri[i] <- (p.value.ri <= alpha) } power.ttest[j] <- mean(significant.experiments.ttest) power.ranksum[j] <- mean(significant.experiments.ranksum) power.ri[j] <- mean(significant.experiments.ri) } plot.save <- paste("connected_powerByTest.pdf") pdf(plot.save) plot(possible.ns, power.ttest, ylim=c(0,1), col="black", xlab="Sample size", ylab="Power") points(possible.ns, power.ranksum, col="blue") points(possible.ns, power.ri, col="red") lines(possible.ns, power.ttest, col="black") lines(possible.ns, power.ranksum, col="blue") lines(possible.ns, power.ri, col="red") legend("top", c("t-test", "rank-sum", "randomization"), ncol=1,lwd=3,col=c("black","blue","red"), inset = .01) dev.off() ################################################################################################### ###################################################################################################