###################################################################################################
# File-Name:       	session1.r
# Date:            	17/07/15
# Author:          	DD	
# Machine:         	DD's MacBook Pro
###################################################################################################
# A. Load libraries and set macros
###################################################################################################
rm()
sapply(list.files(pattern="[.]R$", path="R/", full.names=TRUE), source)
#install.packages("ggplot2")
#install.packages("survey")
#install.packages("ri")
library(foreign) 	
library(lattice)
library(graphics)
library(ggplot2)
library(survey)
library(ri)

########################################################
# Set working directory to the folder containing this file
wd <- "/your/working/directory"
setwd(wd)
########################################################
###################################################################################################
# B. Session 1 - Part 2
###################################################################################################
# Simulations in R
###################################################################################################
# (Pseudo) Random number generator
set.seed(010101)
x = runif(1000)
x1 = x[-1000]
x2 = x[-1]
par(mfrow=c(1,3))
hist(x)
plot(x1,x2)
acf(x)

################################################
# Simulation of normal distribution
# Using a loop
set.seed(010101)
par(mfrow=c(2,2))
for (s in c(5, 50, 500, 5000)) {
	nSims = s
	mu = rep(NA,nSims) # sets the vector to be filled
	nSample=15
	for(i in 1:nSims){
		x = rnorm(n=nSample,mean=0,sd=1)
		mu[i] = mean(x)
	}
	hist(mu,main=paste("Histogram of mu for S =",nSims),col="blue",cex.main=1.5)
}

# Using replicate
set.seed(010101)
normalDistr.sim <- function(x){
	var <- rnorm(x)
	return(mean(var))
}

numObs <- 15
par(mfrow = c(2,2))
for(s in c(5,50,500,5000)) {
	sim <- replicate(s, normalDistr.sim(numObs))
	hist(sim, main = paste("Histogram of mu for S =", s), ylab="Density", xlab="var", col="blue")
}

###################################################################################################
# Randomization inference
###################################################################################################
library(ri) # Check out package by Aronow/Samii: https://cran.r-project.org/web/packages/ri/ri.pdf
data <- read.dta("data/fakeData.dta")
data <- matrix(data[1:6,1:2])

y <- data[[1]]
t <- data[[2]]
cluster <- seq(1,6) # we do not cluster in this example
block <- c(rep(1,6)) # we do not block in this example
permutations <- genperms(t,block,cluster)
probability <- genprobexact(t,block,cluster)
ate <- estate(y,t,prob=probability)
permutations
probability
ate
potentialOutcomes <- genouts(y,t,ate=0)
distributionUnderH0 <- gendist(potentialOutcomes,permutations,prob=probability)
dispdist(distributionUnderH0, ate, quantiles=c(.025,0.975))

###################################################################################################
# Statistical power evaluation
###################################################################################################
Check out EGAP: http://egap.org/content/power-analysis-simulations-r
possible.ns <- seq(from=5, to=100, by=5)
power <- rep(NA, length(possible.ns))
alpha <- 0.05
sims <- 500
H_a <- .5

for (j in 1:length(possible.ns)){
  N <- possible.ns[j]
  significant.experiments <- rep(NA, sims)

  for (i in 1:sims){
    p.value <- t.test(rnorm(N/2),rnorm(N/2,H_a))$p.value
    significant.experiments[i] <- (p.value <= alpha)
  }

  power[j] <- mean(significant.experiments)
}

plot(possible.ns, power, ylim=c(0,1))


###################################################################################################
# Performance of standard estimators in small samples
###################################################################################################
# Robustness
###########################################################
# Task 1
Extract t-statistic when assumptions for t-test about underlying population are met;
set.seed(010101)

# Run simulation of t-test for N=4 for sample that meets assumptions
ts = replicate(1000,t.test(rnorm(2),rnorm(2))$statistic)

# assess whether it fits the t-distribution with df=2
range(ts)
pts = seq(-4.5,4.5,length=100)
plot(pts,dt(pts,df=2),col='red',type='l')
lines(density(ts))
qqplot(ts,rt(1000,df=2))
abline(0,1)
probs = c(.9,.95,.99)
quantile(ts,probs)
qt(probs,df=2)

# now simulate 1000 runs of a t-test with N=4, 10, 20, 40
par(mfrow = c(2,2))
for(n in c(2,5,10,20)) {
	x = replicate(1000,t.test(rnorm(n),rnorm(n))$statistic)
	hist(x, main = paste("N =", 2*n), ylab="Density", col="red", xlab="T",freq=FALSE)
	curve(dt(x,2*n-1), add=TRUE, col="blue")
}

# Task 2
# Extract t-statistic when assumptions about underlying population are violated;
par(mfrow = c(2,2))
for(n in c(2,5,10,20)) {u
	x = replicate(1000,t.test(
			rchisq(n, n-1) + ifelse(runif(n)<=.2,5,0),
			rchisq(n, n-1) + ifelse(runif(n)<=.2,5,0)
		)$statistic)
	hist(x, main = paste("N =", 2*n), ylab="Density", col="red", xlab="T",freq=FALSE)
	curve(dt(x,2*n-1), add=TRUE, col="blue")
}

###################################################################################################
# High statistical power
###################################################################################################
possible.ns <- seq(from=4, to=16, by=2) # gets very slow with more observations in ri     
power.ttest <- rep(NA, length(possible.ns))  
power.ranksum <- rep(NA, length(possible.ns))        
power.ri <- rep(NA, length(possible.ns))  
alpha <- 0.05                                 
sims <- 1000                                  
H_a <- 2
distortion <- 2

for (j in 1:length(possible.ns)){
	N <- possible.ns[j]          
	print(N)                   
	significant.experiments.ttest <- rep(NA, sims) 
  	significant.experiments.ranksum <- rep(NA, sims) 
  	significant.experiments.ri <- rep(NA, sims)        
  
  	for (i in 1:sims){
    	p.value.ttest <- t.test(rnorm(N/2),rnorm(N/2,H_a,distortion))$p.value 
    	significant.experiments.ttest[i] <- (p.value.ttest <= alpha) 
    	p.value.ranksum <- wilcox.test(rnorm(N/2),rnorm(N/2,H_a,distortion))$p.value
    	significant.experiments.ranksum[i] <- (p.value.ranksum <= alpha) 
    	p.value.ri <- 
			dispdist(
				gendist(
					genouts(c(rnorm(N/2),rnorm(N/2,H_a,distortion)),c(rep(0,N/2),rep(1,N/2)),ate=0),
					genperms(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N),maxiter=200000),
					prob=genprobexact(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N))
				), 
				estate( 		
					c(rnorm(N/2),rnorm(N/2,H_a,distortion)),
					c(rep(0,N/2),rep(1,N/2)),
					prob=genprobexact(c(rep(0,N/2),rep(1,N/2)),rep(1,N),seq(1,N))
				),
				display.plot=FALSE,
			)$two.tailed.p.value
    	significant.experiments.ri[i] <- (p.value.ri <= alpha) 
  	}
	power.ttest[j] <- mean(significant.experiments.ttest) 
  	power.ranksum[j] <- mean(significant.experiments.ranksum) 
	power.ri[j] <- mean(significant.experiments.ri)      
}

plot.save <- paste("connected_powerByTest.pdf")
pdf(plot.save)
plot(possible.ns, power.ttest, ylim=c(0,1), col="black", xlab="Sample size", ylab="Power")
points(possible.ns, power.ranksum, col="blue")
points(possible.ns, power.ri, col="red")
lines(possible.ns, power.ttest, col="black")
lines(possible.ns, power.ranksum, col="blue")
lines(possible.ns, power.ri, col="red")
legend("top", c("t-test", "rank-sum", "randomization"), ncol=1,lwd=3,col=c("black","blue","red"),
inset = .01)
dev.off()


###################################################################################################
###################################################################################################