# Chapter 4
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
using Random
using Statistics
using Plots
using Optim
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
# 4.1
# 4.2
# 4.2.1
# 4.2.2
rng = MersenneTwister(123)
N = 200
a = fill(2, N, 2)
b = randn!(rng, zeros(N)).*3 .+ 2
x₀ = fill(0, N)
x₁ = fill(1, N)
υ = randn!(rng, zeros(N))
y = a + [b.*x₀ b.*x₁] + [υ υ]
# 4.2.3
plot(kde(y[1:end,1]))
plot!(kde(y[1:end,2]))
vline!([mean(y[1:end,1]),mean(y[1:end,2])])
F₀ = ecdf(y[1:end,1])
F₁ = ecdf(y[1:end,2])
scatter(F₀)
scatter!(F₁)
density(y[1:end,2] - y[1:end,1])
vline!(fill(0,N))
F = ecdf(y[1:end,2] - y[1:end,1])
scatter(F)
vline!(fill(0,N))
# 4.3
# 4.3.1
mean_diff = mean(y[1:end,2] - y[1:end,1])
diff_mean = mean(y[1:end,2]) - mean(y[1:end,1])
mean_diff - diff_mean
# these should be identical, there must be some rounding difference in the computation.
# 4.3.2
# 4.3.3
X = zeros(N)
u = rand!(rng, zeros(N))
index_0 = []
index_1 = []
for i = 1:N
if u[i] < 0.3
X[i] = 1
index_1 = [index_1; i]
else
index_0 = [index_0; i]
end
end
Y = (1 .- X).*y[1:end,1] + X.*y[1:end,2]
mean(Y[index_1]) - mean(Y[index_0])
# 4.4
# 4.4.1
# 4.4.2
f = function(x)
mean(y₁ .< x) - mean(y₀ .< x - b)
end
f₋ = function(x)
-(mean(y₁ .< x) - mean(y₀ .< x - b))
end
Fᴸ = function(b, y₁, y₀)
a = optimize(f₋, minimum([y₁; y₀]), maximum([y₁; y₀]))
return maximum([-a.minimum 0])
end
Fᵁ = function(b, y₁, y₀)
a = optimize(f, minimum([y₁; y₀]), maximum([y₁; y₀]))
return 1 + minimum([a.minimum 0])
end
K = 50
min_diff = minimum(y[1:end,1]) - maximum(y[1:end,2])
max_diff = maximum(y[1:end,1]) - minimum(y[1:end,2])
δ_diff = (max_diff - min_diff)/K
y_K = min_diff .+ Vector(1:K).*δ_diff
FL = Array{Union{Missing, Float64}}(missing, K, 2)
FU = Array{Union{Missing, Float64}}(missing, K, 2)
global y₀ = y[1:end,1]
global y₁ = y[1:end,2]
for k = 1:K
global b = y_K[k] # need to be careful in tracking variables
FL[k,1] = b
FL[k,2] = Fᴸ(b, y₁, y₀)
FU[k,1] = b
FU[k,2] = Fᵁ(b, y₁, y₀)
#println(k)
end
scatter(F)
scatter!(FL[1:end,1],FL[1:end,2])
scatter!(FU[1:end,1],FU[1:end,2])
vline!(zeros(N))
# 4.5
# 4.5.1
using StatFiles
path = "ENTER PATH HERE"
x = DataFrame(load(string(path,"/seedanalysis_011204_080404_1.dta")))
x1 = DataFrame(treatment=x.treatment, marketing=x.marketing, balchange=x.balchange)
x1 = dropmissing(x1)
index_0 = []
index_1 = []
for i = 1:1777
if x1.treatment[i] == 0 && x1.marketing[i] == 0
index_0 = [index_0; i]
elseif x1.treatment[i] == 1 && x1.marketing[i] == 0
index_1 = [index_1; i]
end
end
bal_0 = x1.balchange[index_0]
bal_1 = x1.balchange[index_1]
lbal_0 = Vector{Union{Missing, Float64}}(missing, length(bal_0))
lbal_0 = log.(bal_0 .+ 2169)
lbal_1 = log.(bal_1 .+ 2169)
mean(bal_1) - mean(bal_0)
# 4.5.2
K = 50
min_diff = minimum(lbal_0) - maximum(lbal_1)
max_diff = maximum(lbal_0) - minimum(lbal_1)
δ_diff = (max_diff - min_diff)/K
y_K = min_diff .+ Vector(1:K).*δ_diff
FL = Array{Union{Missing, Float64}}(missing, K, 2)
FU = Array{Union{Missing, Float64}}(missing, K, 2)
global y₀ = lbal_0
global y₁ = lbal_1
for k = 1:K
global b = y_K[k] # need to be careful in tracking variables
FL[k,1] = b
FL[k,2] = Fᴸ(b, y₁, y₀)
FU[k,1] = b
FU[k,2] = Fᵁ(b, y₁, y₀)
#println(k)
end
scatter(FL[1:end,1],FL[1:end,2])
scatter!(FU[1:end,1],FU[1:end,2])
vline!(zeros(length(lbal_1)))
# 4.5.3
# 4.6
# 4.6.1
# 4.6.2
Random.seed!(123457689)
c = 2
d = 4
f = fill(-1, N)
Z = rand([0, 1], N)
υ₂ = randn!(rng, zeros(N))
x_star = f + c.*υ + d.*Z + υ₂
X = zeros(N)
index₁ = []
index₀ = []
for i = 1:N
if x_star[i] > 0
X[i] = 1
index₁ = [index₁; i]
else
index₀ = [index₀; i]
end
end
Y = (1 .- X).*y[1:end,1] + X.*y[1:end,2]
mean(Y[index₁]) - mean(Y[index₀])
# 4.6.3
# 4.6.4
# 4.6.5
PX₁ = mean(X .== 1)
PX₀ = mean(X .== 0)
EY₁ = mean(Y[index₁])
EY₀ = mean(Y[index₀])
minY = minimum(Y)
maxY = maximum(Y)
(EY₁ - minY)*PX₁ + (maxY - EY₀)*PX₀
(EY₁ - maxY)*PX₁ + (minY - EY₀)*PX₀
# 4.6.6
# 4.6.7
# 4.6.8
EY_11 = mean(Y[(X .== 1).*(Z .== 1)])
EY_10 = mean(Y[(X .== 1).*(Z .== 0)])
EY_01 = mean(Y[(X .== 0).*(Z .== 1)])
EY_00 = mean(Y[(X .== 0).*(Z .== 0)])
PX1_Z1 = mean(X[Z .== 1] .== 1)
PX1_Z0 = mean(X[Z .== 0] .== 1)
PX0_Z1 = mean(X[Z .== 1] .== 0)
PX0_Z0 = mean(X[Z .== 0] .== 0)
minimum([(EY_11 - minY)*PX1_Z1 + (maxY - EY_01)*PX0_Z1, (EY_10 - minY)*PX1_Z0 + (maxY - EY_00)*PX0_Z0])
maximum([(EY_11 - maxY)*PX1_Z1 + (minY - EY_01)*PX0_Z1, (EY_10 - maxY)*PX1_Z0 + (minY - EY_00)*PX0_Z0])
# 4.6.9
# 4.6.10
(maxY - EY₀)*PX₀
(EY₁ - maxY)*PX₁
# 4.7
# 4.7.1
path = "ENTER PATH HERE"
x = DataFrame(load(string(path,"/UpdatedStateLevelData-2010.dta")))
x1 = DataFrame(state=x.state, year=x.year, shalll=x.shalll, rataga=x.rataga, area=x.area)
x1 = dropmissing(x1)
# Note this is a different procedure than used the R code.
states = unique(x1.state)
X = []
Y = []
Z = []
for i = 1:length(states)
state = states[i]
X = [X; sum(x1.shalll[x1.state .== state]) > 0]
Y = [Y; mean(x1.rataga[(x1.state .== state).*(x1.year .> 1990)])]
Z = [Z; mean(x1.area[(x1.state .== state).*(x1.year .> 1990)]) > 53960]
println(i)
end
histogram(x1.rataga)
xlabel!("Average Assault Rate")
# 4.7.2
EY₁ = mean(Y[X .== 1])
EY₀ = mean(Y[X .== 0])
EY₁ - EY₀
# 4.7.3
PX₀ = mean(X .== 0)
PX₁ = mean(X .== 1)
minY = 0
maxY = 100000
(EY₀ - minY)*PX₁ + (maxY - EY₁)*PX₀
(EY₁ - maxY)*PX₁ + (minY - EY₀)*PX₀
minY = minimum(Y)
maxY = maximum(Y)
(EY₀ - minY)*PX₁ + (maxY - EY₁)*PX₀
(EY₁ - maxY)*PX₁ + (minY - EY₀)*PX₀
# 4.7.4
EY_11 = mean(Y[(X .== 1).*(Z .== 1)])
EY_10 = mean(Y[(X .== 1).*(Z .== 0)])
EY_01 = mean(Y[(X .== 0).*(Z .== 1)])
EY_00 = mean(Y[(X .== 0).*(Z .== 0)])
PX1_Z1 = mean(X[Z .== 1] .== 1)
PX1_Z0 = mean(X[Z .== 0] .== 1)
PX0_Z1 = mean(X[Z .== 1] .== 0)
PX0_Z0 = mean(X[Z .== 0] .== 0)
minimum([(EY_11 - minY)*PX1_Z1 + (maxY - EY_01)*PX0_Z1, (EY_10 - minY)*PX1_Z0 + (maxY - EY_00)*PX0_Z0])
maximum([(EY_11 - maxY)*PX1_Z1 + (minY - EY_01)*PX0_Z1, (EY_10 - maxY)*PX1_Z0 + (minY - EY_00)*PX0_Z0])
# 4.7.5
(EY₁ - minY)*PX₁
(minY - EY₀)*PX₀
# 4.8