-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtwo_mixed_exp.R
More file actions
218 lines (188 loc) · 7.32 KB
/
two_mixed_exp.R
File metadata and controls
218 lines (188 loc) · 7.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
## Two-component mixture of exponential mixed Poisson model
two_geometric_model <- function(input_data, cutoff = 10, ...) {
input_data <- breakaway::convert(input_data)
## convert the data to frequency table
included <- input_data[input_data$index <= cutoff,]
excluded <- input_data[input_data$index > cutoff,]
ii <- included$index
ff <- included$frequency
if (nrow(included) == 0) {
included <- input_data
excluded <- list(index = Inf, frequency = 0)
}
c_tau <- sum(included$frequency)
c_excluded <- sum(excluded$frequency)
if (length(ii) <= 4) {
warning("Not enough different frequency counts. We recommend increase the cutoff.")
return(alpha_estimate(estimate = NA,
error = NA,
estimand = "richness",
name = "Two-component Geometric mixture Model",
type = "parametric",
model = "Two-component Geometric mixture Model",
frequentist = TRUE,
parametric = TRUE,
reasonable = TRUE,
interval = NA,
interval_type = "Approximate: log-normal",
GOF0 = NA,
GOF5 = NA,
AICc = NA,
other = list(t1 = NA,
t2 = NA,
t3 = NA,
u = NA,
cutoff = cutoff)))
}
## EM algorithm for obtaining the parameter estimates
em_params <- two_mixed_exp_EM(input_data = included)
u <- em_params$u
t1 <- em_params$t1
t2 <- em_params$t2
t3 <- u * t2 * (1 + t1) / (t1 + t1 * t2 + t2 * u - t1 * u)
C_tau <- c_tau * (1 + t1) * (1 + t2) / (t1 * t2 + t2 - t2 * t3 + t1 * t3)
f0 <- C_tau - c_tau
C_hat <- C_tau + c_excluded
## se
se_hat <- two_mixed_exp_se(C_tau, t1, t2, t3)
## confidence interval
dd <- ifelse(f0 == 0, 1, exp(1.96 * sqrt(log(1 + se_hat^2/f0))))
C_confint <- c(c_tau + c_excluded + f0/dd, c_tau + c_excluded + f0 * dd)
## AICc
AICc <- 6 * c_tau / (c_tau - 4) - 2 * sum(log(1:c_tau)) +
2 * sum(sapply(1:length(ff), function(i) {
sum(log(1:ff[i]))
})) -
2 * sum(ff * log(u / t1 * (t1 / (1 + t1))^ii + (1 - u) / t2 * (t2 / (1 + t2))^ii))
## GOF0 and GOF5
O_c <- ff
pp_E <- u / t1 * (t1 / (1 + t1))^ii + (1-u) / t2 * (t2 / (1 + t2))^ii
E_c <- c_tau * pp_E
GOF0_twomixedexp <- GOF(O_c = O_c, E_c = E_c, param = 3, bin.tol = 0)
GOF5_twomixedexp <- GOF(O_c = O_c, E_c = E_c, param = 3, bin.tol = 5)
alpha_estimate(estimate = C_hat,
error = se_hat,
estimand = "richness",
name = "Two-component Geometric mixture Model",
type = "parametric",
model = "Two-component Geometric mixture Model",
frequentist = TRUE,
parametric = TRUE,
reasonable = TRUE,
interval = C_confint,
interval_type = "Approximate: log-normal",
GOF0 = pchisq(GOF0_twomixedexp$GOF, GOF0_twomixedexp$df, lower.tail = F) %>% signif(4),
GOF5 = pchisq(GOF5_twomixedexp$GOF, GOF5_twomixedexp$df, lower.tail = F) %>% signif(4),
AICc = AICc,
other = list(t1 = t1,
t2 = t2,
t3 = t3,
u = u,
cutoff = cutoff))
}
# u <- 0.5
# t1 <- 3
# t2 <- 6
## compute the likelihood of the parameters
two_mixed_exp_lld <- function(input_data, t1, t2, u) {
ii <- input_data$index
ff <- input_data$frequency
sum(ff * log(u / t1 * (t1 / (1 + t1)) ^ ii +
(1 - u) / t2 * (t2 / (1 + t2)) ^ ii))
}
## Initial values of the EM algorithm
two_mixed_exp_init <- function(input_data) {
ii <- input_data$index
ff <- input_data$frequency
ss <- 2:(length(ii) - 2)
c_tau <- sum(ff)
init_values <- data.frame(u = sapply(ss, function(s) {
sum(ff[1:s]) / c_tau
}),
t1 = sapply(ss, function(s) {
sum(ff[1:s] * ii[1:s]) / sum(ff[1:s]) - 1
}),
t2 = sapply(ss, function(s) {
sum(ff[(s + 1):length(ii)] * ii[(s + 1):length(ii)]) /
sum(ff[(s + 1):length(ii)]) - 1
}))
init_values$lld <- apply(init_values, 1, function(rr) {
two_mixed_exp_lld(input_data, u = rr[1], t1 = rr[2], t2 = rr[3])
})
init_values_mle <- init_values[init_values$lld == max(init_values$lld),]
return(list(u = init_values_mle$u,
t1 = init_values_mle$t1,
t2 = init_values_mle$t2,
lld = init_values_mle$lld))
}
two_mixed_exp_EM <- function(input_data, MaxIter = 1000, tol = 1e-7) {
ii <- input_data$index
ff <- input_data$frequency
c_tau <- sum(ff)
## initial values for EM algorithm
inits <- two_mixed_exp_init(input_data)
u <- inits$u
t1 <- inits$t1
t2 <- inits$t2
lld <- inits$lld
## begin updating
k <- 1
flag <- 0
while (k <= MaxIter) {
# print(paste("k = ", k, ", t1 = ", t1, ", t2 = ", t2,
# ", u = ", u, ", lld = ", lld, sep = ""))
#print(lld)
## update parameters
z <- (u / t1 * (t1 / (1 + t1))^ii) /
(u / t1 * (t1 / (1 + t1))^ii + (1 - u) / t2 * (t2 / (1 + t2))^ii)
u_new <- sum(ff * z) / c_tau
t1_new <- sum(ii * ff * z) / sum(ff * z) - 1
t2_new <- sum(ii * ff * (1 - z)) / sum(ff * (1 - z)) - 1
lld_new <- two_mixed_exp_lld(input_data, t1_new, t2_new, u_new)
## exit iteration if convergent
if(u_new / u > 1 - tol & u_new / u < 1 + tol &
t1_new / t1 > 1 - tol & t1_new / t1 < 1 + tol &
t2_new / t2 > 1 - tol & t2_new / t2 < 1 + tol) {
u <- u_new
t1 <- t1_new
t2 <- t2_new
lld <- lld_new
flag <- 1
break()
}
u <- u_new
t1 <- t1_new
t2 <- t2_new
lld <- lld_new
k <- k + 1
}
return(list(u = u, t1 = t1, t2 = t2, iteration = min(k, MaxIter),
flag = flag))
}
two_mixed_exp_se <- function(cc, t1, t2, t3, MaxIter = 500, tol = 1e-7) {
a00 <- (t2 + t1 * t2 + t1 * t3 - t2 * t3) /
(1 + t1 - t1 * t3 + t2 * t3)
a0 <- matrix(c(t3 * (1 + t2) / ((1 + t1) * (1 + t1 + t2 * t3 - t1 * t3)),
(1 + t1) * (1 - t3) / ((1 + t2) * (1 + t1 - t1 * t3 + t2 * t3)),
(t1 - t2) / (1 + t1 - t1 * t3 + t2 * t3)), nrow = 3)
A <- 0
k <- 0
while(k < MaxIter) {
p_k <- t3 / (1 + t1) * (t1 / (1 + t1)) ^ k +
(1 - t3) / (1 + t2) * (t2 / (1 + t2)) ^ k
p_k1 <- (t1 / (1 + t1)) ^ (k - 1) * (k - t1) * t3 / (1 + t1) ^ 3
p_k2 <- (t2 / (1 + t2)) ^ (k - 1) * (k - t2) * (1 - t3) / (1 + t2) ^ 3
p_k3 <- (t1 / (1 + t1)) ^ k / (1 + t1) - (t2 / (1 + t2)) ^ k / (1 + t2)
p_kj <- matrix(c(p_k1, p_k2, p_k3), ncol = 1)
A_k <- p_kj %*% t(p_kj) / p_k
A <- A + A_k
if (max(abs(A_k)) < tol) break()
k <- k + 1
}
if (any(svd(A)$d < 1e-15)) {
warning("stardard error unattainable due to matrix singularity")
return(NA)
} else {
return(as.numeric(sqrt(cc) / sqrt(a00 - t(a0) %*% solve(A) %*% a0)))
}
}