In this post, we show some examples in simulating univariate and multivariate zero-inflated generalized Poisson (ZIGP) data based on the approaches proposed in chen et al. (2024).
Let \((X_1,X_2,\ldots,X_n)^T\) denote a vector of \(n\) discrete random variables. A generalized Poisson (GP) regression model distribution (Consul and Famoye, 1992; Yang et al., 2009) adopts the following probability mass function (PMF) (Consul and Famoye, 2006):
\[\begin{equation} Pr(X_i=x_i \; | \; \theta_i,\lambda_i) = \begin{cases} e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i!, & \text{for } x_i= 0, 1, 2... \\ 0, & \text{for } x_i > m_i \text{ when } \lambda_i < 0 \end{cases} \end{equation}\]
We consider the ZIGP distribution defined by the following PMF:
\[\begin{align} \begin{split} Pr(X_i =x_i \; | \; \omega_i, \theta_i, \lambda_i) = \begin{cases} \omega_i + (1-\omega_i) %Pr_{\theta_i,\lambda_i}(0) e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i! = \omega_i+(1-\omega_i)e^{-\theta_i} & \text{for } x_i= 0 \\ (1-\omega_i) %Pr_{\theta_i,\lambda_i}(x_i) e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i! & \text{for } x_i= 1,2,\ldots \\ 0 & \text{for } x_i > m_i \text{ if } \lambda_i < 0\end{cases} \end{split} \end{align}\]
Source codes for simulating univariate ZIGP distributed Count Data
require(RNGforGPD)
## Loading required package: RNGforGPD
MOM.genpois <- function(data){
m <- mean(data); v <- var(data)
emp.theta <- sqrt(m^3 / v)
emp.lambda <- 1 - sqrt(m / v)
return(list(
emp.theta = emp.theta,
emp.lambda = emp.lambda
))
}
################################################################################
# require vgam
################################################################################
GenUniZIGpois <- function (theta, lambda, omega, n, details = TRUE,
method = "ZIGP-inversion") {
tol <- .Machine$double.eps^0.5
if (n <= 0 | abs(n - round(n)) > tol) {
stop("n must be a positive integer.")
}
if (n == 1)
stop("Parameter estimates are exactly the same as input of parameters in the case when n = 1!")
if (theta <= 0)
stop("theta has to be greater than 0!")
if (lambda >= 1)
stop("lambda has to be less than 1!")
if (lambda < 0 & lambda < (-theta)/4)
stop(paste("For lambda < 0, lambda must be greater than or equal to -theta/4 which is ",
(-theta)/4, "!", sep = ""))
myset = numeric(n)
if (lambda == 0) {
# rzipois is called from 'VGAM'
myset = rzipois(n, theta, pstr0=omega)
if (n != 1) {
params = MOM.genpois(myset)
emp.theta = round(params$emp.theta, 6)
emp.lambda = round(params$emp.lambda, 6)
}
}
for (i in 1:n) {
x <- 0
u <- runif(1)
p0 <- omega + (1-omega) * exp(-theta)
cum.probs <- p0
while(u > cum.probs) {
x <- x + 1
current.prob <- (1-omega) * (theta*(theta+lambda*(x))^(x-1)*exp(-theta-lambda*(x))/factorial(x))
cum.probs <- current.prob + cum.probs
}
myset[i] = x
}
if (n != 1) {
params = MOM.genpois(myset)
emp.theta = round(params$emp.theta, 6)
emp.lambda = round(params$emp.lambda, 6)
}
if (details == TRUE) {
print(paste("Specified theta is ", theta, ", empirical theta is ",
emp.theta, ", specified lambda is ", lambda, ", empirical lambda is ",
emp.lambda, ".", sep = ""))
}
return(invisible(list(data = myset, specified.theta = theta,
empirical.theta = as.numeric(emp.theta), specified.lambda = lambda,
empirical.lambda = as.numeric(emp.lambda), method = method)))
}
Suppose that \(X \sim ZIGP(\theta_1=5,\lambda_1=-0.5,\omega_1=0.3)\)
sim.data <- GenUniZIGpois(5,-0.5,0.3, 50)
## [1] "Specified theta is 5, empirical theta is 2.34849, specified lambda is -0.5, empirical lambda is 0.10363."
head(sim.data)
## $data
## [1] 2 4 0 0 7 0 1 2 4 2 0 2 4 2 3 3 0 1 4 2 0 3 5 3 0 2 4 4 3 0 0 0 2 4 4 4 2 1
## [39] 5 3 5 5 2 3 5 4 3 2 5 5
##
## $specified.theta
## [1] 5
##
## $empirical.theta
## [1] 2.34849
##
## $specified.lambda
## [1] -0.5
##
## $empirical.lambda
## [1] 0.10363
##
## $method
## [1] "ZIGP-inversion"
ZIGPmu <- function(theta,lambda,omega) {
a=lambda/theta
return((1-omega)*theta/(1-a*theta))
}
# theoretical variance:
ZIGPsigma2 <- function(theta,lambda,omega) {
a=lambda/theta
return((1-omega)*theta*(1+theta*omega*(1-a*theta))/((1-a*theta)^3))
}
quantileUniZIGPois <- function(probs, theta, lambda, omega) {
quant.res <- matrix(rep(0,length(probs)),
nrow=length(probs),ncol=1)
for (i in 1:length(quant.res)) {
x <- 0
u <- probs[i] # not runif(0,1)!!!!
p0 <- omega + (1-omega) * exp(-theta)
cum.probs <- p0
while(u > cum.probs) {
x <- x + 1
current.prob <- (1-omega) * (theta*(theta+lambda*(x))^(x-1)*exp(-theta-lambda*(x))/factorial(x))
cum.probs <- current.prob + cum.probs
}
quant.res[i] <- x
}
return(quant.res)
}
CorrNNZIGoisPolyReg <- function(rho.x,theta,lambda,omega) {
require(cubature)
# step 0: make sure rho.x is within the correlation bounds
##############################################################################
# step 1: find the 99th quantile of a standard Gaussian distribution
# ZIGPois.quantile.99th <- quantileUniZIGPois(0.9999, theta, lambda, omega)
Gaussian.quantile.99th <- qnorm(0.999999)
Gaussian.quantile.01th <- qnorm(0.000001)
a0ZIGPDIntgrand <- function(x, theta, lambda, omega) {
return(quantileUniZIGPois(pnorm(x), theta[1], lambda[1], omega[1]) * dnorm(x))
}
b0ZIGPDIntgrand <- function(x, theta, lambda, omega) {
return(quantileUniZIGPois(pnorm(x), theta[2], lambda[2], omega[2]) * dnorm(x))
}
a1ZIGPDIntgrand <- function(x, theta, lambda, omega) {
return(x*quantileUniZIGPois(pnorm(x), theta[1], lambda[1], omega[1]) * dnorm(x))
}
b1ZIGPDIntgrand <- function(x, theta, lambda, omega) {
return(x*quantileUniZIGPois(pnorm(x), theta[2], lambda[2], omega[2]) * dnorm(x))
}
a0 <- cubintegrate(a0ZIGPDIntgrand,
lower=Gaussian.quantile.01th,
upper=Gaussian.quantile.99th,
theta=theta, lambda=lambda, omega=omega)$integral
b0 <- cubintegrate(b0ZIGPDIntgrand,
lower=Gaussian.quantile.01th,
upper=Gaussian.quantile.99th,
theta=theta, lambda=lambda, omega=omega)$integral
a1 <- cubintegrate(a1ZIGPDIntgrand,
lower=Gaussian.quantile.01th,
upper=Gaussian.quantile.99th,
theta=theta, lambda=lambda, omega=omega)$integral
b1 <- cubintegrate(b1ZIGPDIntgrand,
lower=Gaussian.quantile.01th,
upper=Gaussian.quantile.99th,
theta=theta, lambda=lambda, omega=omega)$integral
mu.i <- ZIGPmu(theta[1],lambda[1],omega[1])
sigma.i <- sqrt(ZIGPsigma2(theta[1],lambda[1],omega[1]))
mu.j <- ZIGPmu(theta[2],lambda[2],omega[2])
sigma.j <- sqrt(ZIGPsigma2(theta[2],lambda[2],omega[2]))
rho.z <- (rho.x + (mu.i*mu.j)/(sigma.i*sigma.j) - (1/(sigma.i*sigma.j))*(a0*b0))*((sigma.i*sigma.j)/(a1*b1))
# ensure within the [-1,1] bounds
rho.z <- ifelse(rho.z > 1, 1, rho.z)
rho.z <- ifelse(rho.z < -1, -1, rho.z)
return(rho.z)
}
# the function below only applies when lambda=0
theo.MOM.ZIGPD <- function(theta,
lambda,
omega) {
mu <- theta*(1-lambda)^(-1) # mu of GPD
M1 = (1-omega)*mu
sigma2 = (1-omega)*(theta*(1-lambda)^(-3))+omega*(1-omega)*mu^2
M2 = M1^2+sigma2
# M2.2 = (1-omega)*theta*(1-lambda)^(-2)*(theta+(1-lambda)^(-1))
return(list(M1,M2
# ,M2.2
,sigma2))
}
# don't be fooled by the name!
# this function uses MOM to get empirical theta, lambda, and omega
MOM.genpois.ZIGPD <- function(sim.data,
abs_tol=1e-2,
iter.max=5) {
require(e1071)
M1 <- moment(sim.data, order=1)
M2 <- moment(sim.data, order=2)
# initial empirical probability of proportion of zero
p0hat <- length(which(sim.data==0))/length(sim.data)
current_omega_hat <- p0hat
for (i in 1:iter.max) {
current_lambda_hat <- 1 - (M2/M1 - M1/(1-current_omega_hat))^(-1/2)
current_theta_hat <- (M1*(1-current_lambda_hat))/(1-current_omega_hat)
current_omega_hat <- (p0hat - exp(-current_theta_hat))/(1-exp(-current_theta_hat))
}
# Code below are for the algorithm by Kamalja, K. K. and Wagh, Y. S. (2022).
# which gives wrong results
# theta0hat <- M1/(1-p0hat)
# lambda0hat <- sqrt(M2/M1-M1/(1-p0hat))
#
# current_theta_hat <- theta0hat
# current_lambda_hat <- lambda0hat
#
# for (i in 1:iter.max) {
# current_omega_hat <- (p0hat-exp(-(current_theta_hat)))/(1-exp(-(current_theta_hat)))
# current_theta_hat <- M1/(1-current_omega_hat)
# current_lambda_hat <- sqrt(M2/M1-M1/(1-current_omega_hat))
# }
# need to make sure the emp.omega is non-negative
if (current_omega_hat < 0) {
current_omega_hat <- 0
}
return(list(
emp.theta = current_theta_hat,
emp.lambda = current_lambda_hat,
emp.omega = current_omega_hat
))
}
ComputeCorrZIGpois <- function(theta.vec, lambda.vec, omega.vec, verbose = TRUE) {
no.gpois = length(theta.vec)
samples = 1e+05
u = matrix(NA, nrow = no.gpois, ncol = samples)
for (i in 1:no.gpois) {
u[i, ] = GenUniZIGpois(theta.vec[i], lambda.vec[i], omega.vec[i], samples,
method = "Inversion", details = FALSE)$data
}
maxmat = minmat = diag(NA, no.gpois)
errorCount = 0
for (i in 1:no.gpois) {
for (j in 1:no.gpois) {
if (i != j) {
maxcor = cor(sort(u[i, ]), sort(u[j, ]))
mincor = cor(sort(u[i, ]), sort(u[j, ], decreasing = TRUE))
minmat[i, j] = mincor
maxmat[i, j] = maxcor
if (verbose == TRUE) {
cat(".")
}
}
}
}
if (verbose == TRUE) {
cat("\n")
}
return(list(min = minmat, max = maxmat))
}
CmatStarZIGpoisPolyReg <- function (Rx, theta.vec, lambda.vec, omega.vec, verbose = TRUE)
{
require(matrixcalc); require(Matrix)
no.gpois = length(theta.vec)
# Rx is the target correlation matrix
Rx.star = diag(nrow(Rx))
g = expand.grid(row = 1:nrow(Rx), col = 1:ncol(Rx))
g.lower.tri = g[lower.tri(Rx, diag = TRUE), ]
Rx.lower.index = g.lower.tri[-which(g.lower.tri[,1] == g.lower.tri[, 2]),]
for (i in 1:nrow(Rx.lower.index)) {
i.temp = Rx.lower.index[i,1]
j.temp = Rx.lower.index[i,2]
Rx.star[i.temp, j.temp] = CorrNNZIGoisPolyReg(Rx[i.temp, j.temp], # target pair correlation
c(theta.vec[i.temp],theta.vec[j.temp]), # theta param vec
c(lambda.vec[i.temp], lambda.vec[j.temp]), # lambda param vec
c(omega.vec[i.temp], omega.vec[j.temp])) # omega param vec
if (verbose == TRUE) {
cat(".")
}
}
g.upper.tri = g[upper.tri(Rx, diag = TRUE), ]
Rx.upper.index <- g.upper.tri[-which(g.upper.tri[, 1] == g.upper.tri[, 2]), ]
for (i in 1:nrow(Rx.upper.index)) {
i.temp = Rx.upper.index[i,1]
j.temp = Rx.upper.index[i,2]
Rx.star[i.temp, j.temp] = Rx.star[j.temp, i.temp]
}
# Rx.star[upper.tri(Rx.star)] <- Rx.star[lower.tri(Rx.star)]
if (verbose == TRUE) {
cat("\n")
}
if (!is.positive.definite(Rx.star)) {
warning("Intermediate correlation matrix is not positive definite. Nearest positive definite matrix is used!")
Rx.star = as.matrix(nearPD(Rx.star, corr = TRUE,
keepDiag = TRUE)$mat)
}
Rx.star = (Rx.star + t(Rx.star))/2
return(Rx.star)
}
quantileMVZIGPois <- function(probs, d, theta, lambda, omega) {
quant.res <- matrix(rep(0,nrow(probs)*d),
nrow=nrow(probs),ncol=d)
for (j in 1:d) {
for (i in 1:nrow(quant.res)) {
x <- 0
u <- probs[i,j] # not runif(0,1)!!!!
p0 <- omega[j] + (1-omega[j]) * exp(-theta[j])
cum.probs <- p0
while(u > cum.probs) {
x <- x + 1
current.prob <- (1-omega[j]) * (theta[j]*(theta[j]+lambda[j]*(x))^(x-1)*exp(-theta[j]-lambda[j]*(x))/factorial(x))
cum.probs <- current.prob + cum.probs
}
quant.res[i,j] <- x
}
}
return(quant.res)
}
################################
################################################################
# sample.size <- 100
# no.zigpois <- 3
# Rx <- matrix(c(1,0.4,0.5,
# 0.4,1,-0.5,
# 0.5,-0.5,1),nrow=3,ncol=3) # target correlation
# is.positive.definite(Rx)
# where x denotes ZIGPois
# target rate parameters
# theta.vec <- c(2,5,3)
# target dispersion parameters
# lambda.vec <- c(.1,.15,.5)
# target proportions
# omega.vec <- c(0.2,0.2,0.17)
# target correlation
# r <- 0.85
GenMVZIGpoisPolyReg <- function(sample.size,
no.zigpois,
theta.vec,
lambda.vec,
omega.vec,
Rx) {
require(mvtnorm)
##########################################################
# step 0 make sure each pair of your your specified Rx is
# within the correlation bounds
# if not, it will change it to the bounds and
# issue an warning!!!
###########################################################
corr.bounds.mat <- ComputeCorrZIGpois(theta.vec, lambda.vec, omega.vec)
for (i in 1:((no.zigpois*(no.zigpois-1)/2))) {
if (Rx[lower.tri(Rx)][i] < corr.bounds.mat$min[lower.tri(corr.bounds.mat$min)][i]) {
Rx[lower.tri(Rx)][i] <- corr.bounds.mat$min[lower.tri(corr.bounds.mat$min)][i]
warning("At least one pair of specified correlation is out of the correlation lower bound, and it/they is/are changed to the lower bound value(s)")
}
if (Rx[lower.tri(Rx)][i] > corr.bounds.mat$max[lower.tri(corr.bounds.mat$max)][i]) {
Rx[lower.tri(Rx)][i] <- corr.bounds.mat$max[lower.tri(corr.bounds.mat$max)][i]
warning("At least one pair of specified correlation is out of the correlation upper bound, and it/they is/are changed to the lower bound value(s)")
}
}
# again:
# if not within the correlation bounds, issue a warning and
# adjust the excessive correlations to the correlation bounds
##################################################
# step 1: find Rz
# Rz is the adjusted Gaussian correlation matrix
Rz <- CmatStarZIGpoisPolyReg(Rx,theta.vec,
lambda.vec,
omega.vec)
XX = rmvnorm(sample.size, rep(0, no.zigpois), Rz)
UU = pnorm(XX)
# note the quantile function used here is the multivariate version
XXgpois = quantileMVZIGPois(UU, no.zigpois, theta.vec, lambda.vec, omega.vec)
emp.corr <- cor(XXgpois)
return(list(data=XXgpois,
RZIGPois=emp.corr,
corrected.RX=Rx))
}
#########################
# 3 dimensional case
#########################
require(matrixcalc)
is.positive.definite(matrix(c(1,0.4,0.5,
0.4,1,-0.5,
0.5,-0.5,1),nrow=3,ncol=3))
## [1] TRUE
# target correlation matrix
matrix(c(1,0.4,0.5,
0.4,1,-0.5,
0.5,-0.5,1),nrow=3,ncol=3)
## [,1] [,2] [,3]
## [1,] 1.0 0.4 0.5
## [2,] 0.4 1.0 -0.5
## [3,] 0.5 -0.5 1.0
set.seed(12345)
# theta, lambda, omega
samp.data.1 <- GenMVZIGpoisPolyReg(50,3,
c(2,5,3),
c(.1,.15,.5),
c(0.2,0.2,0.17),
matrix(c(1,0.4,0.5,
0.4,1,-0.5,
0.5,-0.5,1),nrow=3,ncol=3))
## ......
## ...
## Warning in CmatStarZIGpoisPolyReg(Rx, theta.vec, lambda.vec, omega.vec):
## Intermediate correlation matrix is not positive definite. Nearest positive
## definite matrix is used!
head(samp.data.1)
## $data
## [,1] [,2] [,3]
## [1,] 1 8 0
## [2,] 0 1 1
## [3,] 0 6 0
## [4,] 0 7 0
## [5,] 4 5 10
## [6,] 0 5 2
## [7,] 0 3 0
## [8,] 0 0 6
## [9,] 2 1 10
## [10,] 1 0 7
## [11,] 0 0 3
## [12,] 3 4 10
## [13,] 5 3 16
## [14,] 6 5 15
## [15,] 4 0 21
## [16,] 0 2 0
## [17,] 3 9 2
## [18,] 3 9 2
## [19,] 0 6 0
## [20,] 4 3 13
## [21,] 4 8 4
## [22,] 6 11 6
## [23,] 0 0 14
## [24,] 2 0 9
## [25,] 2 9 1
## [26,] 2 13 0
## [27,] 3 7 5
## [28,] 2 5 5
## [29,] 2 4 7
## [30,] 0 0 3
## [31,] 1 4 4
## [32,] 5 9 6
## [33,] 2 7 2
## [34,] 1 6 1
## [35,] 2 7 3
## [36,] 0 9 0
## [37,] 0 4 0
## [38,] 4 7 7
## [39,] 1 4 4
## [40,] 0 0 9
## [41,] 1 5 3
## [42,] 0 0 7
## [43,] 2 8 1
## [44,] 2 0 14
## [45,] 4 8 5
## [46,] 4 6 9
## [47,] 3 8 3
## [48,] 1 0 8
## [49,] 1 2 5
## [50,] 4 4 11
##
## $RZIGPois
## [,1] [,2] [,3]
## [1,] 1.0000000 0.3916545 0.5196481
## [2,] 0.3916545 1.0000000 -0.4677230
## [3,] 0.5196481 -0.4677230 1.0000000
##
## $corrected.RX
## [,1] [,2] [,3]
## [1,] 1.0 0.4 0.5
## [2,] 0.4 1.0 -0.5
## [3,] 0.5 -0.5 1.0
# empirical correlation matrix
# samp.data.1$RZIGPois
# corrected target correlation
# samp.data.1$corrected.RX
mean(samp.data.1$RZIGPois-samp.data.1$corrected.RX)
## [1] 0.009684357
#########################
# 5 dimensional case
#########################
# require(matrixcalc)
# is.positive.definite(matrix(c(1,0.4,-0.5,0.2,0.7,
# 0.4,1,-0.5,0.3,0.4,
# -0.5,-0.5,1,-0.4,-0.2,
# 0.2,0.3,-0.4,1,0.65,
# 0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5))
# set.seed(12345)
# theta, lambda, omega
# samp.data.2 <- GenMVZIGpoisPolyReg(10000,5,
# c(2,5,3,7,1.4),
# c(.1,.15,.5,.5,0.2),
# c(0.2,0.2,0.17,0.5,0.05),
# matrix(c(1,0.4,-0.5,0.2,0.7,
# 0.4,1,-0.5,0.3,0.4,
# -0.5,-0.5,1,-0.4,-0.2,
# 0.2,0.3,-0.4,1,0.65,
# 0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5))
# target correlation matrix
# samp.data.2$corrected.RX
# round(samp.data.2$RZIGPois,2)
# it seems that lambda (dispersion parameter) too close to 1
# results in bad results
# mean(round(samp.data.2$RZIGPois,2) - samp.data.2$corrected.RX)
# target rate parameters
# theta.vec <- c(2,5,3,7,1.4)
# target dispersion parameters
# lambda.vec <- c(.1,.15,.5,.9,0.2)
# target proportions
# omega.vec <- c(0.2,0.2,0.17,0.5,0.05)
# Rx <- matrix(c(1,0.4,-0.5,0.2,0.7,
# 0.4,1,-0.5,0.3,0.4,
# -0.5,-0.5,1,-0.4,-0.2,
# 0.2,0.3,-0.4,1,0.65,
# 0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5)
#