Introduction

In this post, we show some examples in simulating univariate and multivariate zero-inflated generalized Poisson (ZIGP) data based on the approaches proposed in chen et al. (2024).

Let \((X_1,X_2,\ldots,X_n)^T\) denote a vector of \(n\) discrete random variables. A generalized Poisson (GP) regression model distribution (Consul and Famoye, 1992; Yang et al., 2009) adopts the following probability mass function (PMF) (Consul and Famoye, 2006):

\[\begin{equation} Pr(X_i=x_i \; | \; \theta_i,\lambda_i) = \begin{cases} e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i!, & \text{for } x_i= 0, 1, 2... \\ 0, & \text{for } x_i > m_i \text{ when } \lambda_i < 0 \end{cases} \end{equation}\]

We consider the ZIGP distribution defined by the following PMF:

\[\begin{align} \begin{split} Pr(X_i =x_i \; | \; \omega_i, \theta_i, \lambda_i) = \begin{cases} \omega_i + (1-\omega_i) %Pr_{\theta_i,\lambda_i}(0) e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i! = \omega_i+(1-\omega_i)e^{-\theta_i} & \text{for } x_i= 0 \\ (1-\omega_i) %Pr_{\theta_i,\lambda_i}(x_i) e^{-\theta_i - \lambda_i x_i} \theta_i\left(\theta_i + \lambda_i x_i\right)^{x_i - 1} / x_i! & \text{for } x_i= 1,2,\ldots \\ 0 & \text{for } x_i > m_i \text{ if } \lambda_i < 0\end{cases} \end{split} \end{align}\]

Simulation With Examples

Simulate Univaraite ZIGP Distributed Count Data

Source codes for simulating univariate ZIGP distributed Count Data

require(RNGforGPD)
## Loading required package: RNGforGPD
MOM.genpois <- function(data){
  
  m <- mean(data); v <- var(data)
  
  emp.theta <- sqrt(m^3 / v)
  emp.lambda <- 1 - sqrt(m / v)
  
  return(list(
    emp.theta = emp.theta,
    emp.lambda = emp.lambda
  ))
  
}

################################################################################
# require vgam
################################################################################
GenUniZIGpois <- function (theta, lambda, omega, n, details = TRUE,
                           method = "ZIGP-inversion") {
  
  tol <- .Machine$double.eps^0.5
  if (n <= 0 | abs(n - round(n)) > tol) {
    stop("n must be a positive integer.")
  }
  if (n == 1) 
    stop("Parameter estimates are exactly the same as input of parameters in the case when n = 1!")
  if (theta <= 0) 
    stop("theta has to be greater than 0!")
  if (lambda >= 1) 
    stop("lambda has to be less than 1!")
  if (lambda < 0 & lambda < (-theta)/4) 
    stop(paste("For lambda < 0, lambda must be greater than or equal to -theta/4 which is ", 
               (-theta)/4, "!", sep = ""))
  myset = numeric(n)
  
  if (lambda == 0) {
    # rzipois is called from 'VGAM'
    myset = rzipois(n, theta, pstr0=omega)
    
    if (n != 1) {
      params = MOM.genpois(myset)
      emp.theta = round(params$emp.theta, 6)
      emp.lambda = round(params$emp.lambda, 6)
    }
  }
  
  for (i in 1:n) {
    x <- 0
    u <- runif(1)
    p0 <- omega + (1-omega) * exp(-theta)
    cum.probs <- p0
    while(u > cum.probs) {
      x <- x + 1
      current.prob <- (1-omega) * (theta*(theta+lambda*(x))^(x-1)*exp(-theta-lambda*(x))/factorial(x))
      cum.probs <- current.prob + cum.probs
    }
    myset[i] = x
  }
  
  if (n != 1) {
    params = MOM.genpois(myset)
    emp.theta = round(params$emp.theta, 6)
    emp.lambda = round(params$emp.lambda, 6)
  }
  
  if (details == TRUE) {
    print(paste("Specified theta is ", theta, ", empirical theta is ", 
                emp.theta, ", specified lambda is ", lambda, ", empirical lambda is ", 
                emp.lambda, ".", sep = ""))
  }
  return(invisible(list(data = myset, specified.theta = theta, 
                        empirical.theta = as.numeric(emp.theta), specified.lambda = lambda, 
                        empirical.lambda = as.numeric(emp.lambda), method = method)))
}

Suppose that \(X \sim ZIGP(\theta_1=5,\lambda_1=-0.5,\omega_1=0.3)\)

sim.data <- GenUniZIGpois(5,-0.5,0.3, 50)
## [1] "Specified theta is 5, empirical theta is 2.34849, specified lambda is -0.5, empirical lambda is 0.10363."
head(sim.data)
## $data
##  [1] 2 4 0 0 7 0 1 2 4 2 0 2 4 2 3 3 0 1 4 2 0 3 5 3 0 2 4 4 3 0 0 0 2 4 4 4 2 1
## [39] 5 3 5 5 2 3 5 4 3 2 5 5
## 
## $specified.theta
## [1] 5
## 
## $empirical.theta
## [1] 2.34849
## 
## $specified.lambda
## [1] -0.5
## 
## $empirical.lambda
## [1] 0.10363
## 
## $method
## [1] "ZIGP-inversion"

Simulate Bivariate ZIGP Distributed Count Data

By NORTA-PolyReg

ZIGPmu <- function(theta,lambda,omega) {
  a=lambda/theta
  return((1-omega)*theta/(1-a*theta))
}

# theoretical variance:
ZIGPsigma2 <- function(theta,lambda,omega) {
  a=lambda/theta
  return((1-omega)*theta*(1+theta*omega*(1-a*theta))/((1-a*theta)^3))
}
quantileUniZIGPois <- function(probs, theta, lambda, omega) {
  quant.res <- matrix(rep(0,length(probs)),
                      nrow=length(probs),ncol=1)
  
  for (i in 1:length(quant.res)) {
    x <- 0
    u <- probs[i] # not runif(0,1)!!!!
    p0 <- omega + (1-omega) * exp(-theta)
    cum.probs <- p0
    while(u > cum.probs) {
      x <- x + 1
      current.prob <- (1-omega) * (theta*(theta+lambda*(x))^(x-1)*exp(-theta-lambda*(x))/factorial(x)) 
      cum.probs <- current.prob + cum.probs
    }
    quant.res[i] <- x
  }
  
  return(quant.res)
}
CorrNNZIGoisPolyReg <- function(rho.x,theta,lambda,omega) {
  require(cubature)
  
  # step 0: make sure rho.x is within the correlation bounds
  
  ##############################################################################
  # step 1: find the 99th quantile of a standard Gaussian distribution
  # ZIGPois.quantile.99th <- quantileUniZIGPois(0.9999, theta, lambda, omega)
  
  Gaussian.quantile.99th <- qnorm(0.999999)
  Gaussian.quantile.01th <- qnorm(0.000001)
  
  a0ZIGPDIntgrand <- function(x, theta, lambda, omega) {
    return(quantileUniZIGPois(pnorm(x), theta[1], lambda[1], omega[1]) * dnorm(x))
  }
  b0ZIGPDIntgrand <- function(x, theta, lambda, omega) {
    return(quantileUniZIGPois(pnorm(x), theta[2], lambda[2], omega[2]) * dnorm(x))
  }
  
  a1ZIGPDIntgrand <- function(x, theta, lambda, omega) {
    return(x*quantileUniZIGPois(pnorm(x), theta[1], lambda[1], omega[1]) * dnorm(x))
  }
  b1ZIGPDIntgrand <- function(x, theta, lambda, omega) {
    return(x*quantileUniZIGPois(pnorm(x), theta[2], lambda[2], omega[2]) * dnorm(x))
  }
  
  a0 <- cubintegrate(a0ZIGPDIntgrand,
                     lower=Gaussian.quantile.01th,
                     upper=Gaussian.quantile.99th,
                     theta=theta, lambda=lambda, omega=omega)$integral
  
  b0 <- cubintegrate(b0ZIGPDIntgrand,
                     lower=Gaussian.quantile.01th,
                     upper=Gaussian.quantile.99th,
                     theta=theta, lambda=lambda, omega=omega)$integral
  a1 <- cubintegrate(a1ZIGPDIntgrand,
                     lower=Gaussian.quantile.01th,
                     upper=Gaussian.quantile.99th,
                     theta=theta, lambda=lambda, omega=omega)$integral
  b1 <- cubintegrate(b1ZIGPDIntgrand,
                     lower=Gaussian.quantile.01th,
                     upper=Gaussian.quantile.99th,
                     theta=theta, lambda=lambda, omega=omega)$integral
  
  mu.i <- ZIGPmu(theta[1],lambda[1],omega[1])
  sigma.i <- sqrt(ZIGPsigma2(theta[1],lambda[1],omega[1]))
  
  mu.j <- ZIGPmu(theta[2],lambda[2],omega[2])
  sigma.j <- sqrt(ZIGPsigma2(theta[2],lambda[2],omega[2]))
  
  rho.z <- (rho.x + (mu.i*mu.j)/(sigma.i*sigma.j) - (1/(sigma.i*sigma.j))*(a0*b0))*((sigma.i*sigma.j)/(a1*b1))
  
  # ensure within the [-1,1] bounds
  rho.z <- ifelse(rho.z > 1, 1, rho.z)
  rho.z <- ifelse(rho.z < -1, -1, rho.z)
  
  return(rho.z)
}
# the function below only applies when lambda=0

theo.MOM.ZIGPD <- function(theta,
                           lambda,
                           omega) {
  mu <- theta*(1-lambda)^(-1) # mu of GPD
  M1 = (1-omega)*mu
  sigma2 = (1-omega)*(theta*(1-lambda)^(-3))+omega*(1-omega)*mu^2
  M2 = M1^2+sigma2
  # M2.2 = (1-omega)*theta*(1-lambda)^(-2)*(theta+(1-lambda)^(-1))
  return(list(M1,M2
              # ,M2.2
              ,sigma2))
}
# don't be fooled by the name!
# this function uses MOM to get empirical theta, lambda, and omega
MOM.genpois.ZIGPD <- function(sim.data,
                              abs_tol=1e-2,
                              iter.max=5) {
  require(e1071)
  M1 <- moment(sim.data, order=1)
  M2 <- moment(sim.data, order=2)
  
  # initial  empirical probability of proportion of zero
  p0hat <- length(which(sim.data==0))/length(sim.data)
  current_omega_hat <- p0hat
  
  for (i in 1:iter.max) {
    current_lambda_hat <- 1 - (M2/M1 - M1/(1-current_omega_hat))^(-1/2)
    current_theta_hat <- (M1*(1-current_lambda_hat))/(1-current_omega_hat)
    current_omega_hat <- (p0hat - exp(-current_theta_hat))/(1-exp(-current_theta_hat))
  }
  
  # Code below are for the algorithm by Kamalja, K. K. and Wagh, Y. S. (2022).
  # which gives wrong results
  # theta0hat <- M1/(1-p0hat)
  # lambda0hat <- sqrt(M2/M1-M1/(1-p0hat))
  # 
  # current_theta_hat <- theta0hat
  # current_lambda_hat <- lambda0hat
  # 
  # for (i in 1:iter.max) {
  #   current_omega_hat <- (p0hat-exp(-(current_theta_hat)))/(1-exp(-(current_theta_hat)))
  #   current_theta_hat <- M1/(1-current_omega_hat)
  #   current_lambda_hat <- sqrt(M2/M1-M1/(1-current_omega_hat))
  # }
  
  # need to make sure the emp.omega is non-negative
  if (current_omega_hat < 0) {
    current_omega_hat <- 0
  }
  
  return(list(
    emp.theta = current_theta_hat,
    emp.lambda = current_lambda_hat,
    emp.omega = current_omega_hat
  ))
  
}
ComputeCorrZIGpois <- function(theta.vec, lambda.vec, omega.vec, verbose = TRUE) {
  no.gpois = length(theta.vec)
  samples = 1e+05
  u = matrix(NA, nrow = no.gpois, ncol = samples)
  for (i in 1:no.gpois) {
    u[i, ] = GenUniZIGpois(theta.vec[i], lambda.vec[i], omega.vec[i], samples,
                           method = "Inversion", details = FALSE)$data
  }
  maxmat = minmat = diag(NA, no.gpois)
  errorCount = 0
  for (i in 1:no.gpois) {
    for (j in 1:no.gpois) {
      if (i != j) {
        maxcor = cor(sort(u[i, ]), sort(u[j, ]))
        mincor = cor(sort(u[i, ]), sort(u[j, ], decreasing = TRUE))
        minmat[i, j] = mincor
        maxmat[i, j] = maxcor
        if (verbose == TRUE) {
          cat(".")
        }
      }
    }
  }

  if (verbose == TRUE) {
    cat("\n")
  }
  return(list(min = minmat, max = maxmat))
}
CmatStarZIGpoisPolyReg <- function (Rx, theta.vec, lambda.vec, omega.vec, verbose = TRUE) 
{
  require(matrixcalc); require(Matrix)
  no.gpois = length(theta.vec)
  # Rx is the target correlation matrix
  Rx.star = diag(nrow(Rx))
  
  g = expand.grid(row = 1:nrow(Rx), col = 1:ncol(Rx))
  g.lower.tri = g[lower.tri(Rx, diag = TRUE), ]
  Rx.lower.index = g.lower.tri[-which(g.lower.tri[,1] == g.lower.tri[, 2]),]
    
  for (i in 1:nrow(Rx.lower.index)) {
    i.temp = Rx.lower.index[i,1]
    j.temp = Rx.lower.index[i,2]
    
    Rx.star[i.temp, j.temp] = CorrNNZIGoisPolyReg(Rx[i.temp, j.temp], # target pair correlation
                                                  c(theta.vec[i.temp],theta.vec[j.temp]), # theta param vec
                                                  c(lambda.vec[i.temp], lambda.vec[j.temp]), # lambda param vec
                                                  c(omega.vec[i.temp], omega.vec[j.temp])) # omega param vec
    if (verbose == TRUE) {
      cat(".")
      }
  }
  
  g.upper.tri = g[upper.tri(Rx, diag = TRUE), ]
  Rx.upper.index <- g.upper.tri[-which(g.upper.tri[, 1] == g.upper.tri[, 2]), ]
  
  for (i in 1:nrow(Rx.upper.index)) {
    i.temp = Rx.upper.index[i,1]
    j.temp = Rx.upper.index[i,2]
    
    Rx.star[i.temp, j.temp] = Rx.star[j.temp, i.temp]
  }
  
  # Rx.star[upper.tri(Rx.star)] <- Rx.star[lower.tri(Rx.star)]
  
  if (verbose == TRUE) {
    cat("\n")
  }

  if (!is.positive.definite(Rx.star)) {
    warning("Intermediate correlation matrix is not positive definite. Nearest positive definite matrix is used!")
    Rx.star = as.matrix(nearPD(Rx.star, corr = TRUE,
                                   keepDiag = TRUE)$mat)
  }
  Rx.star = (Rx.star + t(Rx.star))/2
  return(Rx.star)
}
quantileMVZIGPois <- function(probs, d, theta, lambda, omega) {
  quant.res <- matrix(rep(0,nrow(probs)*d),
                      nrow=nrow(probs),ncol=d)
  
  for (j in 1:d) {
    for (i in 1:nrow(quant.res)) {
      x <- 0
      u <- probs[i,j] # not runif(0,1)!!!!
      p0 <- omega[j] + (1-omega[j]) * exp(-theta[j])
      cum.probs <- p0
      while(u > cum.probs) {
        x <- x + 1
        current.prob <- (1-omega[j]) * (theta[j]*(theta[j]+lambda[j]*(x))^(x-1)*exp(-theta[j]-lambda[j]*(x))/factorial(x)) 
        cum.probs <- current.prob + cum.probs
      }
      quant.res[i,j] <- x
    }
  }
  
  return(quant.res)
}
################################
################################################################
# sample.size <- 100
# no.zigpois <- 3
# Rx <- matrix(c(1,0.4,0.5,
# 0.4,1,-0.5,
# 0.5,-0.5,1),nrow=3,ncol=3) # target correlation
# is.positive.definite(Rx)
# where x denotes ZIGPois

# target rate parameters
# theta.vec <- c(2,5,3)
# target dispersion parameters
# lambda.vec <- c(.1,.15,.5)
# target proportions
# omega.vec <- c(0.2,0.2,0.17)

# target correlation
# r <- 0.85

GenMVZIGpoisPolyReg <- function(sample.size, 
                              no.zigpois, 
                              theta.vec, 
                              lambda.vec, 
                              omega.vec, 
                              Rx) {
  require(mvtnorm)
  ##########################################################
  # step 0 make sure each pair of your your specified Rx is
  # within the correlation bounds
  # if not, it will change it to the bounds and
  #  issue an warning!!!
  ###########################################################
  corr.bounds.mat <- ComputeCorrZIGpois(theta.vec, lambda.vec, omega.vec)
  
  for (i in 1:((no.zigpois*(no.zigpois-1)/2))) {
    if (Rx[lower.tri(Rx)][i] < corr.bounds.mat$min[lower.tri(corr.bounds.mat$min)][i]) {
      Rx[lower.tri(Rx)][i] <- corr.bounds.mat$min[lower.tri(corr.bounds.mat$min)][i]
      warning("At least one pair of specified correlation is out of the correlation lower bound, and it/they is/are changed to the lower bound value(s)")
    }
    
    if (Rx[lower.tri(Rx)][i] > corr.bounds.mat$max[lower.tri(corr.bounds.mat$max)][i]) {
      Rx[lower.tri(Rx)][i] <- corr.bounds.mat$max[lower.tri(corr.bounds.mat$max)][i]
      warning("At least one pair of specified correlation is out of the correlation upper bound, and it/they is/are changed to the lower bound value(s)")
    }
  }
  # again:
  # if not within the correlation bounds, issue a warning and
  # adjust the excessive correlations to the correlation bounds
  
  ##################################################
  # step 1: find Rz
  # Rz is the adjusted Gaussian correlation matrix
  Rz <- CmatStarZIGpoisPolyReg(Rx,theta.vec, 
                               lambda.vec, 
                               omega.vec)
  
  XX = rmvnorm(sample.size, rep(0, no.zigpois), Rz)
  
  UU = pnorm(XX)
  # note the quantile function used here is the multivariate version
  XXgpois = quantileMVZIGPois(UU, no.zigpois, theta.vec, lambda.vec, omega.vec)
  
  emp.corr <- cor(XXgpois)
  return(list(data=XXgpois,
              RZIGPois=emp.corr,
              corrected.RX=Rx))
}

#########################
# 3 dimensional case
#########################

require(matrixcalc)
is.positive.definite(matrix(c(1,0.4,0.5,
                              0.4,1,-0.5,
                              0.5,-0.5,1),nrow=3,ncol=3))
## [1] TRUE
# target correlation matrix
matrix(c(1,0.4,0.5,
         0.4,1,-0.5,
         0.5,-0.5,1),nrow=3,ncol=3)
##      [,1] [,2] [,3]
## [1,]  1.0  0.4  0.5
## [2,]  0.4  1.0 -0.5
## [3,]  0.5 -0.5  1.0
set.seed(12345)
# theta, lambda, omega
samp.data.1 <- GenMVZIGpoisPolyReg(50,3,
                    c(2,5,3),
                    c(.1,.15,.5),
                    c(0.2,0.2,0.17),
                    matrix(c(1,0.4,0.5,
                             0.4,1,-0.5,
                             0.5,-0.5,1),nrow=3,ncol=3))
## ......
## ...
## Warning in CmatStarZIGpoisPolyReg(Rx, theta.vec, lambda.vec, omega.vec):
## Intermediate correlation matrix is not positive definite. Nearest positive
## definite matrix is used!
head(samp.data.1)
## $data
##       [,1] [,2] [,3]
##  [1,]    1    8    0
##  [2,]    0    1    1
##  [3,]    0    6    0
##  [4,]    0    7    0
##  [5,]    4    5   10
##  [6,]    0    5    2
##  [7,]    0    3    0
##  [8,]    0    0    6
##  [9,]    2    1   10
## [10,]    1    0    7
## [11,]    0    0    3
## [12,]    3    4   10
## [13,]    5    3   16
## [14,]    6    5   15
## [15,]    4    0   21
## [16,]    0    2    0
## [17,]    3    9    2
## [18,]    3    9    2
## [19,]    0    6    0
## [20,]    4    3   13
## [21,]    4    8    4
## [22,]    6   11    6
## [23,]    0    0   14
## [24,]    2    0    9
## [25,]    2    9    1
## [26,]    2   13    0
## [27,]    3    7    5
## [28,]    2    5    5
## [29,]    2    4    7
## [30,]    0    0    3
## [31,]    1    4    4
## [32,]    5    9    6
## [33,]    2    7    2
## [34,]    1    6    1
## [35,]    2    7    3
## [36,]    0    9    0
## [37,]    0    4    0
## [38,]    4    7    7
## [39,]    1    4    4
## [40,]    0    0    9
## [41,]    1    5    3
## [42,]    0    0    7
## [43,]    2    8    1
## [44,]    2    0   14
## [45,]    4    8    5
## [46,]    4    6    9
## [47,]    3    8    3
## [48,]    1    0    8
## [49,]    1    2    5
## [50,]    4    4   11
## 
## $RZIGPois
##           [,1]       [,2]       [,3]
## [1,] 1.0000000  0.3916545  0.5196481
## [2,] 0.3916545  1.0000000 -0.4677230
## [3,] 0.5196481 -0.4677230  1.0000000
## 
## $corrected.RX
##      [,1] [,2] [,3]
## [1,]  1.0  0.4  0.5
## [2,]  0.4  1.0 -0.5
## [3,]  0.5 -0.5  1.0
# empirical correlation matrix
# samp.data.1$RZIGPois
# corrected target correlation
# samp.data.1$corrected.RX

mean(samp.data.1$RZIGPois-samp.data.1$corrected.RX)
## [1] 0.009684357
#########################
# 5 dimensional case
#########################

# require(matrixcalc)
# is.positive.definite(matrix(c(1,0.4,-0.5,0.2,0.7,
#                               0.4,1,-0.5,0.3,0.4,
#                               -0.5,-0.5,1,-0.4,-0.2,
#                               0.2,0.3,-0.4,1,0.65,
#                               0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5))

# set.seed(12345)
# theta, lambda, omega
# samp.data.2 <- GenMVZIGpoisPolyReg(10000,5,
#                                    c(2,5,3,7,1.4),
#                                    c(.1,.15,.5,.5,0.2),
#                                    c(0.2,0.2,0.17,0.5,0.05),
#                                    matrix(c(1,0.4,-0.5,0.2,0.7,
#                                             0.4,1,-0.5,0.3,0.4,
#                                             -0.5,-0.5,1,-0.4,-0.2,
#                                             0.2,0.3,-0.4,1,0.65,
#                                             0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5))
# target correlation matrix
# samp.data.2$corrected.RX
# round(samp.data.2$RZIGPois,2)

# it seems that lambda (dispersion parameter) too close to 1 
# results in bad results

# mean(round(samp.data.2$RZIGPois,2) - samp.data.2$corrected.RX)
# target rate parameters
# theta.vec <- c(2,5,3,7,1.4)
# target dispersion parameters
# lambda.vec <- c(.1,.15,.5,.9,0.2)
# target proportions
# omega.vec <- c(0.2,0.2,0.17,0.5,0.05)

# Rx <- matrix(c(1,0.4,-0.5,0.2,0.7,
#                0.4,1,-0.5,0.3,0.4,
#                -0.5,-0.5,1,-0.4,-0.2,
#                0.2,0.3,-0.4,1,0.65,
#                0.7,0.4,-0.2,0.65,1),nrow=5,ncol=5)
#