ABSTRACT
We analyze data that are “cherry picked” (i.e., nonrandomly sampled) from a population and are then used for regression modeling and prediction. Nonrandom data are encountered in numerous situations, and the application of standard statistical methods developed for random samples can easily lead to incorrect conclusions. A case study is presented to illustrate the related issues, as well as the repercussions of erroneously ignoring the nonrandom sampling.
APPENDIX
This appendix presents WinBUGS code for the combined analysis of the randomly and nonrandomly sampled data for the JMAK nonlinear regression model.
# JMAK-nonlinear regression here accounts for nonrandom sampling in last 20 data
# each based on 8th-10th order statistic chosen with probabilities .1, .3, .6
# use the Poisson "zeros trick" - see the WinBUGS manual (Spiegelhalter et al., 2010)
# menu item -- Tricks: Advanced Use of the BUGS Language
# submenu item -- Specifying a new sampling distribution
model
{
   # ---- Randomly sampled observations (i = 1..N1): ordinary normal likelihood ----
   for (i in 1 : N1) {
      # JMAK mean curve; the covariate enters as 10^x[i]
      # (presumably x is log10 of the original scale - confirm against the data).
      mu[i] <- 1 - exp(-kparm * pow(pow(10, x[i]), nparm))
      # Standard deviation is linear in x.
      sigma[i] <- beta0 + beta1 * x[i]
      # dnorm is parameterized by precision, i.e., reciprocal variance.
      tau[i] <- 1 / (sigma[i] * sigma[i])
      y[i] ~ dnorm(mu[i], tau[i])
   }

   # ---- Nonrandomly sampled observations (i = N1+1..N2) ----
   # The likelihood of y[i] is a mixture of order-statistic densities, supplied
   # through the Poisson zeros trick: z[i] = 0 is "observed" from a Poisson with
   # mean lambda[i] = -loglikelihood(y[i]) + C, so its contribution exp(-lambda[i])
   # is proportional to the intended likelihood.
   for (i in (N1 + 1) : N2) {
      mu[i] <- 1 - exp(-kparm * pow(pow(10, x[i]), nparm))
      sigma[i] <- beta0 + beta1 * x[i]

      # Shared pieces of the three mixture components.
      std[i] <- (y[i] - mu[i]) / sigma[i]               # standardized residual
      # Normal density without the constant 1/sqrt(2*pi); omitting it only shifts
      # lambda[i] by a constant, which does not change the posterior.
      kern[i] <- exp(-0.5 * std[i] * std[i]) / sigma[i]
      cdf[i] <- phi(std[i])                             # standard normal CDF

      # Mixture components: weight * n!/((r-1)!(n-r)!) * f * F^(r-1) * (1-F)^(n-r).
      # The multipliers 11, 110, 495 and the exponents are those of the 11th, 10th
      # and 9th order statistics of a sample of size 11 (weights .6, .3, .1).
      # NOTE(review): the header comment says "8th-10th order statistic"; confirm
      # which numbering/sample size is intended.
      k1[i] <- .6 * 11 * kern[i] * pow(cdf[i], 10)
      k2[i] <- .3 * 110 * kern[i] * pow(cdf[i], 9) * (1 - cdf[i])
      k3[i] <- .1 * 495 * kern[i] * pow(cdf[i], 8) * pow(1 - cdf[i], 2)

      # Add the large constant C (supplied as data) to keep the Poisson mean positive.
      lambda[i] <- -log(k1[i] + k2[i] + k3[i]) + C
      z[i] ~ dpois(lambda[i])
   }

   # ---- Priors: lognormal, specified by log-scale mean and precision ----
   beta0 ~ dlnorm(mu0, tau0)
   beta1 ~ dlnorm(mu1, tau1)
   kparm ~ dlnorm(mu2, tau2)
   # nparm uses the same precision tau2 as kparm (no tau3 is defined in this model).
   nparm ~ dlnorm(mu3, tau2)

   mu0 <- log(.001)
   tau0 <- 1 / (.1 * .1)
   mu1 <- log(.15)
   tau1 <- 1 / (.2 * .2)
   mu2 <- log(.1)
   mu3 <- log(.6)
   tau2 <- 1 / (.15 * .15)
}
Data
# N1 = number of randomly sampled observations (model indices 1..N1);
# N2 = index of the last observation, so indices N1+1..N2 are the nonrandom sample.
# N2 is 30 so that the last 20 observations are treated as nonrandomly sampled,
# matching the model header comment and the 30-element z vector below.
# C  = constant added to lambda[i] in the zeros trick; z = the all-zero pseudo-data.
# NOTE(review): the response y and covariate x are not included in this list and
# must be loaded separately; confirm they each contain N2 = 30 values.
list(N1=10, N2=30, C=100,
z=c(0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0)
)
Inits
# Starting values for the four stochastic parameters of the model.
list(
   kparm = 0.075,
   nparm = 0.5,
   beta0 = 0.0001,
   beta1 = 0.1
)