################################################################################ # R functions used to implement the MB, MF/MB, MF2 and MF/MF2 procedures # from paper "Model-free model-fitting and predictive distributions" # by D.N. Politis (March 2010--revised April 2012). # Modified for model-free regression: Liang Wang and Srinjoy Das - June 2015 ################################################################################ ####################################################################### ####### BELOW ARE THREE OPTIONS FOR THE uniformize FUNCTION ########### ####################################################################### uniformize.linear <- function(x, x0, inverse=FALSE, w) { # MARCH 2010 # this function implements transformation (35) and its inverse # using a piecewise linear estimate of local cdf # IF inverse=FALSE, then x is really the Y data in regression # IF inverse=TRUE, then x is actually the x variable # x0 can be vector! n <- length(x) if (n <= 1) return(NA) # ("sample size too small!") if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 midpoints <- c(2*x[1] - midpoints[1], midpoints, 2*x[n] - midpoints[n-1]) if(inverse) { Finv <- approxfun(w, midpoints) return(Finv(x0)) } else { Fmid <- approxfun(midpoints, w) F <- function(t) ifelse(t <= midpoints[1], 0, ifelse(t >= midpoints[n+1], 1, Fmid(t))) return(F(x0)) } } ############################################################################ # BELOW IS A NEW VERSION OF THE uniformize R FUNCTION (Oct. 2011) uniformize.linear2 <- function(x, x0, inverse=FALSE, w) { # October 2011 # this function implements transformation (35) and its inverse # using a piecewise linear estimate of local cdf # IF inverse=FALSE, then x is really the Y data in regression # IF inverse=TRUE, then x is actually the x variable # x0 can be vector! n <- length(x) if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 lwr <- min(2* midpoints[1] - midpoints[2], x[1] - (midpoints[1]-x[1])) upr <- max(2* midpoints[n-1] - midpoints[n-2], x[n] + (x[n] - midpoints[n-1])) midpoints <- c(lwr, midpoints, upr) if(inverse) { Finv <- approxfun(w, midpoints) return(Finv(x0)) } else { Fmid <- approxfun(midpoints, w) F <- function(t) ifelse(t <= midpoints[1], 0, ifelse(t >= midpoints[n+1], 1, Fmid(t))) return(F(x0)) } } ################################################################## uniformizeSPlus <- function(x, x0, inverse=FALSE, w) { # Splus version of uniformize.linear n <- length(x) if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 lwr <- min(2* midpoints[1] - midpoints[2], x[1] - (midpoints[1]-x[1])) upr <- max(2* midpoints[n-1] - midpoints[n-2], x[n] + (x[n] - midpoints[n-1])) midpoints <- c(lwr, midpoints, upr) if(inverse) { return(approx(w, midpoints, x0)) } else { F <- function(t) { ret <- numeric(length(t)) zeros <- (t <= midpoints[1]) ones <- (t >= midpoints[n+1]) mid <- ((t > midpoints[1]) & (t < midpoints[n+1])) ret[mid] <- approx(midpoints, w, t[mid])$y ret[ones] <- 1 ret[zeros] <- 0 return(ret) } return(F(x0)) } } ################################################################# uniformize.kernel <- function(x, x0, inverse=FALSE, w) { # APRIL 2012 # New R function that does uniformize on the basis # of a kernel smoothed edf (need kernel > 0 everywhere # to ensure the resulting edf is invertible) n <- length(x) if (n <= 1) return(NA) # ("sample size too small!") if(missing(w)) { w <- c(1:n)*0+1/n } if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } if ( is.na(sum(x)) ) { w<- w[!is.na(x)] w <- w/sum(w) x<- x[!is.na(x)] warning("missing values in x") } # first compute the smoothed edf, i.e., the inverse=FALSE case ppdf<- density(x, weights = w ) # default uses gaussian kernel npdf<- length(ppdf$x) delta<- ppdf$x[2]-ppdf$x[1] pcdf<- cumsum(ppdf$y)*delta if(inverse) { FFinv <- approxfun(pcdf, ppdf$x) return(FFinv(x0)) #In the inverse case x0 denotes the vector of y values to which FFinv is applied } else { FF <- approxfun(ppdf$x, pcdf) return(FF(x0)) #Here x0 denotes the vector of x values to which FF is applied }} #################################################################### uniformize <- uniformize.kernel # or: uniformize <- uniformize.linear2 ##################################################################### inverse.cdf <- function(x, prob, w) { # 04/20/2015 # New R function that performs a weighted quantile inverse # of a kernel smoothed edf (need kernel > 0 everywhere # to ensure the resulting edf is invertible) n <- length(x) if (n <= 1) return(NA) # ("sample size too small!") if(missing(w)) { w <- c(1:n)*0+1/n } if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } if ( is.na(sum(x)) ) { w<- w[!is.na(x)] w <- w/sum(w) x<- x[!is.na(x)] warning("missing values in x") } # include package for weighted inverse quantile function require(Hmisc) # Only use step quantile function # Using wtd.ecdf, exactly follows the mathematical definition of quantile function. wtdecdf=wtd.Ecdf(x,weights=w,type='i/n') out=sapply(prob,function(pr)wtdecdf$x[min(which(wtdecdf$ecdf>=pr))]) return(out) } ##################################################################### MFpp<- function(x,Y,xf,ban, box=FALSE, L2=TRUE, delete=FALSE, edgecorrect=TRUE) { # returns MF2 point predictor (L1 or L2) of Yf at xf (as in Table 4.1) # using box or normal kernel (default) for cdf estimation # edgecorrect=TRUE implies omitting the u's close to the edges # as in Remark 4.8 of the paper YYY<- MFppDISTR(x,Y,xf,ban, box, delete, edgecorrect) Yf<- mean(YYY, na.rm=TRUE) if (L2==FALSE) {Yf<- median(YYY, na.rm=TRUE)} Yf } ############################################################ MFppDISTR<- function(x,Y,xf,ban, box=FALSE,delete=FALSE, edgecorrect=TRUE) { # returns MF2 naive predictive distr. of Yf at xf (to take mean or median of) # (x,Y) are data from regression; ban=bandwidth to use at xf # xf is a SINGLE number; box=TRUE means rectangular kernel (the default is Normal) # delete=TRUE only works for normal kernel here. n<- length(x) u<- c(1:n) if (box==TRUE){ # rectangular (Box) kernel in uniformize for (i in 1:n){ # this computes the u_t of eq. (35) yw<- Y[ abs(x-x[i])< ban] # w=constant, Rectangular Window u[i]<- uniformize(yw, Y[i], inverse=FALSE) } # qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", # ylab="quantiles of Uniform (0,1)" ) u<- na.rm(u) Yf<-uniformize(Y[ abs(x-xf)< ban], u, inverse=TRUE) } if (box==FALSE){ # if not rectangular, then use normal kernel ## with IQ range =(-ban/4 ,ban/4)*stdev as Box, i.e., stdev=ban/2.7=0.37*ban #for (i in 1:n){ # this computes the u_t of eq. (35) # w<- dnorm((x-x[i]), mean = 0, sd = (0.37*ban)) # u[i]<- uniformize(Y, Y[i], inverse=FALSE, w=w) } ## qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", ## ylab="quantiles of Uniform (0,1)" ) u<- getU(x,Y,ban,delete, edgecorrect) w<- dnorm((x-xf), mean = 0, sd = (0.37*ban)) Yf<-uniformize(Y, u, inverse=TRUE, w=w) } Yf } ############################################################ MFpp_new<- function(x,Y,xf,ustar,ban, box=FALSE, L2=TRUE,fusion=0) { # returns MF2 point predictor (L1 or L2) of Yf at xf (as in Table 4.1) # using box or normal kernel (default) for cdf estimation # edgecorrect=TRUE implies omitting the u's close to the edges # as in Remark 4.8 of the paper YYY<- MFppDISTR_new(x,Y,xf,ustar,ban, box, fusion) Yf<- mean(YYY, na.rm=TRUE) if (L2==FALSE) {Yf<- median(YYY, na.rm=TRUE)} Yf } ############################################################ MFppDISTR_new<- function(x,Y,xf,ustar,ban, box=FALSE, fusion=0) { # returns MF2 naive predictive distr. of Yf at xf (to take mean or median of) # (x,Y) are data from regression; ban=bandwidth to use at xf # xf is a SINGLE number; box=TRUE means rectangular kernel (the default is Normal) # delete=TRUE only works for normal kernel here. # add fusion for LMF n<- length(x) u<- c(1:n) if (box==TRUE){ # rectangular (Box) kernel in uniformize for (i in 1:n){ # this computes the u_t of eq. (35) yw<- Y[ abs(x-x[i])< ban] # w=constant, Rectangular Window u[i]<- uniformize(yw, Y[i], inverse=FALSE) } # qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", # ylab="quantiles of Uniform (0,1)" ) u<- na.rm(u) Yf<-uniformize(Y[ abs(x-xf)< ban], u, inverse=TRUE) } if (box==FALSE){ # if not rectangular, then use normal kernel ## with IQ range =(-ban/4 ,ban/4)*stdev as Box, i.e., stdev=ban/2.7=0.37*ban #for (i in 1:n){ # this computes the u_t of eq. (35) # w<- dnorm((x-x[i]), mean = 0, sd = (0.37*ban)) # u[i]<- uniformize(Y, Y[i], inverse=FALSE, w=w) } ## qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", ## ylab="quantiles of Uniform (0,1)" ) ####################################################################################### # NOTE: DO NOT REGENERATE THE u FROM Y* - this may not be same as u* (denote as u**) # u<- getU(x,Y,ban,delete, edgecorrect) ####################################################################################### w<- dnorm((x-xf), mean = 0, sd = (0.37*ban)) if (fusion == 0) {Yf<-uniformize(Y, ustar, inverse=TRUE, w=w)} # for MF and PMF else {Yf <- inverse.cdf(Y,ustar,w=w)} # for LMF } Yf } ######################################################### crossvalMF<- function(x,y, nstep=20,bmax=0, box=FALSE, L1=TRUE){ # finds the best bandwidth for MPpp by cross-validation # searches for bandwidths from zero to bmax # uses criterion L2 or L1 (default) # nstep is number of grid points for bandwidth search # box=FALSE means using Normal kernel for smoothing # IN VIEW OF THE CLOSENESS OF MFpp TO A KERNEL SMOOTHER THIS GIVES # ALMOST IDENTICAL ANSWERS AS REGULAR crossval WHICH IS MUCH FASTER if (bmax==0){bmax<- 1.5*sqrt(var(x))} bstep<- bmax/nstep n<- length(x) SSE<- c(1:nstep)*0 if(L1==FALSE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) # xx,yy are delete-k data Yf<- MFpp(xx,yy,x[k],ban=(i*bstep),box=FALSE,L2=TRUE) SSE[i]<- SSE[i]+(y[k]-Yf )^2 }}} if(L1==TRUE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) # xx,yy are delete-k data Yf<- MFpp(xx,yy,x[k],ban=(i*bstep),box=FALSE,L2=FALSE) SSE[i]<- SSE[i]+abs(y[k]-Yf ) }} } k<- c(1:nstep)*bstep k<- k[SSE==min(SSE,na.rm = TRUE)] na.rm(k)} ########################################################### na.rm<-function(x) { x[!is.na(x)] } ############################################################ crossval<- function(x,y, nstep=20,bmax=0, L1=TRUE){ # finds the best bandwidth for ksmooth by cross-validation # searches for bandwidts from zero to bmax # uses criterion L1 or L2 (default) # nstep is number of grid points for bandwidth search # stud=TRUE means studentize by kernel smoothed estimate of variance # (with same bandwidth) if (bmax==0){bmax<- 1.5*sqrt(var(x))} bstep<- bmax/nstep n<- length(x) SSE<- c(1:nstep)*0 if(L1==FALSE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) kss<- ksmooth(xx,yy,"normal",ban=(i*bstep),x.points=x[k]) # rr<- kresid(xx,yy, (i*bstep)) SSE[i]<- SSE[i]+(y[k]-kss$y )^2 }}} if(L1==TRUE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) kss<- ksmooth(xx,yy,"normal",ban=(i*bstep),x.points=x[k]) # rr<- kresid(xx,yy, (i*bstep)) SSE[i]<- SSE[i]+ abs(y[k]-kss$y ) }}} k<- c(1:nstep)*bstep k<- k[SSE==min(SSE,na.rm = TRUE)] na.rm(k)} ########################################################## pred.intMF3<- function(x,y,xfut, fusion=0, model=0, B=999, alpha=0.9){ # gives key quantiles of the predictive distribution of yfut # to be observed at the value xfut using L2 predictors # based on ROOT method of MF3 paper Sections 3 and 4 # options: model =0 means MB; model=1 means MF/MB; # model =2 means MF2; and model=3 means MF/MF2 (MF3!). # xfut can be multivariate if model =0 or 1 only. # Cases MF2 and MF/MF2 are rather slow due to double loop # xfut now can be multivariate even if model = 2 or 3 and fusion =0 or 1 # fusion=1 only works for model=2 and 3. Please do not let fusion=1 when model<2 # when fusion=0, model=2 is MF, model=3 is PMF; when fusion=1, both model=2 and 3 are LMF (no difference) # Now when xfut is multivariate, e.g., xfut has k elements, then the output is a sequence with 2k elements. # for ith value of xfut, [(2i-1)th,(2i)th] values of output is the corresponding predictive interval. # add parameter alpha # If using NW estimator instead of PI, please search "NW estimator" and change it manually; # PMF with NW estimator might produce many NA values (caused by ksmooth function) n<- length(x) pp<- length(xfut) # remove the following warning #if (pp>1 & model==2) warning("need univariate xfut for MF2 option") pseudoresp<- matrix(0,B,pp) pred.df<- matrix(0,B,pp) # each column is an empirical pred.d.f. for Yfut[j] at xfut[j] b<- crossval(x,y, L1=T) # L1 cross validation # b<- crossvalMF(x,y, L1=T) # is an alternative but indistinguishable from above ks <- ksmooth(x,y,"normal",ban= b,x.points = x) # Get a regression estimate for y vs. x. ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = x) mm<- ks$y ss<- sqrt(ks2$y-mm^2) ks<- ksmooth(x,y,"normal",ban= b,x.points = xfut) mmfut<- ks$y ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = xfut) ssfut<- sqrt(ks2$y-mmfut^2) ee<- kresid(x,y,b,stud=TRUE) ee<- ee-mean(ee) if (model==1){ee<- pred.resid(x,y,b,stud=TRUE)} # used to be called: te # Bootstrap part: if (model<2){ # i.e., MB or MF/MB for (i in 1:B) { eboot<- sample(ee,size=n, replace=TRUE ) yboot<- mm+ss*eboot # Ystar data pseudoresp[i,]<- mmfut + ssfut* sample(ee, size=pp, replace=TRUE) # Yfut star ks <- ksmooth(x,yboot,"normal",ban= b,x.points = xfut) ks2 <- ksmooth(x,yboot^2,"normal",ban= b,x.points = xfut) sboot<- sqrt(ks2$y-ks$y^2) pred.df[i,]<- pseudoresp[i,]- ks$y + mmfut }} if (model >=2){ # i.e., MF2 or MF3 if (model==2){ delete<- FALSE } if (model==3){ delete<- TRUE} u<- getU(x,y,b,delete, edgecorrect=TRUE) for (kk in 1:pp) { PI<- MFpp(x,y, xfut[kk], ban=b, box=FALSE, L2=TRUE, fusion) # NW estimator: #PI=ksmooth(x,y,"normal",ban= b,x.points = xfut[kk])$y Ystar<- u*0 # here we need xfut to be UNIVARIATE #update: xfut doesn't have to be univariate. wf<- dnorm((x-xfut[kk]), mean = 0, sd = (0.37*b)) for (j in 1:B) { if (fusion==0) { uboot<- sample(u,size=n, replace=TRUE) } else { uboot<-runif(n) } for (i in 1:n){ # this computes the Ystar data w<- dnorm((x-x[i]), mean = 0, sd = (0.37*b)) if (fusion==0) { Ystar[i]<- uniformize(y, uboot[i], inverse=TRUE, w=w) Yf<- uniformize(y, sample(u,1), inverse=TRUE, w=wf) } else { # inverse.cdf <- function(x, prob, w) Ystar[i] <- inverse.cdf(y, uboot[i], w) Yf <- inverse.cdf(y, runif(1), wf) } } bootroot<- Yf-MFpp(x,Ystar, xfut[kk], ban=b, box=FALSE, L2=TRUE, fusion) # NW estimator: #bootroot<- Yf-ksmooth(x,Ystar,"normal",ban=b,x.points = xfut[kk])$y pred.df[j,kk]<- bootroot + PI } } } # end of model >=2 interval=rep(0,2*pp) # remove alpha #alpha=0.9 for (kk in 1:pp) { interval[c(2*kk-1,2*kk)]=quantile(pred.df[,kk],probs=c(1/2-alpha/2, 1/2+alpha/2),na.rm=TRUE) } return(interval) } ################################### pred.intNORM<- function(x,y,xfut, alpha=0.9){ # construct a (1-alpha)100% predictive interval for yfut (to be observed at xfut) # based on NORMAL approximation as in MF2 paper # Now pred.intNORM has the same type output as pred.intMF3 when xfut is multivariate # Minor changes on alpha, qq bw<- crossval(x,y, L1=T) ks <- ksmooth(x, y, "normal", ban = bw,x.points=xfut) mfut<- ks$y # this is the point predictor/estimate of regression vfut<- mfut*0 pp <- length(xfut) #qq<- matrix(0,12,pp) qq=qnorm(c(1/2-alpha/2,1/2+alpha/2)) interval=NA interval.temp=matrix(0,pp,2) for(j in 1:pp){ #qq[,j]<- qnorm(c(2.5, 5, 10,40,45,47.5, 52.5, 55,60, 90,95,97.5)/100) # these are the 12 normal quantiles vfut[j]<- sqrt( ksmooth.variance(x,y,bw, xfut[j], pred=T)) # pred=F for conf interval.temp[j,]<- mfut[j]+qq*vfut[j] } for (kk in 1:pp){ interval[c(2*kk-1,2*kk)]=interval.temp[kk,] } return(interval) } ############################################ conf.intNORM<- function(x,y,xfut, alpha=0.9){ #JUNE 2012 # construct a (1-alpha)100% conf interval for E yfut (to be observed at xfut) # based on NORMAL approximation as in MF2 paper # Now conf.intNORM has the same type output as conf.intMF3multi when xfut is multivariate # Minor changes on alpha, qq bw<- crossval(x,y, L1=T) ks <- ksmooth(x, y, "normal", ban = bw,x.points=xfut) mfut<- ks$y # this is the point predictor/estimate of regression vfut<- mfut*0 pp <- length(xfut) #qq<- matrix(0,12,pp) qq=qnorm(c(1/2-alpha/2,1/2+alpha/2)) interval=NA interval.temp=matrix(0,pp,2) for(j in 1:pp){ #qq[,j]<- qnorm(c(2.5, 5, 10,40,45,47.5, 52.5, 55,60, 90,95,97.5)/100) # these are the 12 normal quantiles vfut[j]<- sqrt( ksmooth.variance(x,y,bw, xfut[j], pred=F)) # pred=F for conf interval.temp[j,]<- mfut[j]+qq*vfut[j] } for (kk in 1:pp){ interval[c(2*kk-1,2*kk)]=interval.temp[kk,] } return(interval) } # DONE June 2012 ############################ conf.intMF3multi<- function(x,y,xfut, fusion=0, model=0, B=999, alpha=0.9){ # JUNE 2012 but based on pred.intMF3multi of Nov 2011 # gives key quantiles of the condifence distribution for m(xfut) # to be observed at the value xfut using L2 predictors # based on ROOT method of MF3 paper Sections 3 and 4 # options: model =0 means MB; model=1 means MF/MB; # model =2 means uniformize by Lin.Interp. (POLYGON) # model=3 means uniformize by kernel smoothed # xfut can be multivariate for ALL models (even 2 and 3) # Cases MF2 and MF/MF2 are rather slow due to double loop # (triple loop if xfut is multivariate) # xfut can be multivariate even if fusion =0 or 1 # fusion=1 only works for model=2 and 3. Please do not let fusion=1 when model<2 # when fusion=0, model=2 is MF, model=3 is PMF; when fusion=1, both model=2 and 3 are LMF (no difference) # Now when xfut is multivariate, e.g., xfut has k elements, then the output is a sequence with 2k elements. # for ith value of xfut, [(2i-1)th,(2i)th] values of output is the corresponding confidence interval. # add parameter alpha (level) # If using NW estimator instead of PI, please search "NW estimator" and change it manually. # PMF with NW estimator might produce many NA values (caused by ksmooth function) n<- length(x) pp<- length(xfut) # if (pp>1 & model==2) warning("need univariate xfut for MF2 option") pseudoresp<- matrix(0,B,pp) pred.df<- matrix(0,B,pp) # each column is an empirical pred.d.f. for Yfut[j] at xfut[j] b<- crossval(x,y, L1=T) # L1 cross validation # b<- crossvalMF(x,y, L1=T) # is an alternative but indistinguishable from above ks <- ksmooth(x,y,"normal",ban= b,x.points = x) # Get a regression estimate for y vs. x. ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = x) mm<- ks$y ss<- sqrt(ks2$y-mm^2) ks<- ksmooth(x,y,"normal",ban= b,x.points = xfut) mmfut<- ks$y # this is the point predictor/regression estimate at xfut ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = xfut) ssfut<- sqrt(ks2$y-mmfut^2) ee<- kresid(x,y,b,stud=TRUE) ee<- ee-mean(ee) if (model==1){ee<- pred.resid(x,y,b,stud=TRUE)} # used to be called: te # Bootstrap part: if (model<2){ # i.e., MB or MF/MB for (i in 1:B) { eboot<- sample(ee,size=n, replace=TRUE ) yboot<- mm+ss*eboot # Ystar data pseudoresp[i,]<- mmfut # + ssfut* sample(ee, size=pp, replace=TRUE) # Yfut star # the above is for confidence intervals so don't add random error ks2 <- ksmooth(x,yboot^2,"normal",ban= b,x.points = xfut) ks <- ksmooth(x,yboot,"normal",ban= b,x.points = xfut) sboot<- sqrt(ks2$y-ks$y^2) pred.df[i,]<- pseudoresp[i,]- ks$y + mmfut }} # end of model < 2 if (model >=2){ # i.e., MF2 or MF3 delete <- FALSE if (model==2){ uniformizeCC <- uniformize.kernel MFppCC<- MFpp } # MFppDISTRCC <- MFppDISTR if (model==3){ uniformizeCC <- uniformize.kernel delete <- TRUE # MFppCC<- MFpp.kernel MFppCC<- MFpp } # MFppDISTRCC <- MFppDISTR.kernel u<- getU(x,y,b,delete, edgecorrect=TRUE) # above line not really EEDED for (kk in 1:pp){ PI<- MFppCC(x,y, xfut[kk], ban=b, box=FALSE, L2=TRUE, fusion) # NW estimator: #PI=ksmooth(x,y,"normal",ban= b,x.points = xfut[kk])$y Ystar<- u*0 # here we need xfut to be UNIVARIATE wf<- dnorm((x-xfut[kk]), mean = 0, sd = (0.37*b)) for (j in 1:B) { #uboot<- sample(u,size=n, replace=TRUE) ################################### if (fusion==0) { uboot<- sample(u,size=n, replace=TRUE) } else { uboot<-runif(n) } for (i in 1:n){ # this computes the Ystar data w<- dnorm((x-x[i]), mean = 0, sd = (0.37*b)) # Algorithm of Ystar are different among LMF, MF,PMF if (fusion==0){ Ystar[i] <- uniformizeCC(y, uboot[i], inverse=TRUE, w=w) } else { Ystar[i] <- inverse.cdf(y,uboot[i],w) } } # NO: Yf<- uniformizeCC(y, sample(u,1), inverse=TRUE, w=wf) bootroot<- PI -MFpp_new(x,Ystar, xfut[kk], uboot, ban=b, box=FALSE, L2=TRUE, fusion) # NW estimator: #bootroot<- PI-ksmooth(x,Ystar,"normal",ban=b,x.points = xfut[kk])$y pred.df[j,kk]<- bootroot + PI } } } # end of model >=2 ################################### interval=rep(0,2*pp) # remove alpha #alpha=0.9 for (kk in 1:pp) { interval[c(2*kk-1,2*kk)]=quantile(pred.df[,kk],probs=c(1/2-alpha/2, 1/2+alpha/2),na.rm=TRUE) # interval[2*kk-1]=quantile(pred.df[,kk],probs=1/2-alpha/2) # interval[2*kk]=quantile(pred.df[,kk],probs=1/2+alpha/2) } return(interval) } ############################################ ############################################ ## \int K^2 =0.761 for normal kernel # \int K^2 =0.4857 for triangle # and \int K^2 =1 for box (in ksmooth) ################################## ksmooth.variance<- function(xx, yy = xx, b, xfut, pred = F){ # NEW FOR DEC 2012 # get estimate of variance of kernel smoothed regression at the point: xfut # xfut can be a vector n<- length(xx) m<- length(xfut) y.ones<- c(1:n)/ c(1:n) dens<- xfut*0 for (k in 1: m){ dens[k] <- sum(y.ones[ abs( xx-xfut[k]) < b/2])/(n*b) } # to give density of x-points at vector xfut NEW! ks <- ksmooth(xx, yy, "normal", ban = b, x.points=xx) r <- yy - ks$y # residuals VV <- (var(r) * 0.761)/(n*b*dens) if(pred == T) { VV <- var(r) * (1 + 0.761/(n*b*dens)) } #pred=T gives the variance for a predictive interval at vector: xfut VV } # CORRECTED: DECEMBER 2012 ####################################################### pred.resid<- function(x, y, b, stud = F, edgecorrect=TRUE){ # gets delete-1 predictive residuals for use in fast cross validation # stud=T implies diving by local (predictive) standard deviation as in MF2 paper mat<- cbind(x,y) matt<- mat[order(mat[,1]), 1:2] x<- matt[,1] y<- matt[,2] # re-order data so that x is sorted # because ksmooth needs it n <- length(x) if(stud == F) { rr <- c(1:n) * 0 for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[k]) rr[k] <- y[k] - kss$y} xx <- x[2:n] yy <- y[2:n] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[1]) rr[1] <- y[1] - kss$y xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[n]) rr[n] <- y[n] - kss$y} if(stud == T) { rr <- c(1:n) * 0 for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[k]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[k]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[k] <- (y[k] - kss$y)/ss } xx <- x[2:n] yy <- y[2:n] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[1]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[1]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[1] <- (y[1] - kss$y)/ss xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[n]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[n]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[n] <- (y[n] - kss$y)/ss } if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) rr[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ]<- NA*rr[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ] } na.rm(rr)} ####################################################################### kresid<-function(x,y,b,stud = FALSE, edgecorrect=TRUE){ # calculates residuals from ksmooth with bandwidth b # stud=TRUE implies diving by local (predictive) # standard deviation as in MF2 paper mat<- cbind(x,y) matt<- mat[order(mat[,1]), 1:2] x<- matt[,1] y<- matt[,2] # re-order data so that x is sorted # because ksmooth needs it n <- length(x) ks <- ksmooth(x,y,"normal",ban=b,x.points=x ) r<- y-ks$y if (stud==TRUE){ ks2 <- ksmooth(x,y^2,"normal",ban=b,x.points=x ) ss<- y*0 vv<- y*0 aa<- na.rm(ks2$y - ks$y^2) nn<- length(aa) vv[1:nn]<- aa[1:nn] if ( min(vv)>= 0 ) { ss<- sqrt(vv)} while (min(ss)<= 0) {ss<- ss+0.001*sqrt(var(y)) } r<- r/ss} if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) r[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ]<- NA*r[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ] } na.rm(r)} ############################################################## getU<- function(x,y,ban=0, delete=FALSE, edgecorrect=TRUE){ # get the transformed data in MF2 using normal kernel # delete=F gives the regular u's of eq. (35) # delete=T gives the predictive u's from the delete-1 data--eq. (39) # if edgecorrect=TRUE, leave out the u's that are too close to the edges n<- length(x) if (ban==0) {ban<- crossval(x,y)} u<- x if (delete==FALSE){ for (i in 1:n){ # this computes the u_t of eq. (35) w<- dnorm((x-x[i]), mean = 0, sd = (0.37*ban)) u[i]<- uniformize(y, y[i], inverse=FALSE, w=w) } } if (delete==TRUE){ for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) w<- dnorm((xx-x[k]), mean = 0, sd = (0.37*ban)) u[k]<- uniformize(yy, y[k], inverse=FALSE, w=w) } xx <- x[2:n] yy <- y[2:n] w<- dnorm((xx-x[1]), mean = 0, sd = (0.37*ban)) u[1]<- uniformize(yy, y[1], inverse=FALSE, w=w) xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] w<- dnorm((xx-x[n]), mean = 0, sd = (0.37*ban)) u[n]<- uniformize(yy, y[n], inverse=FALSE, w=w) } # end if delete==TRUE if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) u[ abs(x-xmin) < ban/2 | abs(x-xmax) < ban/2 ]<- NA*u[ abs(x-xmin) < ban/2 | abs(x-xmax) < ban/2 ] } na.rm(u)} ############################################################## ####################################################################### ########### END OF MAIN FUNCTIONS ##################################### ####################################################################### #################################################################### # #### Examples #### # #The piecewise linear edf # x <- sort(rnorm(10)) # x <- c(min(x), x) # w <- c(rep(1,10),2) # w <- w/sum(w) # #plot(ecdf(x)) # plot(stepfun(x, cumsum(c(0,w)))) # tmp <- seq(min(x)-.4,max(x)+.4, .01) # lines(tmp, uniformizeSPlus(x, tmp, w=w), col="red") # # #The inverse of the piecewise linear edf # #Two lines are added to the plot produced above # #which demonstrate Finv(.75) is the point on the fitted # #function. # abline(v=uniformize(x, .75, inverse=TRUE, w=w), col="blue") # abline(h=.75, col="blue")