################################################################ # R functions used to implement the MB, MF/MB, MF2 and MF/MF2 procedures # from paper "Model-free model-fitting and predictive distributions" # by D.N. Politis (March 2010--revised April 2012). ############################################################ # To begin, choose the type of uniformize from the three functions # given at the end of this document, e.g. uniformize <- uniformize.kernel # or: uniformize <- uniformize.linear2 ##################################################################### MFpp<- function(x,Y,xf,ban, box=FALSE, L2=TRUE, delete=FALSE, edgecorrect=TRUE) { # returns MF2 point predictor (L1 or L2) of Yf at xf (as in Table 4.1) # using box or normal kernel (default) for cdf estimation # edgecorrect=TRUE implies omitting the u's close to the edges # as in Remark 4.8 of the paper YYY<- MFppDISTR(x,Y,xf,ban, box, delete, edgecorrect) Yf<- mean(YYY, na.rm=TRUE) if (L2==FALSE) {Yf<- median(YYY, na.rm=TRUE)} Yf } ############################################################ MFppDISTR<- function(x,Y,xf,ban, box=FALSE,delete=FALSE, edgecorrect=TRUE) { # returns MF2 naive predictive distr. of Yf at xf (to take mean or median of) # (x,Y) are data from regression; ban=bandwidth to use at xf # xf is a SINGLE number; box=TRUE means rectangular kernel (the default is Normal) # delete=TRUE only works for normal kernel here. n<- length(x) u<- c(1:n) if (box==TRUE){ # rectangular (Box) kernel in uniformize for (i in 1:n){ # this computes the u_t of eq. (35) yw<- Y[ abs(x-x[i])< ban] # w=constant, Rectangular Window u[i]<- uniformize(yw, Y[i], inverse=FALSE) } # qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", # ylab="quantiles of Uniform (0,1)" ) u<- na.rm(u) Yf<-uniformize(Y[ abs(x-xf)< ban], u, inverse=TRUE) } if (box==FALSE){ # if not rectangular, then use normal kernel ## with IQ range =(-ban/4 ,ban/4)*stdev as Box, i.e., stdev=ban/2.7=0.37*ban #for (i in 1:n){ # this computes the u_t of eq. (35) # w<- dnorm((x-x[i]), mean = 0, sd = (0.37*ban)) # u[i]<- uniformize(Y, Y[i], inverse=FALSE, w=w) } ## qqplot(u, c(1:length(u))/length(u), xlab="quantiles of transformed data u_t", ## ylab="quantiles of Uniform (0,1)" ) u<- getU(x,Y,ban,delete, edgecorrect) w<- dnorm((x-xf), mean = 0, sd = (0.37*ban)) Yf<-uniformize(Y, u, inverse=TRUE, w=w) } Yf } ######################################################### crossvalMF<- function(x,y, nstep=20,bmax=0, box=FALSE, L1=TRUE){ # finds the best bandwidth for MPpp by cross-validation # searches for bandwidths from zero to bmax # uses criterion L2 or L1 (default) # nstep is number of grid points for bandwidth search # box=FALSE means using Normal kernel for smoothing # IN VIEW OF THE CLOSENESS OF MFpp TO A KERNEL SMOOTHER THIS GIVES # ALMOST IDENTICAL ANSWERS AS REGULAR crossval WHICH IS MUCH FASTER if (bmax==0){bmax<- 1.5*sqrt(var(x))} bstep<- bmax/nstep n<- length(x) SSE<- c(1:nstep)*0 if(L1==FALSE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) # xx,yy are delete-k data Yf<- MFpp(xx,yy,x[k],ban=(i*bstep),box=FALSE,L2=TRUE) SSE[i]<- SSE[i]+(y[k]-Yf )^2 }}} if(L1==TRUE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) # xx,yy are delete-k data Yf<- MFpp(xx,yy,x[k],ban=(i*bstep),box=FALSE,L2=FALSE) SSE[i]<- SSE[i]+abs(y[k]-Yf ) }} } k<- c(1:nstep)*bstep k<- k[SSE==min(SSE,na.rm = TRUE)] na.rm(k)} ########################################################### na.rm<-function(x) { x[!is.na(x)] } ############################################################ crossval<- function(x,y, nstep=20,bmax=0, L1=TRUE){ # finds the best bandwidth for ksmooth by cross-validation # searches for bandwidts from zero to bmax # uses criterion L1 or L2 (default) # nstep is number of grid points for bandwidth search # stud=TRUE means studentize by kernel smoothed estimate of variance # (with same bandwidth) if (bmax==0){bmax<- 1.5*sqrt(var(x))} bstep<- bmax/nstep n<- length(x) SSE<- c(1:nstep)*0 if(L1==FALSE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) kss<- ksmooth(xx,yy,"normal",ban=(i*bstep),x.points=x[k]) # rr<- kresid(xx,yy, (i*bstep)) SSE[i]<- SSE[i]+(y[k]-kss$y )^2 }}} if(L1==TRUE){ for (i in 1:nstep){ for (k in 2:(n-1)){ xx<- c(x[1:(k-1)], x[(k+1):n]) yy<- c(y[1:(k-1)], y[(k+1):n]) kss<- ksmooth(xx,yy,"normal",ban=(i*bstep),x.points=x[k]) # rr<- kresid(xx,yy, (i*bstep)) SSE[i]<- SSE[i]+ abs(y[k]-kss$y ) }}} k<- c(1:nstep)*bstep k<- k[SSE==min(SSE,na.rm = TRUE)] na.rm(k)} ########################################################## pred.intMF3<- function(x,y,xfut, model=0, B=999){ # gives key quantiles of the predictive distribution of yfut # to be observed at the value xfut using L2 predictors # based on ROOT method of MF3 paper Sections 3 and 4 # options: model =0 means MB; model=1 means MF/MB; # model =2 means MF2; and model=3 means MF/MF2 (MF3!). # xfut can be multivariate if model =0 or 1 only. # Cases MF2 and MF/MF2 are rather slow due to double loop n<- length(x) pp<- length(xfut) if (pp>1 & model==2) warning("need univariate xfut for MF2 option") pseudoresp<- matrix(0,B,pp) pred.df<- matrix(0,B,pp) # each column is an empirical pred.d.f. for Yfut[j] at xfut[j] b<- crossval(x,y, L1=T) # L1 cross validation # b<- crossvalMF(x,y, L1=T) # is an alternative but indistinguishable from above ks <- ksmooth(x,y,"normal",ban= b,x.points = x) # Get a regression estimate for y vs. x. ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = x) mm<- ks$y ss<- sqrt(ks2$y-mm^2) ks<- ksmooth(x,y,"normal",ban= b,x.points = xfut) mmfut<- ks$y ks2 <- ksmooth(x,y^2,"normal",ban= b,x.points = xfut) ssfut<- sqrt(ks2$y-mmfut^2) ee<- kresid(x,y,b,stud=TRUE) ee<- ee-mean(ee) if (model==1){ee<- pred.resid(x,y,b,stud=TRUE)} # used to be called: te # Bootstrap part: if (model<2){ # i.e., MB or MF/MB for (i in 1:B) { eboot<- sample(ee,size=n, replace=TRUE ) yboot<- mm+ss*eboot # Ystar data pseudoresp[i,]<- mmfut + ssfut* sample(ee, size=pp, replace=TRUE) # Yfut star ks <- ksmooth(x,yboot,"normal",ban= b,x.points = xfut) ks2 <- ksmooth(x,yboot^2,"normal",ban= b,x.points = xfut) sboot<- sqrt(ks2$y-ks$y^2) pred.df[i,]<- pseudoresp[i,]- ks$y + mmfut }} if (model >=2){ # i.e., MF2 or MF3 if (model==2){ delete<- FALSE } if (model==3){ delete<- TRUE} u<- getU(x,y,b,delete, edgecorrect=TRUE) PI<- MFpp(x,y, xfut, ban=b, box=FALSE, L2=TRUE, delete, edgecorrect=TRUE) Ystar<- u # here we need xfut to be UNIVARIATE wf<- dnorm((x-xfut), mean = 0, sd = (0.37*b)) for (j in 1:B) { uboot<- sample(u,size=n, replace=TRUE) for (i in 1:n){ # this computes the Ystar data w<- dnorm((x-x[i]), mean = 0, sd = (0.37*b)) Ystar[i]<- uniformize(y, uboot[i], inverse=TRUE, w=w) } Yf<- uniformize(y, sample(u,1), inverse=TRUE, w=wf) bootroot<- Yf-MFpp(x,Ystar, xfut, ban=b, box=FALSE, L2=TRUE, delete, edgecorrect=TRUE) pred.df[j,]<- bootroot + PI } } # pred.df #this matrix has columns=empirical pred.distributions # quantile(pred.df, probs=c(alpha/2, 1-alpha/2)) qq<- matrix(0,12,pp) for(j in 1:pp){ qq[,j]<- quantile(pred.df[,j], probs=c(2.5, 5, 10,40,45,47.5, 52.5, 55,60, 90,95,97.5)/100, na.rm=TRUE) # output is 12 quantiles of the predictive distribution of Yfut # e.g. 90% pred.int. for j-th column is (qq[2,j],qq[11,j]) } qq} #END ################################### pred.intNORM<- function(x,y,xfut, alpha=0.05){ # construct a (1-alpha)100% predictive interval for yfut (to be observed at xfut) # based on NORMAL approximation as in MF2 paper bw<- crossval(x,y, L1=T) ks <- ksmooth(x, y, "normal", ban = bw,x.points=xfut) mfut<- ks$y vfut<- mfut*0 pp <- length(xfut) qq<- matrix(0,12,pp) for(j in 1:pp){ qq[,j]<- qnorm(c(2.5, 5, 10,40,45,47.5, 52.5, 55,60, 90,95,97.5)/100) # these are the 12 normal quantiles vfut[j]<- sqrt( ksmooth.variance(x,y,bw, xfut[j], pred=T)) qq[,j]<- mfut[j]+qq[,j]*vfut[j] } qq} ############################################ ## \int K^2 =0.761 for normal kernel # \int K^2 =0.4857 for triangle # and \int K^2 =1 for box (in ksmooth) ################################## ksmooth.variance<- function(xx, yy = xx, b, xfut, pred = F){ # get estimate of variance of kernel smoothed regression at point: xfut ynull<- c(1:length(xx))/c(1:length(xx)) # to give density ks1 <- ksmooth( xx, ynull, "normal", bandwidth = b, x.points = xfut) dens <- ks1$y #is the value of the density of the x-points at point: xfut ks <- ksmooth(xx, yy, "normal", ban = b, x.points=xx) r <- yy - ks$y # residuals VV <- (var(r) * 0.761)/(b * length(xx)) if(pred == T) { VV <- var(r) * (1 + 0.761/(b * length(xx))) } #pred=T gives the variance for a predictive interval at point: xfut VV } ####################################################### pred.resid<- function(x, y, b, stud = F, edgecorrect=TRUE){ # gets delete-1 predictive residuals for use in fast cross validation # stud=T implies diving by local (predictive) standard deviation as in MF2 paper mat<- cbind(x,y) matt<- mat[order(mat[,1]), 1:2] x<- matt[,1] y<- matt[,2] # re-order data so that x is sorted # because ksmooth needs it n <- length(x) if(stud == F) { rr <- c(1:n) * 0 for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[k]) rr[k] <- y[k] - kss$y} xx <- x[2:n] yy <- y[2:n] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[1]) rr[1] <- y[1] - kss$y xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[n]) rr[n] <- y[n] - kss$y} if(stud == T) { rr <- c(1:n) * 0 for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[k]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[k]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[k] <- (y[k] - kss$y)/ss } xx <- x[2:n] yy <- y[2:n] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[1]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[1]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[1] <- (y[1] - kss$y)/ss xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] kss <- ksmooth(xx, yy, "normal", ban = b, x.points = x[n]) kss2 <- ksmooth(xx, yy^2, "normal", ban = b, x.points = x[n]) vv<- max(c(0,kss2$y - kss$y^2), na.rm=TRUE) ss<- vv*0 if ( vv> 0 ) { ss<- sqrt(vv)} while (ss<= 0) {ss<- ss+0.001*sqrt(var(yy)) } rr[n] <- (y[n] - kss$y)/ss } if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) rr[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ]<- NA*rr[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ] } na.rm(rr)} ####################################################################### kresid<-function(x,y,b,stud = FALSE, edgecorrect=TRUE){ # calculates residuals from ksmooth with bandwidth b # stud=TRUE implies diving by local (predictive) # standard deviation as in MF2 paper mat<- cbind(x,y) matt<- mat[order(mat[,1]), 1:2] x<- matt[,1] y<- matt[,2] # re-order data so that x is sorted # because ksmooth needs it n <- length(x) ks <- ksmooth(x,y,"normal",ban=b,x.points=x ) r<- y-ks$y if (stud==TRUE){ ks2 <- ksmooth(x,y^2,"normal",ban=b,x.points=x ) ss<- y*0 vv<- y*0 aa<- na.rm(ks2$y - ks$y^2) nn<- length(aa) vv[1:nn]<- aa[1:nn] if ( min(vv)>= 0 ) { ss<- sqrt(vv)} while (min(ss)<= 0) {ss<- ss+0.001*sqrt(var(y)) } r<- r/ss} if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) r[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ]<- NA*r[ abs(x-xmin) < b/2 | abs(x-xmax) < b/2 ] } na.rm(r)} ############################################################## getU<- function(x,y,ban=0, delete=FALSE, edgecorrect=TRUE){ # get the transformed data in MF2 using normal kernel # delete=F gives the regular u's of eq. (35) # delete=T gives the predictive u's from the delete-1 data--eq. (39) # if edgecorrect=TRUE, leave out the u's that are too close to the edges n<- length(x) if (ban==0) {ban<- crossval(x,y)} u<- x if (delete==FALSE){ for (i in 1:n){ # this computes the u_t of eq. (35) w<- dnorm((x-x[i]), mean = 0, sd = (0.37*ban)) u[i]<- uniformize(y, y[i], inverse=FALSE, w=w) } } if (delete==TRUE){ for(k in 2:(n - 1)) { xx <- c(x[1:(k - 1)], x[(k + 1):n]) yy <- c(y[1:(k - 1)], y[(k + 1):n]) w<- dnorm((xx-x[k]), mean = 0, sd = (0.37*ban)) u[k]<- uniformize(yy, y[k], inverse=FALSE, w=w) } xx <- x[2:n] yy <- y[2:n] w<- dnorm((xx-x[1]), mean = 0, sd = (0.37*ban)) u[1]<- uniformize(yy, y[1], inverse=FALSE, w=w) xx <- x[1:(n - 1)] yy <- y[1:(n - 1)] w<- dnorm((xx-x[n]), mean = 0, sd = (0.37*ban)) u[n]<- uniformize(yy, y[n], inverse=FALSE, w=w) } # end if delete==TRUE if (edgecorrect==TRUE){ xmin<- min(x, na.rm=T) xmax<- max(x, na.rm=T) u[ abs(x-xmin) < ban/2 | abs(x-xmax) < ban/2 ]<- NA*u[ abs(x-xmin) < ban/2 | abs(x-xmax) < ban/2 ] } na.rm(u)} ####################################################################### ########### END OF MAIN FUNCTIONS ##################################### ####################################################################### ####### BELOW ARE THREE OPTIONS FOR THE uniformize FUNCTION ########### ####################################################################### uniformize.linear <- function(x, x0, inverse=FALSE, w) { # MARCH 2010 # this function implements transformation (35) and its inverse # using a piecewise linear estimate of local cdf # IF inverse=FALSE, then x is really the Y data in regression # IF inverse=TRUE, then x is actually the x variable # x0 can be vector! n <- length(x) if (n <= 1) return(NA) # ("sample size too small!") if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 midpoints <- c(2*x[1] - midpoints[1], midpoints, 2*x[n] - midpoints[n-1]) if(inverse) { Finv <- approxfun(w, midpoints) return(Finv(x0)) } else { Fmid <- approxfun(midpoints, w) F <- function(t) ifelse(t <= midpoints[1], 0, ifelse(t >= midpoints[n+1], 1, Fmid(t))) return(F(x0)) } } ############################################################################ # BELOW IS A NEW VERSION OF THE uniformize R FUNCTION (Oct. 2011) uniformize.linear2 <- function(x, x0, inverse=FALSE, w) { # October 2011 # this function implements transformation (35) and its inverse # using a piecewise linear estimate of local cdf # IF inverse=FALSE, then x is really the Y data in regression # IF inverse=TRUE, then x is actually the x variable # x0 can be vector! n <- length(x) if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 lwr <- min(2* midpoints[1] - midpoints[2], x[1] - (midpoints[1]-x[1])) upr <- max(2* midpoints[n-1] - midpoints[n-2], x[n] + (x[n] - midpoints[n-1])) midpoints <- c(lwr, midpoints, upr) if(inverse) { Finv <- approxfun(w, midpoints) return(Finv(x0)) } else { Fmid <- approxfun(midpoints, w) F <- function(t) ifelse(t <= midpoints[1], 0, ifelse(t >= midpoints[n+1], 1, Fmid(t))) return(F(x0)) } } ################################################################## uniformizeSPlus <- function(x, x0, inverse=FALSE, w) { # Splus version of uniformize.linear n <- length(x) if(missing(w)) { x <- sort(x) w <- (0:n)/n } else { if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } ord <- order(x) x <- x[ord] w <- cumsum(c(0,w[ord])) } midpoints <- (x[-n] + x[-1])/2 lwr <- min(2* midpoints[1] - midpoints[2], x[1] - (midpoints[1]-x[1])) upr <- max(2* midpoints[n-1] - midpoints[n-2], x[n] + (x[n] - midpoints[n-1])) midpoints <- c(lwr, midpoints, upr) if(inverse) { return(approx(w, midpoints, x0)) } else { F <- function(t) { ret <- numeric(length(t)) zeros <- (t <= midpoints[1]) ones <- (t >= midpoints[n+1]) mid <- ((t > midpoints[1]) & (t < midpoints[n+1])) ret[mid] <- approx(midpoints, w, t[mid])$y ret[ones] <- 1 ret[zeros] <- 0 return(ret) } return(F(x0)) } } ################################################################# uniformize.kernel <- function(x, x0, inverse=FALSE, w) { # APRIL 2012 # New R function that does uniformize on the basis # of a kernel smoothed edf (need kernel > 0 everywhere # to ensure the resulting edf is invertible) n <- length(x) if (n <= 1) return(NA) # ("sample size too small!") if(missing(w)) { w <- c(1:n)*0+1/n } if(any(w < 0)) warning("Negative weight encountered.") if(sum(w) != 1) { w <- w/sum(w) warning("Weights do not sum to 1. Renormalizing.") } if ( is.na(sum(x)) ) { w<- w[!is.na(x)] w <- w/sum(w) x<- x[!is.na(x)] warning("missing values in x") } # first compute the smoothed edf, i.e., the inverse=FALSE case ppdf<- density(x, weights = w ) # default uses gaussian kernel npdf<- length(ppdf$x) delta<- ppdf$x[2]-ppdf$x[1] pcdf<- cumsum(ppdf$y)*delta if(inverse) { FFinv <- approxfun(pcdf, ppdf$x) return(FFinv(x0)) #In the inverse case x0 denotes the vector of y values to which FFinv is applied } else { FF <- approxfun(ppdf$x, pcdf) return(FF(x0)) #Here x0 denotes the vector of x values to which FF is applied }} #################################################################### ####################### END ######################################## #################################################################### #### Examples #### #The piecewise linear edf x <- sort(rnorm(10)) x <- c(min(x), x) w <- c(rep(1,10),2) w <- w/sum(w) #plot(ecdf(x)) plot(stepfun(x, cumsum(c(0,w)))) tmp <- seq(min(x)-.4,max(x)+.4, .01) lines(tmp, uniformizeSPlus(x, tmp, w=w), col="red") #The inverse of the piecewise linear edf #Two lines are added to the plot produced above #which demonstrate Finv(.75) is the point on the fitted #function. abline(v=uniformize(x, .75, inverse=TRUE, w=w), col="blue") abline(h=.75, col="blue")