MIP/DESCRIPTION00006660000000212013246134256006704 0ustar00Package: MIP Type: Package Title: Multiple Influential Point Detection Version: 2.0 Date: 2018-2-1 Author: Chao Liu Maintainer: Lu Niu Description: By explicitly taking into account the covariance structure of Y and the idea of random group deletion, we propose a novel procedure named MIP, short for multiple influential point detection for high-dimensional data. Along the process, we propose two novel quantities named Max and Min statistics to assess the extremeness of each point when data are sub-sampled. The Min statistic is useful for overcoming the swamping effect but less effective for masked influential observations, while the Max statistic is well suited for detecting masked influential observations but is less effective in handling the swamping effect. Combining their advantages, we propose a computationally efficient yet simple Min-Max algorithm for obtaining a clean subset of the data that contains no influential points. License: GPL LazyLoad: yes Depends: NeedsCompilation: no Packaged: 2018-03-02 02:36:30 UTC; niulu MIP/NAMESPACE00006660000000015713246134254006423 0ustar00importFrom(stats,cor) importFrom(stats,mad) importFrom(stats,median) importFrom(stats,pchisq) export(MIP) MIP/R/00007770000000000013246134254005402 5ustar00MIP/R/MIP.r00006660000001604513246134254006220 0ustar00library(Matrix) library(MASS) ############################### ##main function ##MIP: function to detect multiple influential points ##Usage: MIP(X,Y,n,p,q,n_subset,subset_vol,ep=0.1,alpha) ##input: # X: the data of predictors with dimension n by p # Y: the data of response with dimension n by q # n: the sample size # p: the dimension of predictor # q: the dimension of response # n_subset: the number of subsets chosen at random to compute the Min and Max statistics # subset_vol: the samples size in each subset # ep: the proportion of maximum number of rejected null hypothesis in the Min-step. 
# The default value is set at 0.1. # alpha: significance level used in FDR procedure ##output #inf_setfinal: the indices of the influential points detected by MIP algorithm #################################################### ####explanation for the functions used ########################################################## ##fun_pv: function to compute the max-statistics and the min-statistics ##Usage: fun_pv(X,Y,n,p,q,n_subset,subset_vol,clean_setv) ##input: #clean_setv: the estimated clean set obtained by Min/Max step ##output: #Tmax: the values of the max-statistics #Tmin: the values of the min-statistics ########################################################## ########################################################## ##fun_masking: function to detect the influential points using the max-statistics ##Usage:fun_masking(X,Y,n,p,q,n_subset,subset_vol,clean_setv,alpha) ##input: #clean_setv: an input value of estimated clean set obtained by Min/Max step ##output: #clean_set: the indices of the estimated clean set obtained by Max-step ########################################################## ########################################################## ##fun_swamping: function to detect the influential points by using the min-statistics ##Usage: fun_swamping(X,Y,n,p,q,n_subset,subset_vol,clean_setv,ep=0.1,alpha) ##input: #clean_setv: an input value of estimated clean set obtained by Min/Max step ##output: #clean_setv: the indices of the estimated clean set obtained by the Min-step (obtained by the min-statistics) ########################################################## ########################################################## ##fun_checking: function for checking whether there are non-influential points being identified as influential ones ##Usage: fun_checking(X,Y,n,p,q,inf_t,clean_t,alpha) ##input: #inf_t: the estimated indices of influential points found by Min-Max algorithm #clean_t: the estimated indices of clean points found by Min-Max algorithm ##output: 
#inf_setfinal: the estimated indices of influential points obtained by MIP algorithm, after applying the checking algorithm to the potential influential point inf_t. ########################################################## fun_pv=function(X,Y,n,p,q,n_subset,subset_vol,clean_setv) { rob_sd=apply(X,2,mad) X=X-rep(1,n)%o%apply(X,2,median); X=X%*%diag(1/rob_sd); if (q==1) {Y=(Y-median(Y))/mad(Y)} if (q>1) { y_sig=apply(Y,1,mad) Y_sigma_D=diag(y_sig,q,q) Y_sigma_R=sin(pi/2*(cor(t(Y),method='kendall'))) Y_sigma=Y_sigma_D%*%Y_sigma_R%*%Y_sigma_D sY_sigma=eigen(Y_sigma) YY_sigma=sY_sigma$vectors%*%diag(1/(sqrt(sY_sigma$values)))%*%t(sY_sigma$vectors) Y=YY_sigma%*%(Y-median(Y)) } TT=rep(0,n_subset) Tmax=rep(0,length(clean_setv)) Tmin=rep(0,length(clean_setv)) for (i in 1:length(clean_setv)) { S_i=setdiff(clean_setv,clean_setv[i]) for (m in 1: n_subset) { I=sample(S_i,size =subset_vol,replace=FALSE,prob=NULL) X1=X[c(clean_setv[i],I),] Y1=Y[,c(clean_setv[i],I)] X2=X[I,] Y2=Y[,I] rhat1= Y1%*%X1/(subset_vol+1) rhat2=Y2%*%X2/subset_vol TT[m]=((subset_vol+1)^2)*(sum((rhat1-rhat2)^2)/p) } Tmax[i]=max(TT) Tmin[i]=min(TT) } list(Tmax=Tmax,Tmin=Tmin) } fun_masking=function(X,Y,n,p,q,n_subset,subset_vol,clean_setv,alpha) { Tmax=(fun_pv(X,Y,n,p,q,n_subset,subset_vol,clean_setv))$Tmax pv=1-pchisq(Tmax,q) Spv=sort.int(pv,index.return=TRUE) # sorted p value Si=Spv$ix dp=Spv$x-alpha*c(1:length(Tmax))/length(Tmax) In=which(dp<=0) # BH procedure to control the error rate if (length(In)==0) {clean_set=clean_setv} else { rin=max(In) inf_set=clean_setv[Si[1:rin]] clean_set=setdiff(clean_setv,inf_set) } list(clean_set=clean_set) } fun_swamping=function(X,Y,n,p,q,n_subset,subset_vol,clean_setv,ep=0.1,alpha) { Tmin=(fun_pv(X,Y,n,p,q,n_subset,subset_vol,clean_setv))$Tmin pvv=1-pchisq(Tmin,q) Spvv=sort.int(pvv,index.return=TRUE) Sii=Spvv$ix dpv=Spvv$x-alpha*c(1:length(Tmin))/length(Tmin) In=which(dpv<=0) if (length(In)==0) {clean_set=clean_setv} else { rin=max(In) 
inf_setv=clean_setv[Sii[1:min(floor(ep*n),rin)]] clean_set=setdiff(clean_setv,inf_setv) } clean_setv=clean_set list(clean_setv=clean_setv) } fun_checking=function(X,Y,n,p,q,inf_t,clean_t,alpha) { rob_sd=apply(X,2,mad) X=X-rep(1,n)%o%apply(X,2,median); X=X%*%diag(1/rob_sd); if (q==1) {Y=(Y-median(Y))/mad(Y)} if (q>1) { y_sig=apply(Y,1,mad) Y_sigma_D=diag(y_sig,q,q) Y_sigma_R=sin(pi/2*(cor(t(Y),method='kendall'))) Y_sigma=Y_sigma_D%*%Y_sigma_R%*%Y_sigma_D sY_sigma=eigen(Y_sigma) YY_sigma=sY_sigma$vectors%*%diag(1/(sqrt(sY_sigma$values)))%*%t(sY_sigma$vectors) Y=YY_sigma%*%(Y-median(Y)) } T=rep(0,length(inf_t)) for (i in 1:length(inf_t)) { X1=X[c(inf_t[i],clean_t),] Y1=Y[,c(inf_t[i],clean_t)] X2=X[clean_t,] Y2=Y[,clean_t] rhat1=Y1%*%X1/(length(clean_t)+1) rhat2=Y2%*%X2/length(clean_t) T[i]=((length(clean_t)+1)^2)*(sum((rhat1-rhat2)^2)/p) } pv_inf=1-pchisq(T,q) Spv_inf=sort.int(pv_inf,index.return=TRUE) Si=Spv_inf$ix dp=Spv_inf$x-alpha*c(1:length(inf_t))/length(inf_t) In=which(dp<=0) if (length(In)==0) {clean_setfinal=c(1:n) inf_setfinal=setdiff(c(1:n),clean_setfinal) } else{ rin=max(In) inf_setfinal=inf_t[Si[1:rin]] clean_setfinal=setdiff(c(1:n),inf_setfinal) } list(inf_setfinal=inf_setfinal) } MIP=function(X,Y,n,p,q,n_subset,subset_vol,ep=0.1,alpha) { clean_setv=c(1:n) clean_setv=fun_swamping(X,Y,n,p,q,n_subset,subset_vol,clean_setv,ep,alpha)$clean_setv clean_set=fun_masking(X,Y,n,p,q,n_subset,subset_vol,clean_setv,alpha)$clean_set while (length(clean_set)t^*u~X CBLR5STV&= nlDK]VvBS~af :Ϸ6j]5Skڦmocv-CbصgI6kF[U*FcU3YbZWSUe~]#C+ \(X1¿hL*]cG8+TlcS z8LPcpdPcd*Kb X25&9fqap!ť|ALbEx eh_ZL*j);.[!IexQ0t>+(S( kCܶu0G_GzYnll`+%Ƞ!=ܣưNЎ43=M-Sgd|x^u#:/VAS5 BLuiz|}nu,(p~n\23&%I=H3US]E<*9hgø8 ^v\dܳT&ǵҡ' <Jyc(_*ØS5 ktđA' FB ^VaqLZŬ<)rwi*8!d?!Fv>5N5a3CŠ|L:DgevEv')bۃQ[}'D5ZҚJT^hrB WcNJ۲NPJ@)jwcA :%V9̎'{K_8X=XD%( ׄbS GBE7jnTnѢ^BGZ_nvjP{p&!lWT_ܲGatڤ5ѹ1ߋqr@k(rhMQObG@̈́I\x^ o5F,vWY6YVK6!ޔL <6oO uؤ(`M٨ן}U|^+]_]-Z]ug5j}C)=?3 ,)!v~z,hu[F?=4^nn7)< CF޽Z_[Orj<ߜqFq.HA{/ .XY|#A5>c_^Sw͛3gM5h 8 
fQ`%_ }3gN0<Һ} ~ahG }ロγE!w#jMX, HZE`12"kñTp5]h|ؾ>ƹ ^XΨ muROK* x{FTltd폧Efl~bpBMIP/man/00007770000000000013246134254005754 5ustar00MIP/man/MIP-package.Rd00006660000000205713246134254010265 0ustar00\name{MIP-package} \alias{MIP-package} \docType{package} \title{ \packageTitle{MIP} } \description{This funciton is to implement the multiple influential point (MIP) detection algorithm of Zhao et al.(2016). MIP algorithm aims to detect the multiple influential observations of high dimensional space. There are two major steps: Min-Max step and Checking step. Applying the Min-Max step, an estimate of clean set is obtained. The Min-step and Max-step are implemented by the function "fun_swamping" and "fun_masking" in this package, respectively. The Min-step is used to remove the influential points of moderate or strong effect, and the following Max-step removing those of weak effect. Finally, based on the estimated clean set, one can implement the Checking step by the function "fun_checking". } \details{ \tabular{ll}{ Package: \tab MIP\cr Type: \tab Package\cr Version: \tab 2.0\cr Date: \tab 2018-2-1\cr License: \tab GPL 2.0 or later\cr LazyLoad: \tab yes\cr } } \author{ Chao Liu Maintainer: Lu Niu } MIP/man/MIP.Rd00006660000000674413246134254006703 0ustar00\name{MIP} \alias{MIP} %- Also NEED an '\alias' for EACH other topic documented here. \title{ function to detect multiple influential point } \description{ With predictors X and responses Y, this function is to indentify the influential points by implemeting the MIP algorithm proposed by ZHAO et al.(2016) } \usage{ MIP(X, Y, n, p, q, n_subset, subset_vol, ep = 0.1, alpha) } %- maybe also 'usage' for other objects documented here. 
\arguments{ \item{X}{ the data of predictors with dimension n by p } \item{Y}{ the data of response with dimension q by n (one column per observation) } \item{n}{ the sample size } \item{p}{ the dimension of predictor } \item{q}{ the dimension of response } \item{n_subset}{ the number of subsets chosen at random to compute the Min and Max statistics } \item{subset_vol}{ the sample size in each subset } \item{ep}{ the upper bound on the proportion of rejected null hypotheses in the Min-step. The default value is set at 0.1. } \item{alpha}{ significance level used in FDR procedure } } \details{ This function implements the multiple influential point (MIP) detection algorithm of Zhao et al.(2016). The MIP algorithm aims to detect multiple influential observations in high-dimensional space. There are two major steps: Min-Max step and Checking step. Applying the Min-Max step, an estimate of clean set is obtained. The Min-step and Max-step are implemented by the functions "fun_swamping" and "fun_masking" in this package, respectively. The Min-step is used to remove the influential points of moderate or strong effect, and the following Max-step removes those of weak effect. Finally, based on the estimated clean set, one can implement the Checking step by the function "fun_checking". } \value{ the indices of the influential points detected by the MIP algorithm \item{inf_setfinal }{the indices of the influential points detected by MIP algorithm} } \references{ Zhao, J., Liu, C., Niu, L., and Leng, C. (2016). Multiple influential point detection in high-dimensional spaces. 
arXiv:1609.03320v2} \author{ %% ~~who you are~~ } \note{ %% ~~further notes~~ } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ %% ~~objects to See Also as \code{\link{help}}, ~~~ } \examples{ #example:masking #step 1:generating dataset, X1,Y1 represents the clean set, while X2,Y2 represents the influential set library(MASS) n_out=10 n=100 p=1000 q=1 n_subset=100 mx_shift=5 alpha=0.05 subset_vol=n/2 A=diag(rep(1,p)) for (i in 1:p) { for (j in i:p) { A[i,j]=0.5^(abs(j-i)) A[j,i]=A[i,j] } } X1=mvrnorm(n,mu=rep(0,p),Sigma = A) beta=matrix(c(0.4,0.5,0.5,0.6,0.4,rep(0,p-5)),p,1) Y1<- X1\%*\%beta+rnorm(n) X2=matrix(0,n_out,p); Y2=rep(0,n_out) for (j in 1:n_out) {a=sample(c(1:n),size =10,replace=FALSE,prob=NULL) X2[j,]=X1[which(Y1==max(Y1)),] X2[j,a]=X2[j,a]+j/1000 Y2[j]=max(Y1)+mx_shift+rnorm(1,0,0.5)*j/1000} X=rbind(X2[1:n_out,],X1[(n_out+1):n,]) # combination of influential and non-influential observations. Y1[1:n_out]=Y2[1:n_out] Y=t(Y1) #step 2: call the function "MIP" to detect the influnecial points infset_index=MIP(X,Y,n,p,q,n_subset,subset_vol,ep=0.1,alpha) #output rhe influential point index print(infset_index) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line MIP/man/fun_checking.Rd00006660000000343713246134254010675 0ustar00\name{fun_checking} \alias{fun_checking} %- Also NEED an '\alias' for EACH other topic documented here. \title{ function to check whether there are non-influential points being identified as influential ones(Checking step) } \description{ After the Min-Max step (i.e. applying function "fun_masking" and "fun_swamping" iteratively), one can get an estimate of clean set. The complementary of the estimated clean set may still contain some non-influention points. This function is to check whether some non-influential points are falsely identified as influential ones. 
} \usage{ fun_checking(X, Y, n, p, q, inf_t, clean_t, alpha) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{X}{ the data of predictors with dimension n by p } \item{Y}{ the data of response with dimension n by q } \item{n}{ the sample size } \item{p}{ the dimension of predictor } \item{q}{ the dimension of response } \item{inf_t}{ the estimated indices of influential poins found by Min-Max algorithm } \item{clean_t}{ the estimated indices of clean poins found by Min-Max algorithm } \item{alpha}{ significance level used in FDR procedure } } \details{ %% ~~ If necessary, more details than the description above ~~ } \value{ the influential points detected by the MIP algorithm \item{inf_setfinal}{the estimated indices of influential points obtained by MIP algorithm, after applying the checking algorithm to the potential influential point inf_t.} } \references{ %% ~put references to the literature/web site here ~ } \author{ %% ~~who you are~~ } \note{ %% ~~further notes~~ } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ %% ~~objects to See Also as \code{\link{help}}, ~~~ } MIP/man/fun_masking.Rd00006660000000311613246134254010545 0ustar00\name{fun_masking} \alias{fun_masking} %- Also NEED an '\alias' for EACH other topic documented here. \title{ function to detect the influential points using the Max-statistics(Max-step) } \description{ This function is to detect the influential points using the Max-statistics. } \usage{ fun_masking(X, Y, n, p, q, n_subset, subset_vol, clean_setv, alpha) } %- maybe also 'usage' for other objects documented here. 
\arguments{ \item{X}{ the data of predictors with dimension n by p } \item{Y}{ the data of response with dimension n by q } \item{n}{ the sample size } \item{p}{ the dimension of predictor } \item{q}{ the dimension of response } \item{n_subset}{ the number of subsets chosen at random to compute the Min and Max statistics } \item{subset_vol}{ the samples size in each subset } \item{clean_setv}{ an input value of estimated clean set obtained during the iteration of Min-Max step } \item{alpha}{ significance level used in FDR procedure } } \details{ %% ~~ If necessary, more details than the description above ~~ } \value{ return the size of clean set and the indices of the observations in the clean_set \item{S_clean }{the size of the clean set} \item{clean_set }{the indices of the estimated clean set obtained by Max-step} } \references{ %% ~put references to the literature/web site here ~ } \author{ %% ~~who you are~~ } \note{ %% ~~further notes~~ } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ %% ~~objects to See Also as \code{\link{help}}, ~~~ } MIP/man/fun_pv.Rd00006660000000263613246134254007547 0ustar00\name{fun_pv} \alias{fun_pv} %- Also NEED an '\alias' for EACH other topic documented here. \title{ function to comulate the max-statistics and the min-statistics } \description{ This functioin is to compute the Max-statistics and the Min-statistics in MIP algorithm of MIP. } \usage{ fun_pv(X, Y, n, p, q, n_subset, subset_vol, clean_setv) } %- maybe also 'usage' for other objects documented here. 
\arguments{ \item{X}{ the data of predictors with dimension n by p } \item{Y}{ the data of response with dimension q by n (one column per observation) } \item{n}{ the sample size } \item{p}{ the dimension of predictor } \item{q}{ the dimension of response } \item{n_subset}{ the number of subsets chosen at random to compute the Min and Max statistics } \item{subset_vol}{ the sample size in each subset } \item{clean_setv}{ the estimated clean set obtained during the iteration of Min-Max step } } \details{ %% ~~ If necessary, more details than the description above ~~ } \value{ a list containing the max-statistics and the min-statistics \item{Tmax}{the values of the max-statistics} \item{Tmin}{the values of the min-statistics} } \references{ %% ~put references to the literature/web site here ~ } \author{ %% ~~who you are~~ } \note{ %% ~~further notes~~ } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ %% ~~objects to See Also as \code{\link{help}}, ~~~ } MIP/man/fun_swamping.Rd00006660000000324413246134254010743 0ustar00\name{fun_swamping} \alias{fun_swamping} %- Also NEED an '\alias' for EACH other topic documented here. \title{ function to detect the influential points using the Min-statistics (Min-step) } \description{ Applying this function, one can remove the influential points of moderate or strong effect, alleviating the swamping effect. } \usage{ fun_swamping(X, Y, n, p, q, n_subset, subset_vol, clean_setv, ep = 0.1, alpha) } %- maybe also 'usage' for other objects documented here. 
\arguments{ \item{X}{ the data of predictors with dimension n by p } \item{Y}{ the data of response with dimension q by n (one column per observation) } \item{n}{ the sample size } \item{p}{ the dimension of predictor } \item{q}{ the dimension of response } \item{n_subset}{ the number of subsets chosen at random to compute the Min and Max statistics } \item{subset_vol}{ the sample size in each subset } \item{clean_setv}{ an input value of estimated clean set obtained by Min/Max step } \item{ep}{ the upper bound on the proportion of rejected null hypotheses in the Min-step. The default value is set at 0.1. } \item{alpha}{ significance level used in FDR procedure } } \details{ %% ~~ If necessary, more details than the description above ~~ } \value{ returns the updated clean set \item{clean_setv}{the indices of the estimated clean set obtained by the min-statistics} } \references{ %% ~put references to the literature/web site here ~ } \author{ %% ~~who you are~~ } \note{ %% ~~further notes~~ } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ %% ~~objects to See Also as \code{\link{help}}, ~~~ }