View Javadoc

1   package baseCode.math;
2   
3   import baseCode.math.distribution.ProbabilityComputer;
4   import cern.colt.list.DoubleArrayList;
5   import cern.colt.list.IntArrayList;
6   import cern.jet.stat.Descriptive;
7   
8   /***
9    * Class to perform the Kolmogorov-Smirnov test. Ported from R.
10   * <hr>
11   * <p>
12   * Copyright (c) 2004 Columbia University
13   * 
14   * @author pavlidis
15   * @version $Id: KSTest.java,v 1.2 2005/01/05 02:01:02 pavlidis Exp $
16   */
17  public class KSTest {
18  
19     public static double twoSample( DoubleArrayList x, DoubleArrayList y ) {
20        int nx = x.size();
21        int ny = y.size();
22        if ( ny < 1 || nx < 1 ) {
23           throw new IllegalStateException( "Can't do test" );
24        }
25  
26        double n = nx * ny / ( nx + ny );
27  
28        DoubleArrayList w = new DoubleArrayList( x.elements() );
29        w.addAllOf( y );
30  
31        IntArrayList orderw = Rank.order( w );
32  
33        //z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // tricky...
34        DoubleArrayList z = new DoubleArrayList( w.size() );
35  
36        for ( int i = 0; i < orderw.size(); i++ ) {
37           int ww = orderw.getQuick( i );
38  
39           if ( ww <= nx ) {
40              z.add( 1.0 / nx + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
41           } else {
42              z.add( -1.0 / ny + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
43           }
44        }
45  
46        // FIXME do something about ties here //
47  
48        // take absolute value of w.
49        for ( int i = 1; i < z.size(); i++ ) {
50           z.setQuick( i, Math.abs( z.getQuick( i ) ) );
51        }
52  
53        double statistic = Descriptive.max( z );
54  
55        return 1.0 - psmirnov2x( statistic, nx, ny );
56  
57     }
58  
59     public static double oneSample( DoubleArrayList x, ProbabilityComputer pg ) {
60  
61        DoubleArrayList xs = x.copy();
62        int n = xs.size();
63  
64        DoubleArrayList z = new DoubleArrayList( 2 * n );
65  
66        //    x <- y(sort(x), ...) - (0 : (n-1)) / n
67        xs.sort();
68        for ( int i = 0; i < n; i++ ) {
69           z.add( pg.probability( xs.getQuick( i ) ) - ( ( double ) i / n ) );
70        }
71  
72        // c(x, 1/n - x)
73        for ( int i = 0; i < n; i++ ) {
74           z.add( 1.0 / n - z.getQuick( i ) );
75        }
76  
77        double statistic = Descriptive.max( z );
78  
79        // 1 - pkstwo(sqrt(n) * STATISTIC)
80        return 1.0 - pkstwo( Math.sqrt( n ) * statistic );
81  
82     }
83  
84     private static double pkstwo( double x ) {
85        double tol = 1e-6;
86        double[] p = new double[] {
87           x
88        };
89        pkstwo( 1, p, tol );
90        return p[0];
91     }
92  
93     /*
94      * Compute the asymptotic distribution of the one- and two-sample two-sided Kolmogorov-Smirnov statistics, and the
95      * exact distribution in the two-sided two-sample case.
96      */
97  
98     /***
99      * From R code.
100     * 
101     * <pre>
102     * 
103     *     Compute
104     *                    \sum_{k=-\infty}&circ;\infty (-1)&circ;k e&circ;{-2 k&circ;2 x&circ;2}
105     *                    = 1 + 2 \sum_{k=1}&circ;\infty (-1)&circ;k e&circ;{-2 k&circ;2 x&circ;2}
106     *                    = \sqrt{2\pi/x} \sum_{k=1}&circ;\infty \exp(-(2k-1)&circ;2\pi&circ;2/(8x&circ;2))
107     * 
108     *  
109     * </pre>
110     * 
111     * <p>
112     * See e.g. J. Durbin (1973), Distribution Theory for Tests Based on the Sample Distribution Function. SIAM.
113     * <p>
114     * The 'standard' series expansion obviously cannot be used close to 0; we use the alternative series for x < 1, and
115     * a rather crude estimate of the series remainder term in this case, in particular using that ue^(-lu^2) \le
116     * e^(-lu^2 + u) \le e^(-(l-1)u^2 - u^2+u) \le e^(-(l-1)) provided that u and l are >= 1.
117     * <p>
118     * (But note that for reasonable tolerances, one could simply take 0 as the value for x < 0.2, and use the standard
119     * expansion otherwise.)
120     * <hr>
121     * 
122     * @param x[1:n] is input and output
123     * @param n Number of items in the data
124     * @param tol Tolerance; 1e-6 is used by R.
125     */
126    private static void pkstwo( int n, double[] x, double tol ) {
127 
128       double newV, old, s, w, z;
129       int i, k, k_max;
130 
131       k_max = ( int ) Math.sqrt( 2.0 - Math.log( tol ) );
132 
133       for ( i = 0; i < n; i++ ) {
134          if ( x[i] < 1 ) {
135             z = -( Constants.M_PI_2 * Constants.M_PI_4 ) / ( x[i] * x[i] );
136             w = Math.log( x[i] );
137             s = 0;
138             for ( k = 1; k < k_max; k += 2 ) {
139                s += Math.exp( k * k * z - w );
140             }
141             x[i] = s / Constants.M_1_SQRT_2PI;
142          } else {
143             z = -2 * x[i] * x[i];
144             s = -1;
145             k = 1;
146             old = 0;
147             newV = 1;
148             while ( Math.abs( old - newV ) > tol ) {
149                old = newV;
150                newV += 2 * s * Math.exp( z * k * k );
151                s *= -1;
152                k++;
153             }
154             x[i] = newV;
155          }
156       }
157    }
158 
159    private static double psmirnov2x( double x, int m, int n ) {
160       int i, j;
161 
162       if ( m > n ) {
163          i = n;
164          n = m;
165          m = i;
166       }
167       double md = m;
168       double nd = n;
169       double q = Math.floor( x * md * nd - 1e-7 ) / ( md * nd );
170       double[] u = new double[n + 1];
171 
172       for ( j = 0; j <= n; j++ ) {
173          u[j] = ( ( j / nd ) > q ) ? 0 : 1;
174       }
175       for ( i = 1; i <= m; i++ ) {
176          double w = ( double ) ( i ) / ( ( double ) ( i + n ) );
177          if ( ( i / md ) > q )
178             u[0] = 0;
179          else
180             u[0] = w * u[0];
181          for ( j = 1; j <= n; j++ ) {
182             if ( Math.abs( i / md - j / nd ) > q )
183                u[j] = 0;
184             else
185                u[j] = w * u[j] + u[j - 1];
186          }
187 
188          x = u[n];
189       }
190       return x;
191    }
192 }
193 
194 /*
195 //
196 //    * ks.test <-
197 //function(x, y, ..., alternative = c("two.sided", "less", "greater"),
198 //         exact = NULL)
199 //{
200 //    alternative <- match.arg(alternative)
201 //    DNAME <- deparse(substitute(x))
202 //    x <- x[!is.na(x)]
203 //    n <- length(x)
204 //    if(n < 1)
205 //        stop("Not enough x data")
206 //    PVAL <- NULL
207 //
208 //    if(is.numeric(y)) {
209 //        DNAME <- paste(DNAME, "and", deparse(substitute(y)))
210 //        y <- y[!is.na(y)] // delete values that are nans.
211 //        n.x <- as.double(n) # to avoid integer overflow
212 //        n.y <- length(y)
213 //        if(n.y < 1)
214 //            stop("Not enough y data")
215 //        if(is.null(exact))
216 //            exact <- (n.x * n.y < 10000)
217 //        METHOD <- "Two-sample Kolmogorov-Smirnov test"
218 //        TIES <- FALSE
219 //        n <- n.x * n.y / (n.x + n.y)
220 //        w <- c(x, y) // concatenating the data...
221 //        z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // z is our test statistic
222 //        if(length(unique(w)) < (n.x + n.y)) {
223 //            warning("cannot compute correct p-values with ties")
224 //            z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
225 //            TIES <- TRUE
226 //        }
227 //        STATISTIC <- switch(alternative,
228 //                            "two.sided" = max(abs(z)),
229 //                            "greater" = max(z),
230 //                            "less" = - min(z))
231 //        if(exact && alternative == "two.sided" && !TIES)
232 //            PVAL <- 1 - .C("psmirnov2x",
233 //                           p = as.double(STATISTIC),
234 //                           as.integer(n.x),
235 //                           as.integer(n.y),
236 //                           PACKAGE = "stats")$p
237 //    }
238 //    else { // it is a distribution - one-sample test.
239 //        if(is.character(y))
240 //            y <- get(y, mode="function")
241 //        if(mode(y) != "function")
242 //            stop("y must be numeric or a string naming a valid function")
243 //        METHOD <- "One-sample Kolmogorov-Smirnov test"
244 //        if(length(unique(x)) < n)
245 //            warning("cannot compute correct p-values with ties")
246 //        x <- y(sort(x), ...) - (0 : (n-1)) / n
247 //        STATISTIC <- switch(alternative,
248 //                            "two.sided" = max(c(x, 1/n - x)),
249 //                            "greater" = max(1/n - x),
250 //                            "less" = max(x))
251 //    }
252 //
253 //    names(STATISTIC) <- switch(alternative,
254 //                               "two.sided" = "D",
255 //                               "greater" = "D^+",
256 //                               "less" = "D^-")
257 //
258 //    pkstwo <- function(x, tol = 1e-6) {
259 //        ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
260 //        ## Not really needed at this generality for computing a single
261 //        ## asymptotic p-value as below.
262 //        if(is.numeric(x))
263 //            x <- as.vector(x)
264 //        else
265 //            stop("Argument x must be numeric")
266 //        p <- rep(0, length(x))
267 //        p[is.na(x)] <- NA
268 //        IND <- which(!is.na(x) & (x > 0))
269 //        if(length(IND) > 0) {
270 //            p[IND] <- .C("pkstwo",
271 //                         as.integer(length(x)),
272 //                         p = as.double(x[IND]),
273 //                         as.double(tol),
274 //                         PACKAGE = "stats")$p
275 //        }
276 //        return(p)
277 //    }
278 //
279 //    if(is.null(PVAL)) {
280 //        ##  
281 //        ## Currently, p-values for the two-sided two-sample case are
282 //        ## exact if n.x * n.y < 10000 (unless controlled explicitly).
283 //        ## In all other cases, the asymptotic distribution is used
284 //        ## directly. But: let m and n be the min and max of the sample
285 //        ## sizes, respectively. Then, according to Kim and Jennrich
286 //        ## (1973), if m < n / 10, we should use the
287 //        ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
288 //        ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
289 //        ## Also, we should use exact values in the two-sided one-sample
290 //        ## case if the sample size is small (< 80).
291 //        PVAL <- ifelse(alternative == "two.sided",
292 //                       1 - pkstwo(sqrt(n) * STATISTIC),
293 //                       exp(- 2 * n * STATISTIC^2))
294 //        ##  
295 //    }
296 //
297 //    RVAL <- list(statistic = STATISTIC,
298 //                 p.value = PVAL,
299 //                 alternative = alternative,
300 //                 method = METHOD,
301 //                 data.name = DNAME)
302 //    class(RVAL) <- "htest"
303 //    return(RVAL)
304 //}</pre>
305 // 
306 //     
307 //   
308 */