Clover coverage report - baseCode - 0.2.5
Coverage timestamp: Tue Apr 12 2005 11:31:58 EDT
file stats: LOC: 308   Methods: 5
NCLOC: 113   Classes: 1
30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover
 
 Source file Conditionals Statements Methods TOTAL
KSTest.java 84.2% 86.8% 100% 86.6%
coverage coverage
 1   
 package baseCode.math;
 2   
 
 3   
 import baseCode.math.distribution.ProbabilityComputer;
 4   
 import cern.colt.list.DoubleArrayList;
 5   
 import cern.colt.list.IntArrayList;
 6   
 import cern.jet.stat.Descriptive;
 7   
 
 8   
 /**
 9   
  * Class to perform the Kolmogorov-Smirnov test. Ported from R.
 10   
  * <hr>
 11   
  * <p>
 12   
  * Copyright (c) 2004 Columbia University
 13   
  * 
 14   
  * @author pavlidis
 15   
  * @version $Id: KSTest.java,v 1.2 2005/01/05 02:01:02 pavlidis Exp $
 16   
  */
 17   
 public class KSTest {
 18   
 
 19  1
    public static double twoSample( DoubleArrayList x, DoubleArrayList y ) {
 20  1
       int nx = x.size();
 21  1
       int ny = y.size();
 22  1
       if ( ny < 1 || nx < 1 ) {
 23  0
          throw new IllegalStateException( "Can't do test" );
 24   
       }
 25   
 
 26  1
       double n = nx * ny / ( nx + ny );
 27   
 
 28  1
       DoubleArrayList w = new DoubleArrayList( x.elements() );
 29  1
       w.addAllOf( y );
 30   
 
 31  1
       IntArrayList orderw = Rank.order( w );
 32   
 
 33   
       //z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // tricky...
 34  1
       DoubleArrayList z = new DoubleArrayList( w.size() );
 35   
 
 36  1
       for ( int i = 0; i < orderw.size(); i++ ) {
 37  20
          int ww = orderw.getQuick( i );
 38   
 
 39  20
          if ( ww <= nx ) {
 40  11
             z.add( 1.0 / nx + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
 41   
          } else {
 42  9
             z.add( -1.0 / ny + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
 43   
          }
 44   
       }
 45   
 
 46   
       // FIXME do something about ties here //
 47   
 
 48   
       // take absolute value of w.
 49  1
       for ( int i = 1; i < z.size(); i++ ) {
 50  19
          z.setQuick( i, Math.abs( z.getQuick( i ) ) );
 51   
       }
 52   
 
 53  1
       double statistic = Descriptive.max( z );
 54   
 
 55  1
       return 1.0 - psmirnov2x( statistic, nx, ny );
 56   
 
 57   
    }
 58   
 
 59  1
    public static double oneSample( DoubleArrayList x, ProbabilityComputer pg ) {
 60   
 
 61  1
       DoubleArrayList xs = x.copy();
 62  1
       int n = xs.size();
 63   
 
 64  1
       DoubleArrayList z = new DoubleArrayList( 2 * n );
 65   
 
 66   
       //    x <- y(sort(x), ...) - (0 : (n-1)) / n
 67  1
       xs.sort();
 68  1
       for ( int i = 0; i < n; i++ ) {
 69  10
          z.add( pg.probability( xs.getQuick( i ) ) - ( ( double ) i / n ) );
 70   
       }
 71   
 
 72   
       // c(x, 1/n - x)
 73  1
       for ( int i = 0; i < n; i++ ) {
 74  10
          z.add( 1.0 / n - z.getQuick( i ) );
 75   
       }
 76   
 
 77  1
       double statistic = Descriptive.max( z );
 78   
 
 79   
       // 1 - pkstwo(sqrt(n) * STATISTIC)
 80  1
       return 1.0 - pkstwo( Math.sqrt( n ) * statistic );
 81   
 
 82   
    }
 83   
 
 84  1
    private static double pkstwo( double x ) {
 85  1
       double tol = 1e-6;
 86  1
       double[] p = new double[] {
 87   
          x
 88   
       };
 89  1
       pkstwo( 1, p, tol );
 90  1
       return p[0];
 91   
    }
 92   
 
 93   
    /*
 94   
     * Compute the asymptotic distribution of the one- and two-sample two-sided Kolmogorov-Smirnov statistics, and the
 95   
     * exact distribution in the two-sided two-sample case.
 96   
     */
 97   
 
 98   
    /**
 99   
     * From R code.
 100   
     * 
 101   
     * <pre>
 102   
     * 
 103   
     *     Compute
 104   
     *                    \sum_{k=-\infty}&circ;\infty (-1)&circ;k e&circ;{-2 k&circ;2 x&circ;2}
 105   
     *                    = 1 + 2 \sum_{k=1}&circ;\infty (-1)&circ;k e&circ;{-2 k&circ;2 x&circ;2}
 106   
     *                    = \sqrt{2\pi/x} \sum_{k=1}&circ;\infty \exp(-(2k-1)&circ;2\pi&circ;2/(8x&circ;2))
 107   
     * 
 108   
     *  
 109   
     * </pre>
 110   
     * 
 111   
     * <p>
 112   
     * See e.g. J. Durbin (1973), Distribution Theory for Tests Based on the Sample Distribution Function. SIAM.
 113   
     * <p>
 114   
     * The 'standard' series expansion obviously cannot be used close to 0; we use the alternative series for x < 1, and
 115   
     * a rather crude estimate of the series remainder term in this case, in particular using that ue^(-lu^2) \le
 116   
     * e^(-lu^2 + u) \le e^(-(l-1)u^2 - u^2+u) \le e^(-(l-1)) provided that u and l are >= 1.
 117   
     * <p>
 118   
     * (But note that for reasonable tolerances, one could simply take 0 as the value for x < 0.2, and use the standard
 119   
     * expansion otherwise.)
 120   
     * <hr>
 121   
     * 
 122   
     * @param x[1:n] is input and output
 123   
     * @param n Number of items in the data
 124   
     * @param tol Tolerance; 1e-6 is used by R.
 125   
     */
 126  1
    private static void pkstwo( int n, double[] x, double tol ) {
 127   
 
 128  1
       double newV, old, s, w, z;
 129  1
       int i, k, k_max;
 130   
 
 131  1
       k_max = ( int ) Math.sqrt( 2.0 - Math.log( tol ) );
 132   
 
 133  1
       for ( i = 0; i < n; i++ ) {
 134  1
          if ( x[i] < 1 ) {
 135  0
             z = -( Constants.M_PI_2 * Constants.M_PI_4 ) / ( x[i] * x[i] );
 136  0
             w = Math.log( x[i] );
 137  0
             s = 0;
 138  0
             for ( k = 1; k < k_max; k += 2 ) {
 139  0
                s += Math.exp( k * k * z - w );
 140   
             }
 141  0
             x[i] = s / Constants.M_1_SQRT_2PI;
 142   
          } else {
 143  1
             z = -2 * x[i] * x[i];
 144  1
             s = -1;
 145  1
             k = 1;
 146  1
             old = 0;
 147  1
             newV = 1;
 148  1
             while ( Math.abs( old - newV ) > tol ) {
 149  3
                old = newV;
 150  3
                newV += 2 * s * Math.exp( z * k * k );
 151  3
                s *= -1;
 152  3
                k++;
 153   
             }
 154  1
             x[i] = newV;
 155   
          }
 156   
       }
 157   
    }
 158   
 
 159  1
    private static double psmirnov2x( double x, int m, int n ) {
 160  1
       int i, j;
 161   
 
 162  1
       if ( m > n ) {
 163  0
          i = n;
 164  0
          n = m;
 165  0
          m = i;
 166   
       }
 167  1
       double md = m;
 168  1
       double nd = n;
 169  1
       double q = Math.floor( x * md * nd - 1e-7 ) / ( md * nd );
 170  1
       double[] u = new double[n + 1];
 171   
 
 172  1
       for ( j = 0; j <= n; j++ ) {
 173  11
          u[j] = ( ( j / nd ) > q ) ? 0 : 1;
 174   
       }
 175  1
       for ( i = 1; i <= m; i++ ) {
 176  10
          double w = ( double ) ( i ) / ( ( double ) ( i + n ) );
 177  10
          if ( ( i / md ) > q )
 178  7
             u[0] = 0;
 179   
          else
 180  3
             u[0] = w * u[0];
 181  10
          for ( j = 1; j <= n; j++ ) {
 182  100
             if ( Math.abs( i / md - j / nd ) > q )
 183  42
                u[j] = 0;
 184   
             else
 185  58
                u[j] = w * u[j] + u[j - 1];
 186   
          }
 187   
 
 188  10
          x = u[n];
 189   
       }
 190  1
       return x;
 191   
    }
 192   
 }
 193   
 
 194   
 /*
 195   
 //
 196   
 //    * ks.test <-
 197   
 //function(x, y, ..., alternative = c("two.sided", "less", "greater"),
 198   
 //         exact = NULL)
 199   
 //{
 200   
 //    alternative <- match.arg(alternative)
 201   
 //    DNAME <- deparse(substitute(x))
 202   
 //    x <- x[!is.na(x)]
 203   
 //    n <- length(x)
 204   
 //    if(n < 1)
 205   
 //        stop("Not enough x data")
 206   
 //    PVAL <- NULL
 207   
 //
 208   
 //    if(is.numeric(y)) {
 209   
 //        DNAME <- paste(DNAME, "and", deparse(substitute(y)))
 210   
 //        y <- y[!is.na(y)] // delete values that are nans.
 211   
 //        n.x <- as.double(n) # to avoid integer overflow
 212   
 //        n.y <- length(y)
 213   
 //        if(n.y < 1)
 214   
 //            stop("Not enough y data")
 215   
 //        if(is.null(exact))
 216   
 //            exact <- (n.x * n.y < 10000)
 217   
 //        METHOD <- "Two-sample Kolmogorov-Smirnov test"
 218   
 //        TIES <- FALSE
 219   
 //        n <- n.x * n.y / (n.x + n.y)
 220   
 //        w <- c(x, y) // concatenating the data...
 221   
 //        z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // z is our test statistic
 222   
 //        if(length(unique(w)) < (n.x + n.y)) {
 223   
 //            warning("cannot compute correct p-values with ties")
 224   
 //            z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
 225   
 //            TIES <- TRUE
 226   
 //        }
 227   
 //        STATISTIC <- switch(alternative,
 228   
 //                            "two.sided" = max(abs(z)),
 229   
 //                            "greater" = max(z),
 230   
 //                            "less" = - min(z))
 231   
 //        if(exact && alternative == "two.sided" && !TIES)
 232   
 //            PVAL <- 1 - .C("psmirnov2x",
 233   
 //                           p = as.double(STATISTIC),
 234   
 //                           as.integer(n.x),
 235   
 //                           as.integer(n.y),
 236   
 //                           PACKAGE = "stats")$p
 237   
 //    }
 238   
 //    else { // it is a distribution - one-sample test.
 239   
 //        if(is.character(y))
 240   
 //            y <- get(y, mode="function")
 241   
 //        if(mode(y) != "function")
 242   
 //            stop("y must be numeric or a string naming a valid function")
 243   
 //        METHOD <- "One-sample Kolmogorov-Smirnov test"
 244   
 //        if(length(unique(x)) < n)
 245   
 //            warning("cannot compute correct p-values with ties")
 246   
 //        x <- y(sort(x), ...) - (0 : (n-1)) / n
 247   
 //        STATISTIC <- switch(alternative,
 248   
 //                            "two.sided" = max(c(x, 1/n - x)),
 249   
 //                            "greater" = max(1/n - x),
 250   
 //                            "less" = max(x))
 251   
 //    }
 252   
 //
 253   
 //    names(STATISTIC) <- switch(alternative,
 254   
 //                               "two.sided" = "D",
 255   
 //                               "greater" = "D^+",
 256   
 //                               "less" = "D^-")
 257   
 //
 258   
 //    pkstwo <- function(x, tol = 1e-6) {
 259   
 //        ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
 260   
 //        ## Not really needed at this generality for computing a single
 261   
 //        ## asymptotic p-value as below.
 262   
 //        if(is.numeric(x))
 263   
 //            x <- as.vector(x)
 264   
 //        else
 265   
 //            stop("Argument x must be numeric")
 266   
 //        p <- rep(0, length(x))
 267   
 //        p[is.na(x)] <- NA
 268   
 //        IND <- which(!is.na(x) & (x > 0))
 269   
 //        if(length(IND) > 0) {
 270   
 //            p[IND] <- .C("pkstwo",
 271   
 //                         as.integer(length(x)),
 272   
 //                         p = as.double(x[IND]),
 273   
 //                         as.double(tol),
 274   
 //                         PACKAGE = "stats")$p
 275   
 //        }
 276   
 //        return(p)
 277   
 //    }
 278   
 //
 279   
 //    if(is.null(PVAL)) {
 280   
 //        ##  
 281   
 //        ## Currently, p-values for the two-sided two-sample case are
 282   
 //        ## exact if n.x * n.y < 10000 (unless controlled explicitly).
 283   
 //        ## In all other cases, the asymptotic distribution is used
 284   
 //        ## directly. But: let m and n be the min and max of the sample
 285   
 //        ## sizes, respectively. Then, according to Kim and Jennrich
 286   
 //        ## (1973), if m < n / 10, we should use the
 287   
 //        ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
 288   
 //        ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
 289   
 //        ## Also, we should use exact values in the two-sided one-sample
 290   
 //        ## case if the sample size is small (< 80).
 291   
 //        PVAL <- ifelse(alternative == "two.sided",
 292   
 //                       1 - pkstwo(sqrt(n) * STATISTIC),
 293   
 //                       exp(- 2 * n * STATISTIC^2))
 294   
 //        ##  
 295   
 //    }
 296   
 //
 297   
 //    RVAL <- list(statistic = STATISTIC,
 298   
 //                 p.value = PVAL,
 299   
 //                 alternative = alternative,
 300   
 //                 method = METHOD,
 301   
 //                 data.name = DNAME)
 302   
 //    class(RVAL) <- "htest"
 303   
 //    return(RVAL)
 304   
 //}</pre>
 305   
 // 
 306   
 //     
 307   
 //   
 308   
 */