|
|||||||||||||||||||
| 30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover | |||||||||||||||||||
| Source file | Conditionals | Statements | Methods | TOTAL | |||||||||||||||
| KSTest.java | 84.2% | 86.8% | 100% | 86.6% |
|
||||||||||||||
| 1 |
package baseCode.math;
|
|
| 2 |
|
|
| 3 |
import baseCode.math.distribution.ProbabilityComputer;
|
|
| 4 |
import cern.colt.list.DoubleArrayList;
|
|
| 5 |
import cern.colt.list.IntArrayList;
|
|
| 6 |
import cern.jet.stat.Descriptive;
|
|
| 7 |
|
|
| 8 |
/**
|
|
| 9 |
* Class to perform the Kolmogorov-Smirnov test. Ported from R.
|
|
| 10 |
* <hr>
|
|
| 11 |
* <p>
|
|
| 12 |
* Copyright (c) 2004 Columbia University
|
|
| 13 |
*
|
|
| 14 |
* @author pavlidis
|
|
| 15 |
* @version $Id: KSTest.java,v 1.2 2005/01/05 02:01:02 pavlidis Exp $
|
|
| 16 |
*/
|
|
| 17 |
public class KSTest { |
|
| 18 |
|
|
| 19 | 1 |
public static double twoSample( DoubleArrayList x, DoubleArrayList y ) { |
| 20 | 1 |
int nx = x.size();
|
| 21 | 1 |
int ny = y.size();
|
| 22 | 1 |
if ( ny < 1 || nx < 1 ) {
|
| 23 | 0 |
throw new IllegalStateException( "Can't do test" ); |
| 24 |
} |
|
| 25 |
|
|
| 26 | 1 |
double n = nx * ny / ( nx + ny );
|
| 27 |
|
|
| 28 | 1 |
DoubleArrayList w = new DoubleArrayList( x.elements() );
|
| 29 | 1 |
w.addAllOf( y ); |
| 30 |
|
|
| 31 | 1 |
IntArrayList orderw = Rank.order( w ); |
| 32 |
|
|
| 33 |
//z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // tricky...
|
|
| 34 | 1 |
DoubleArrayList z = new DoubleArrayList( w.size() );
|
| 35 |
|
|
| 36 | 1 |
for ( int i = 0; i < orderw.size(); i++ ) { |
| 37 | 20 |
int ww = orderw.getQuick( i );
|
| 38 |
|
|
| 39 | 20 |
if ( ww <= nx ) {
|
| 40 | 11 |
z.add( 1.0 / nx + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) ); |
| 41 |
} else {
|
|
| 42 | 9 |
z.add( -1.0 / ny + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) ); |
| 43 |
} |
|
| 44 |
} |
|
| 45 |
|
|
| 46 |
// FIXME do something about ties here //
|
|
| 47 |
|
|
| 48 |
// take absolute value of w.
|
|
| 49 | 1 |
for ( int i = 1; i < z.size(); i++ ) { |
| 50 | 19 |
z.setQuick( i, Math.abs( z.getQuick( i ) ) ); |
| 51 |
} |
|
| 52 |
|
|
| 53 | 1 |
double statistic = Descriptive.max( z );
|
| 54 |
|
|
| 55 | 1 |
return 1.0 - psmirnov2x( statistic, nx, ny );
|
| 56 |
|
|
| 57 |
} |
|
| 58 |
|
|
| 59 | 1 |
public static double oneSample( DoubleArrayList x, ProbabilityComputer pg ) { |
| 60 |
|
|
| 61 | 1 |
DoubleArrayList xs = x.copy(); |
| 62 | 1 |
int n = xs.size();
|
| 63 |
|
|
| 64 | 1 |
DoubleArrayList z = new DoubleArrayList( 2 * n );
|
| 65 |
|
|
| 66 |
// x <- y(sort(x), ...) - (0 : (n-1)) / n
|
|
| 67 | 1 |
xs.sort(); |
| 68 | 1 |
for ( int i = 0; i < n; i++ ) { |
| 69 | 10 |
z.add( pg.probability( xs.getQuick( i ) ) - ( ( double ) i / n ) );
|
| 70 |
} |
|
| 71 |
|
|
| 72 |
// c(x, 1/n - x)
|
|
| 73 | 1 |
for ( int i = 0; i < n; i++ ) { |
| 74 | 10 |
z.add( 1.0 / n - z.getQuick( i ) ); |
| 75 |
} |
|
| 76 |
|
|
| 77 | 1 |
double statistic = Descriptive.max( z );
|
| 78 |
|
|
| 79 |
// 1 - pkstwo(sqrt(n) * STATISTIC)
|
|
| 80 | 1 |
return 1.0 - pkstwo( Math.sqrt( n ) * statistic );
|
| 81 |
|
|
| 82 |
} |
|
| 83 |
|
|
| 84 | 1 |
private static double pkstwo( double x ) { |
| 85 | 1 |
double tol = 1e-6;
|
| 86 | 1 |
double[] p = new double[] { |
| 87 |
x |
|
| 88 |
}; |
|
| 89 | 1 |
pkstwo( 1, p, tol ); |
| 90 | 1 |
return p[0];
|
| 91 |
} |
|
| 92 |
|
|
| 93 |
/*
|
|
| 94 |
* Compute the asymptotic distribution of the one- and two-sample two-sided Kolmogorov-Smirnov statistics, and the
|
|
| 95 |
* exact distribution in the two-sided two-sample case.
|
|
| 96 |
*/
|
|
| 97 |
|
|
| 98 |
/**
|
|
| 99 |
* From R code.
|
|
| 100 |
*
|
|
| 101 |
* <pre>
|
|
| 102 |
*
|
|
| 103 |
* Compute
|
|
| 104 |
* \sum_{k=-\infty}ˆ\infty (-1)ˆk eˆ{-2 kˆ2 xˆ2}
|
|
| 105 |
* = 1 + 2 \sum_{k=1}ˆ\infty (-1)ˆk eˆ{-2 kˆ2 xˆ2}
|
|
| 106 |
* = \sqrt{2\pi/x} \sum_{k=1}ˆ\infty \exp(-(2k-1)ˆ2\piˆ2/(8xˆ2))
|
|
| 107 |
*
|
|
| 108 |
*
|
|
| 109 |
* </pre>
|
|
| 110 |
*
|
|
| 111 |
* <p>
|
|
| 112 |
* See e.g. J. Durbin (1973), Distribution Theory for Tests Based on the Sample Distribution Function. SIAM.
|
|
| 113 |
* <p>
|
|
| 114 |
* The 'standard' series expansion obviously cannot be used close to 0; we use the alternative series for x < 1, and
|
|
| 115 |
* a rather crude estimate of the series remainder term in this case, in particular using that ue^(-lu^2) \le
|
|
| 116 |
* e^(-lu^2 + u) \le e^(-(l-1)u^2 - u^2+u) \le e^(-(l-1)) provided that u and l are >= 1.
|
|
| 117 |
* <p>
|
|
| 118 |
* (But note that for reasonable tolerances, one could simply take 0 as the value for x < 0.2, and use the standard
|
|
| 119 |
* expansion otherwise.)
|
|
| 120 |
* <hr>
|
|
| 121 |
*
|
|
| 122 |
* @param x[1:n] is input and output
|
|
| 123 |
* @param n Number of items in the data
|
|
| 124 |
* @param tol Tolerance; 1e-6 is used by R.
|
|
| 125 |
*/
|
|
| 126 | 1 |
private static void pkstwo( int n, double[] x, double tol ) { |
| 127 |
|
|
| 128 | 1 |
double newV, old, s, w, z;
|
| 129 | 1 |
int i, k, k_max;
|
| 130 |
|
|
| 131 | 1 |
k_max = ( int ) Math.sqrt( 2.0 - Math.log( tol ) );
|
| 132 |
|
|
| 133 | 1 |
for ( i = 0; i < n; i++ ) {
|
| 134 | 1 |
if ( x[i] < 1 ) {
|
| 135 | 0 |
z = -( Constants.M_PI_2 * Constants.M_PI_4 ) / ( x[i] * x[i] ); |
| 136 | 0 |
w = Math.log( x[i] ); |
| 137 | 0 |
s = 0; |
| 138 | 0 |
for ( k = 1; k < k_max; k += 2 ) {
|
| 139 | 0 |
s += Math.exp( k * k * z - w ); |
| 140 |
} |
|
| 141 | 0 |
x[i] = s / Constants.M_1_SQRT_2PI; |
| 142 |
} else {
|
|
| 143 | 1 |
z = -2 * x[i] * x[i]; |
| 144 | 1 |
s = -1; |
| 145 | 1 |
k = 1; |
| 146 | 1 |
old = 0; |
| 147 | 1 |
newV = 1; |
| 148 | 1 |
while ( Math.abs( old - newV ) > tol ) {
|
| 149 | 3 |
old = newV; |
| 150 | 3 |
newV += 2 * s * Math.exp( z * k * k ); |
| 151 | 3 |
s *= -1; |
| 152 | 3 |
k++; |
| 153 |
} |
|
| 154 | 1 |
x[i] = newV; |
| 155 |
} |
|
| 156 |
} |
|
| 157 |
} |
|
| 158 |
|
|
| 159 | 1 |
private static double psmirnov2x( double x, int m, int n ) { |
| 160 | 1 |
int i, j;
|
| 161 |
|
|
| 162 | 1 |
if ( m > n ) {
|
| 163 | 0 |
i = n; |
| 164 | 0 |
n = m; |
| 165 | 0 |
m = i; |
| 166 |
} |
|
| 167 | 1 |
double md = m;
|
| 168 | 1 |
double nd = n;
|
| 169 | 1 |
double q = Math.floor( x * md * nd - 1e-7 ) / ( md * nd );
|
| 170 | 1 |
double[] u = new double[n + 1]; |
| 171 |
|
|
| 172 | 1 |
for ( j = 0; j <= n; j++ ) {
|
| 173 | 11 |
u[j] = ( ( j / nd ) > q ) ? 0 : 1; |
| 174 |
} |
|
| 175 | 1 |
for ( i = 1; i <= m; i++ ) {
|
| 176 | 10 |
double w = ( double ) ( i ) / ( ( double ) ( i + n ) ); |
| 177 | 10 |
if ( ( i / md ) > q )
|
| 178 | 7 |
u[0] = 0; |
| 179 |
else
|
|
| 180 | 3 |
u[0] = w * u[0]; |
| 181 | 10 |
for ( j = 1; j <= n; j++ ) {
|
| 182 | 100 |
if ( Math.abs( i / md - j / nd ) > q )
|
| 183 | 42 |
u[j] = 0; |
| 184 |
else
|
|
| 185 | 58 |
u[j] = w * u[j] + u[j - 1]; |
| 186 |
} |
|
| 187 |
|
|
| 188 | 10 |
x = u[n]; |
| 189 |
} |
|
| 190 | 1 |
return x;
|
| 191 |
} |
|
| 192 |
} |
|
| 193 |
|
|
| 194 |
/*
|
|
| 195 |
//
|
|
| 196 |
// * ks.test <-
|
|
| 197 |
//function(x, y, ..., alternative = c("two.sided", "less", "greater"),
|
|
| 198 |
// exact = NULL)
|
|
| 199 |
//{
|
|
| 200 |
// alternative <- match.arg(alternative)
|
|
| 201 |
// DNAME <- deparse(substitute(x))
|
|
| 202 |
// x <- x[!is.na(x)]
|
|
| 203 |
// n <- length(x)
|
|
| 204 |
// if(n < 1)
|
|
| 205 |
// stop("Not enough x data")
|
|
| 206 |
// PVAL <- NULL
|
|
| 207 |
//
|
|
| 208 |
// if(is.numeric(y)) {
|
|
| 209 |
// DNAME <- paste(DNAME, "and", deparse(substitute(y)))
|
|
| 210 |
// y <- y[!is.na(y)] // delete values that are nans.
|
|
| 211 |
// n.x <- as.double(n) # to avoid integer overflow
|
|
| 212 |
// n.y <- length(y)
|
|
| 213 |
// if(n.y < 1)
|
|
| 214 |
// stop("Not enough y data")
|
|
| 215 |
// if(is.null(exact))
|
|
| 216 |
// exact <- (n.x * n.y < 10000)
|
|
| 217 |
// METHOD <- "Two-sample Kolmogorov-Smirnov test"
|
|
| 218 |
// TIES <- FALSE
|
|
| 219 |
// n <- n.x * n.y / (n.x + n.y)
|
|
| 220 |
// w <- c(x, y) // concatenating the data...
|
|
| 221 |
// z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // z is our test statistic
|
|
| 222 |
// if(length(unique(w)) < (n.x + n.y)) {
|
|
| 223 |
// warning("cannot compute correct p-values with ties")
|
|
| 224 |
// z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
|
|
| 225 |
// TIES <- TRUE
|
|
| 226 |
// }
|
|
| 227 |
// STATISTIC <- switch(alternative,
|
|
| 228 |
// "two.sided" = max(abs(z)),
|
|
| 229 |
// "greater" = max(z),
|
|
| 230 |
// "less" = - min(z))
|
|
| 231 |
// if(exact && alternative == "two.sided" && !TIES)
|
|
| 232 |
// PVAL <- 1 - .C("psmirnov2x",
|
|
| 233 |
// p = as.double(STATISTIC),
|
|
| 234 |
// as.integer(n.x),
|
|
| 235 |
// as.integer(n.y),
|
|
| 236 |
// PACKAGE = "stats")$p
|
|
| 237 |
// }
|
|
| 238 |
// else { // it is a distribution - one-sample test.
|
|
| 239 |
// if(is.character(y))
|
|
| 240 |
// y <- get(y, mode="function")
|
|
| 241 |
// if(mode(y) != "function")
|
|
| 242 |
// stop("y must be numeric or a string naming a valid function")
|
|
| 243 |
// METHOD <- "One-sample Kolmogorov-Smirnov test"
|
|
| 244 |
// if(length(unique(x)) < n)
|
|
| 245 |
// warning("cannot compute correct p-values with ties")
|
|
| 246 |
// x <- y(sort(x), ...) - (0 : (n-1)) / n
|
|
| 247 |
// STATISTIC <- switch(alternative,
|
|
| 248 |
// "two.sided" = max(c(x, 1/n - x)),
|
|
| 249 |
// "greater" = max(1/n - x),
|
|
| 250 |
// "less" = max(x))
|
|
| 251 |
// }
|
|
| 252 |
//
|
|
| 253 |
// names(STATISTIC) <- switch(alternative,
|
|
| 254 |
// "two.sided" = "D",
|
|
| 255 |
// "greater" = "D^+",
|
|
| 256 |
// "less" = "D^-")
|
|
| 257 |
//
|
|
| 258 |
// pkstwo <- function(x, tol = 1e-6) {
|
|
| 259 |
// ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
|
|
| 260 |
// ## Not really needed at this generality for computing a single
|
|
| 261 |
// ## asymptotic p-value as below.
|
|
| 262 |
// if(is.numeric(x))
|
|
| 263 |
// x <- as.vector(x)
|
|
| 264 |
// else
|
|
| 265 |
// stop("Argument x must be numeric")
|
|
| 266 |
// p <- rep(0, length(x))
|
|
| 267 |
// p[is.na(x)] <- NA
|
|
| 268 |
// IND <- which(!is.na(x) & (x > 0))
|
|
| 269 |
// if(length(IND) > 0) {
|
|
| 270 |
// p[IND] <- .C("pkstwo",
|
|
| 271 |
// as.integer(length(x)),
|
|
| 272 |
// p = as.double(x[IND]),
|
|
| 273 |
// as.double(tol),
|
|
| 274 |
// PACKAGE = "stats")$p
|
|
| 275 |
// }
|
|
| 276 |
// return(p)
|
|
| 277 |
// }
|
|
| 278 |
//
|
|
| 279 |
// if(is.null(PVAL)) {
|
|
| 280 |
// ##
|
|
| 281 |
// ## Currently, p-values for the two-sided two-sample case are
|
|
| 282 |
// ## exact if n.x * n.y < 10000 (unless controlled explicitly).
|
|
| 283 |
// ## In all other cases, the asymptotic distribution is used
|
|
| 284 |
// ## directly. But: let m and n be the min and max of the sample
|
|
| 285 |
// ## sizes, respectively. Then, according to Kim and Jennrich
|
|
| 286 |
// ## (1973), if m < n / 10, we should use the
|
|
| 287 |
// ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
|
|
| 288 |
// ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
|
|
| 289 |
// ## Also, we should use exact values in the two-sided one-sample
|
|
| 290 |
// ## case if the sample size is small (< 80).
|
|
| 291 |
// PVAL <- ifelse(alternative == "two.sided",
|
|
| 292 |
// 1 - pkstwo(sqrt(n) * STATISTIC),
|
|
| 293 |
// exp(- 2 * n * STATISTIC^2))
|
|
| 294 |
// ##
|
|
| 295 |
// }
|
|
| 296 |
//
|
|
| 297 |
// RVAL <- list(statistic = STATISTIC,
|
|
| 298 |
// p.value = PVAL,
|
|
| 299 |
// alternative = alternative,
|
|
| 300 |
// method = METHOD,
|
|
| 301 |
// data.name = DNAME)
|
|
| 302 |
// class(RVAL) <- "htest"
|
|
| 303 |
// return(RVAL)
|
|
| 304 |
//}</pre>
|
|
| 305 |
//
|
|
| 306 |
//
|
|
| 307 |
//
|
|
| 308 |
*/
|
|
||||||||||