1 package baseCode.math;
2
3 import baseCode.math.distribution.ProbabilityComputer;
4 import cern.colt.list.DoubleArrayList;
5 import cern.colt.list.IntArrayList;
6 import cern.jet.stat.Descriptive;
7
8 /***
9 * Class to perform the Kolmogorov-Smirnov test. Ported from R.
10 * <hr>
11 * <p>
12 * Copyright (c) 2004 Columbia University
13 *
14 * @author pavlidis
15 * @version $Id: KSTest.java,v 1.2 2005/01/05 02:01:02 pavlidis Exp $
16 */
17 public class KSTest {
18
19 public static double twoSample( DoubleArrayList x, DoubleArrayList y ) {
20 int nx = x.size();
21 int ny = y.size();
22 if ( ny < 1 || nx < 1 ) {
23 throw new IllegalStateException( "Can't do test" );
24 }
25
26 double n = nx * ny / ( nx + ny );
27
28 DoubleArrayList w = new DoubleArrayList( x.elements() );
29 w.addAllOf( y );
30
31 IntArrayList orderw = Rank.order( w );
32
33 //z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // tricky...
34 DoubleArrayList z = new DoubleArrayList( w.size() );
35
36 for ( int i = 0; i < orderw.size(); i++ ) {
37 int ww = orderw.getQuick( i );
38
39 if ( ww <= nx ) {
40 z.add( 1.0 / nx + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
41 } else {
42 z.add( -1.0 / ny + ( i > 0 ? z.getQuick( i - 1 ) : 0 ) );
43 }
44 }
45
46 // FIXME do something about ties here //
47
48 // take absolute value of w.
49 for ( int i = 1; i < z.size(); i++ ) {
50 z.setQuick( i, Math.abs( z.getQuick( i ) ) );
51 }
52
53 double statistic = Descriptive.max( z );
54
55 return 1.0 - psmirnov2x( statistic, nx, ny );
56
57 }
58
59 public static double oneSample( DoubleArrayList x, ProbabilityComputer pg ) {
60
61 DoubleArrayList xs = x.copy();
62 int n = xs.size();
63
64 DoubleArrayList z = new DoubleArrayList( 2 * n );
65
66 // x <- y(sort(x), ...) - (0 : (n-1)) / n
67 xs.sort();
68 for ( int i = 0; i < n; i++ ) {
69 z.add( pg.probability( xs.getQuick( i ) ) - ( ( double ) i / n ) );
70 }
71
72 // c(x, 1/n - x)
73 for ( int i = 0; i < n; i++ ) {
74 z.add( 1.0 / n - z.getQuick( i ) );
75 }
76
77 double statistic = Descriptive.max( z );
78
79 // 1 - pkstwo(sqrt(n) * STATISTIC)
80 return 1.0 - pkstwo( Math.sqrt( n ) * statistic );
81
82 }
83
84 private static double pkstwo( double x ) {
85 double tol = 1e-6;
86 double[] p = new double[] {
87 x
88 };
89 pkstwo( 1, p, tol );
90 return p[0];
91 }
92
93 /*
94 * Compute the asymptotic distribution of the one- and two-sample two-sided Kolmogorov-Smirnov statistics, and the
95 * exact distribution in the two-sided two-sample case.
96 */
97
98 /***
99 * From R code.
100 *
101 * <pre>
102 *
103 * Compute
104 * \sum_{k=-\infty}ˆ\infty (-1)ˆk eˆ{-2 kˆ2 xˆ2}
105 * = 1 + 2 \sum_{k=1}ˆ\infty (-1)ˆk eˆ{-2 kˆ2 xˆ2}
106 * = \sqrt{2\pi/x} \sum_{k=1}ˆ\infty \exp(-(2k-1)ˆ2\piˆ2/(8xˆ2))
107 *
108 *
109 * </pre>
110 *
111 * <p>
112 * See e.g. J. Durbin (1973), Distribution Theory for Tests Based on the Sample Distribution Function. SIAM.
113 * <p>
114 * The 'standard' series expansion obviously cannot be used close to 0; we use the alternative series for x < 1, and
115 * a rather crude estimate of the series remainder term in this case, in particular using that ue^(-lu^2) \le
116 * e^(-lu^2 + u) \le e^(-(l-1)u^2 - u^2+u) \le e^(-(l-1)) provided that u and l are >= 1.
117 * <p>
118 * (But note that for reasonable tolerances, one could simply take 0 as the value for x < 0.2, and use the standard
119 * expansion otherwise.)
120 * <hr>
121 *
122 * @param x[1:n] is input and output
123 * @param n Number of items in the data
124 * @param tol Tolerance; 1e-6 is used by R.
125 */
126 private static void pkstwo( int n, double[] x, double tol ) {
127
128 double newV, old, s, w, z;
129 int i, k, k_max;
130
131 k_max = ( int ) Math.sqrt( 2.0 - Math.log( tol ) );
132
133 for ( i = 0; i < n; i++ ) {
134 if ( x[i] < 1 ) {
135 z = -( Constants.M_PI_2 * Constants.M_PI_4 ) / ( x[i] * x[i] );
136 w = Math.log( x[i] );
137 s = 0;
138 for ( k = 1; k < k_max; k += 2 ) {
139 s += Math.exp( k * k * z - w );
140 }
141 x[i] = s / Constants.M_1_SQRT_2PI;
142 } else {
143 z = -2 * x[i] * x[i];
144 s = -1;
145 k = 1;
146 old = 0;
147 newV = 1;
148 while ( Math.abs( old - newV ) > tol ) {
149 old = newV;
150 newV += 2 * s * Math.exp( z * k * k );
151 s *= -1;
152 k++;
153 }
154 x[i] = newV;
155 }
156 }
157 }
158
159 private static double psmirnov2x( double x, int m, int n ) {
160 int i, j;
161
162 if ( m > n ) {
163 i = n;
164 n = m;
165 m = i;
166 }
167 double md = m;
168 double nd = n;
169 double q = Math.floor( x * md * nd - 1e-7 ) / ( md * nd );
170 double[] u = new double[n + 1];
171
172 for ( j = 0; j <= n; j++ ) {
173 u[j] = ( ( j / nd ) > q ) ? 0 : 1;
174 }
175 for ( i = 1; i <= m; i++ ) {
176 double w = ( double ) ( i ) / ( ( double ) ( i + n ) );
177 if ( ( i / md ) > q )
178 u[0] = 0;
179 else
180 u[0] = w * u[0];
181 for ( j = 1; j <= n; j++ ) {
182 if ( Math.abs( i / md - j / nd ) > q )
183 u[j] = 0;
184 else
185 u[j] = w * u[j] + u[j - 1];
186 }
187
188 x = u[n];
189 }
190 return x;
191 }
192 }
193
194 /*
195 //
196 // * ks.test <-
197 //function(x, y, ..., alternative = c("two.sided", "less", "greater"),
198 // exact = NULL)
199 //{
200 // alternative <- match.arg(alternative)
201 // DNAME <- deparse(substitute(x))
202 // x <- x[!is.na(x)]
203 // n <- length(x)
204 // if(n < 1)
205 // stop("Not enough x data")
206 // PVAL <- NULL
207 //
208 // if(is.numeric(y)) {
209 // DNAME <- paste(DNAME, "and", deparse(substitute(y)))
210 // y <- y[!is.na(y)] // delete values that are nans.
211 // n.x <- as.double(n) # to avoid integer overflow
212 // n.y <- length(y)
213 // if(n.y < 1)
214 // stop("Not enough y data")
215 // if(is.null(exact))
216 // exact <- (n.x * n.y < 10000)
217 // METHOD <- "Two-sample Kolmogorov-Smirnov test"
218 // TIES <- FALSE
219 // n <- n.x * n.y / (n.x + n.y)
220 // w <- c(x, y) // concatenating the data...
221 // z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) // z is our test statistic
222 // if(length(unique(w)) < (n.x + n.y)) {
223 // warning("cannot compute correct p-values with ties")
224 // z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
225 // TIES <- TRUE
226 // }
227 // STATISTIC <- switch(alternative,
228 // "two.sided" = max(abs(z)),
229 // "greater" = max(z),
230 // "less" = - min(z))
231 // if(exact && alternative == "two.sided" && !TIES)
232 // PVAL <- 1 - .C("psmirnov2x",
233 // p = as.double(STATISTIC),
234 // as.integer(n.x),
235 // as.integer(n.y),
236 // PACKAGE = "stats")$p
237 // }
238 // else { // it is a distribution - one-sample test.
239 // if(is.character(y))
240 // y <- get(y, mode="function")
241 // if(mode(y) != "function")
242 // stop("y must be numeric or a string naming a valid function")
243 // METHOD <- "One-sample Kolmogorov-Smirnov test"
244 // if(length(unique(x)) < n)
245 // warning("cannot compute correct p-values with ties")
246 // x <- y(sort(x), ...) - (0 : (n-1)) / n
247 // STATISTIC <- switch(alternative,
248 // "two.sided" = max(c(x, 1/n - x)),
249 // "greater" = max(1/n - x),
250 // "less" = max(x))
251 // }
252 //
253 // names(STATISTIC) <- switch(alternative,
254 // "two.sided" = "D",
255 // "greater" = "D^+",
256 // "less" = "D^-")
257 //
258 // pkstwo <- function(x, tol = 1e-6) {
259 // ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
260 // ## Not really needed at this generality for computing a single
261 // ## asymptotic p-value as below.
262 // if(is.numeric(x))
263 // x <- as.vector(x)
264 // else
265 // stop("Argument x must be numeric")
266 // p <- rep(0, length(x))
267 // p[is.na(x)] <- NA
268 // IND <- which(!is.na(x) & (x > 0))
269 // if(length(IND) > 0) {
270 // p[IND] <- .C("pkstwo",
271 // as.integer(length(x)),
272 // p = as.double(x[IND]),
273 // as.double(tol),
274 // PACKAGE = "stats")$p
275 // }
276 // return(p)
277 // }
278 //
279 // if(is.null(PVAL)) {
280 // ##
281 // ## Currently, p-values for the two-sided two-sample case are
282 // ## exact if n.x * n.y < 10000 (unless controlled explicitly).
283 // ## In all other cases, the asymptotic distribution is used
284 // ## directly. But: let m and n be the min and max of the sample
285 // ## sizes, respectively. Then, according to Kim and Jennrich
286 // ## (1973), if m < n / 10, we should use the
287 // ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
288 // ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
289 // ## Also, we should use exact values in the two-sided one-sample
290 // ## case if the sample size is small (< 80).
291 // PVAL <- ifelse(alternative == "two.sided",
292 // 1 - pkstwo(sqrt(n) * STATISTIC),
293 // exp(- 2 * n * STATISTIC^2))
294 // ##
295 // }
296 //
297 // RVAL <- list(statistic = STATISTIC,
298 // p.value = PVAL,
299 // alternative = alternative,
300 // method = METHOD,
301 // data.name = DNAME)
302 // class(RVAL) <- "htest"
303 // return(RVAL)
304 //}</pre>
305 //
306 //
307 //
308 */