1 package baseCode.dataFilter;
2
3 import java.util.Vector;
4
5 import baseCode.dataStructure.matrix.DenseDoubleMatrix2DNamed;
6 import baseCode.dataStructure.matrix.NamedMatrix;
7 import baseCode.math.DescriptiveWithMissing;
8 import baseCode.math.Stats;
9 import cern.colt.list.DoubleArrayList;
10 import cern.jet.stat.Descriptive;
11
12 /***
13 * Remove rows from a matrix based on some row-based statistic. Rows with values too high and/or too low can be removed.
14 * Thresholds are inclusive (i.e., values must be at least as high as the set threshold to be included). A number of
15 * statistics are available. In addition, this filter can remove rows that have all negative data values.
16 * <p>
17 * There are a number of decisions/caveats to consider:
18 * <h2>Cutpoint determination</h2>
19 * <p>
20 * There are multiple ways of determining cutpoints. Some possibilities are the maximum value, the minimum value, the
21 * mean value, or the median value. The range and coefficient of variation are also included.
22 * <p>
23 * Note that if you want to use different methods for high-level filtering than for low-level filtering (e.g., using max
24 * for the low-level, and min for the high-level), you have to filter twice. This could cause problems if you are using
25 * fractional filtering and there are negative values (see below).
26 * <h2>Filtering ratiometric data</h2>
27 * <p>
28 * For data that are normalized or ratios, it does not make sense to apply this method directly to those values. In that situation,
29 * you should filter the data based on the raw data, and then use a {@link RowNameFilter} to select the rows from the
30 * ratio data.
31 * <h2>Negative values</h2>
32 * <p>
33 * For microarray expression data based on the Affymetrix MAS4.0 protocol (and possibly others), negative values can
34 * occur. In some cases all the values can be negative. As these values are generally viewed as nonsensical, one might
35 * decide that data rows that are all negative should be filtered.
36 * <h2>Behavior at extremes</h2>
37 * <p>
38 * If you request removal/inclusion of 1.0 of the data, you might not get the result you expect because the filtering is
39 * inclusive.
40 * <hr>
41 * <p>
42 * Copyright (c) 2004 Columbia University
43 * <p>
44 *
45 * @author Paul Pavlidis
46 * @version $Id: RowLevelFilter.java,v 1.7 2004/07/27 03:18:58 pavlidis Exp $
47 */
48 public class RowLevelFilter extends AbstractLevelFilter {
49
50 private boolean removeAllNegative = false;
51
52 /***
53 * Use the minimum of the row as the criterion.
54 */
55 public static final int MIN = 1;
56
57 /***
58 * Use the maximum of the row as the criterion.
59 */
60 public static final int MAX = 2;
61
62 /***
63 * Use the median as the criterion.
64 */
65 public static final int MEDIAN = 3;
66
67 /***
68 * Use the mean as the criterion.
69 */
70 public static final int MEAN = 4;
71
72 /***
73 * Use the range as the criterion
74 */
75 public static final int RANGE = 5;
76
77 /***
78 * Use the coefficient of variation as the criterion
79 */
80 public static final int CV = 6;
81
82 private int method = MAX;
83
84 /***
85 * Choose the method that will be used for filtering. Default is 'MAX'. Those rows with the lowest values are removed
86 * during 'low' filtering.
87 *
88 * @param method one of the filtering method constants.
89 */
90 public void setMethod( int method ) {
91 if ( method != MIN && method != MAX && method != MEDIAN && method != MEAN
92 && method != RANGE && method != CV ) {
93 throw new IllegalArgumentException(
94 "Unknown filtering method requested" );
95 }
96 this.method = method;
97 }
98
99 /***
100 * Set the filter to remove all rows that have only negative values. This is applied BEFORE applying fraction-based
101 * criteria. In other words, if you request filtering 0.5 of the values, and 0.5 have all negative values, you will
102 * get 0.25 of the data back. Default = false.
103 *
104 * @param t boolean
105 */
106 public void setRemoveAllNegative( boolean t ) {
107 log.info( "Rows with all negative values will be "
108 + "removed PRIOR TO applying fraction-based criteria." );
109 removeAllNegative = t;
110 }
111
112 /***
113 * @param data
114 * @return
115 */
116 public NamedMatrix filter( NamedMatrix data ) {
117
118 if ( !( data instanceof DenseDoubleMatrix2DNamed ) ) {
119 throw new IllegalArgumentException(
120 "Only valid for DenseDoubleMatrix2DNamed" );
121 }
122
123 if ( lowCut == -Double.MAX_VALUE && highCut == Double.MAX_VALUE ) {
124 log.info( "No filtering requested" );
125 return data;
126 }
127
128 int numRows = data.rows();
129 int numCols = data.columns();
130
131 DoubleArrayList criteria = new DoubleArrayList( new double[numRows] );
132
133
134
135
136 DoubleArrayList rowAsList = new DoubleArrayList( new double[numCols] );
137 int numAllNeg = 0;
138 for ( int i = 0; i < numRows; i++ ) {
139 Double[] row = ( Double[] ) data.getRowObj( i );
140 int numNeg = 0;
141
142 for ( int j = 0; j < numCols; j++ ) {
143 double item = row[j].doubleValue();
144 rowAsList.set( j, item );
145 if ( item < 0.0 || Double.isNaN( item ) ) {
146 numNeg++;
147 }
148 }
149 if ( numNeg == numCols ) {
150 numAllNeg++;
151 }
152
153 switch ( method ) {
154 case MIN: {
155 criteria.set( i, Descriptive.min( rowAsList ) );
156 break;
157 }
158 case MAX: {
159 criteria.set( i, Descriptive.max( rowAsList ) );
160 break;
161 }
162 case MEAN: {
163 criteria.set( i, DescriptiveWithMissing.mean( rowAsList ) );
164 break;
165 }
166 case MEDIAN: {
167 criteria.set( i, DescriptiveWithMissing.median( rowAsList ) );
168 break;
169 }
170 case RANGE: {
171 criteria.set( i, Stats.range( rowAsList ) );
172 break;
173 }
174 case CV: {
175 criteria.set( i, Stats.cv( rowAsList ) );
176 break;
177 }
178 default: {
179 break;
180 }
181 }
182 }
183
184 DoubleArrayList sortedCriteria = criteria.copy();
185 sortedCriteria.sort();
186
187 double realLowCut = -Double.MAX_VALUE;
188 double realHighCut = Double.MAX_VALUE;
189 int consideredRows = numRows;
190 int startIndex = 0;
191 if ( removeAllNegative ) {
192 consideredRows = numRows - numAllNeg;
193 startIndex = numAllNeg;
194 }
195
196 if ( useHighAsFraction ) {
197 if ( !Stats.isValidFraction( highCut ) ) {
198 throw new IllegalStateException(
199 "High level cut must be a fraction between 0 and 1" );
200 }
201 int thresholdIndex = 0;
202 thresholdIndex = ( int ) Math
203 .ceil( consideredRows * ( 1.0 - highCut ) ) - 1;
204
205 thresholdIndex = Math.max( 0, thresholdIndex );
206 realHighCut = sortedCriteria.get( thresholdIndex );
207 } else {
208 realHighCut = highCut;
209 }
210
211 if ( useLowAsFraction ) {
212 if ( !Stats.isValidFraction( lowCut ) ) {
213 throw new IllegalStateException(
214 "Low level cut must be a fraction between 0 and 1" );
215 }
216
217 int thresholdIndex = 0;
218 thresholdIndex = startIndex
219 + ( int ) Math.floor( consideredRows * lowCut );
220 thresholdIndex = Math.min( numRows - 1, thresholdIndex );
221 realLowCut = sortedCriteria.get( thresholdIndex );
222 } else {
223 realLowCut = lowCut;
224 }
225
226
227
228 int kept = 0;
229 Vector rowsToKeep = new Vector();
230 Vector rowNames = new Vector();
231
232 for ( int i = 0; i < numRows; i++ ) {
233 if ( criteria.get( i ) >= realLowCut
234 && criteria.get( i ) <= realHighCut ) {
235 kept++;
236 rowsToKeep.add( data.getRowObj( i ) );
237 rowNames.add( data.getRowName( i ) );
238 }
239 }
240
241 DenseDoubleMatrix2DNamed returnval = new DenseDoubleMatrix2DNamed(
242 rowsToKeep.size(), numCols );
243 for ( int i = 0; i < kept; i++ ) {
244 Double[] row = ( Double[] ) rowsToKeep.get( i );
245 for ( int j = 0; j < numCols; j++ ) {
246 returnval.set( i, j, row[j].doubleValue() );
247 }
248 }
249 returnval.setColumnNames( data.getColNames() );
250 returnval.setRowNames( rowNames );
251
252 log.info( "There are " + kept + " rows left after filtering." );
253
254 return ( returnval );
255
256 }
257 }