View Javadoc

1   package baseCode.dataFilter;
2   
3   import java.util.Vector;
4   
5   import baseCode.dataStructure.matrix.DenseDoubleMatrix2DNamed;
6   import baseCode.dataStructure.matrix.NamedMatrix;
7   import baseCode.math.DescriptiveWithMissing;
8   import baseCode.math.Stats;
9   import cern.colt.list.DoubleArrayList;
10  import cern.jet.stat.Descriptive;
11  
12  /***
13   * Remove rows from a matrix based on some row-based statistic. Rows with values too high and/or too low can be removed.
14   * Thresholds are inclusive (i.e., values must be at least as high as the set threshold to be included. A number of
15   * statistics are available. In addition, this filter can remove rows that have all negative data values.
16   * <p>
17   * There are a number of decisions/caveats to consider:
18   * <h2>Cutpoint determination</h2>
19   * <p>
20   * There are multiple ways of determining cutpoints. Some possibilities are the maximum value, the minimum value, the
21   * mean value, or the median value. The range and coefficient of variation are also included.
22   * <p>
23   * Note that if you want to use different methods for high-level filtering than for low-level filtering (e.g., using max
24   * for the low-level, and min for the high-level, you have to filter twice. This could cause problems if you are using
25   * fractional filtering and there are negative values (see below).
26   * <h2>Filtering ratiometric data</h2>
27   * <p>
28   * For data that are normalized or ratios, it does not make sense to use this method on the raw data. In that situation,
29   * you should filter the data based on the raw data, and then use a {@link RowNameFilter}to select the rows from the
30   * ratio data.
31   * <h2>Negative values</h2>
32   * <p>
33   * For microarray expression data based on the Affymetrix MAS4.0 protocol (and possibly others), negative values can
34   * occur. In some cases all the values can be negative. As these values are generally viewed as nonsensical, one might
35   * decide that data rows that are all negative should be filtered.
36   * <h2>Behavior at extremes</h2>
37   * <p>
38   * If you request removal/inclusion of 1.0 of the data, you might not get the result you expect because the filtering is
39   * inclusive.
40   * <hr>
41   * <p>
42   * Copyright (c) 2004 Columbia University
43   * <p>
44   * 
45   * @author Paul Pavlidis
46   * @version $Id: RowLevelFilter.java,v 1.7 2004/07/27 03:18:58 pavlidis Exp $
47   */
48  public class RowLevelFilter extends AbstractLevelFilter {
49  
50     private boolean removeAllNegative = false;
51  
52     /***
53      * Use the minimum of the row as the criterion.
54      */
55     public static final int MIN = 1;
56  
57     /***
58      * Use the maximum of the row as the criterion.
59      */
60     public static final int MAX = 2;
61  
62     /***
63      * Use the median as the criterion.
64      */
65     public static final int MEDIAN = 3;
66  
67     /***
68      * Use the mean as the criterion.
69      */
70     public static final int MEAN = 4;
71  
72     /***
73      * Use the range as the criterion
74      */
75     public static final int RANGE = 5;
76  
77     /***
78      * Use the coefficient of variation as the criterion
79      */
80     public static final int CV = 6;
81  
82     private int method = MAX;
83  
84     /***
85      * Choose the method that will be used for filtering. Default is 'MAX'. Those rows with the lowest values are removed
86      * during 'low' filtering.
87      * 
88      * @param method one of the filtering method constants.
89      */
90     public void setMethod( int method ) {
91        if ( method != MIN && method != MAX && method != MEDIAN && method != MEAN
92              && method != RANGE && method != CV ) {
93           throw new IllegalArgumentException(
94                 "Unknown filtering method requested" );
95        }
96        this.method = method;
97     }
98  
99     /***
100     * Set the filter to remove all rows that have only negative values. This is applied BEFORE applying fraction-based
101     * criteria. In other words, if you request filtering 0.5 of the values, and 0.5 have all negative values, you will
102     * get 0.25 of the data back. Default = false.
103     * 
104     * @param t boolean
105     */
106    public void setRemoveAllNegative( boolean t ) {
107       log.info( "Rows with all negative values will be "
108             + "removed PRIOR TO applying fraction-based criteria." );
109       removeAllNegative = t;
110    }
111 
112    /***
113     * @param data
114     * @return
115     */
116    public NamedMatrix filter( NamedMatrix data ) {
117 
118       if ( !( data instanceof DenseDoubleMatrix2DNamed ) ) {
119          throw new IllegalArgumentException(
120                "Only valid for DenseDoubleMatrix2DNamed" );
121       }
122 
123       if ( lowCut == -Double.MAX_VALUE && highCut == Double.MAX_VALUE ) {
124          log.info( "No filtering requested" );
125          return data;
126       }
127 
128       int numRows = data.rows();
129       int numCols = data.columns();
130 
131       DoubleArrayList criteria = new DoubleArrayList( new double[numRows] );
132 
133       /*
134        * compute criteria.
135        */
136       DoubleArrayList rowAsList = new DoubleArrayList( new double[numCols] );
137       int numAllNeg = 0;
138       for ( int i = 0; i < numRows; i++ ) {
139          Double[] row = ( Double[] ) data.getRowObj( i );
140          int numNeg = 0;
141          /* stupid, copy into a DoubleArrayList so we can do stats */
142          for ( int j = 0; j < numCols; j++ ) {
143             double item = row[j].doubleValue();
144             rowAsList.set( j, item );
145             if ( item < 0.0 || Double.isNaN( item ) ) {
146                numNeg++;
147             }
148          }
149          if ( numNeg == numCols ) {
150             numAllNeg++;
151          }
152 
153          switch ( method ) {
154             case MIN: {
155                criteria.set( i, Descriptive.min( rowAsList ) );
156                break;
157             }
158             case MAX: {
159                criteria.set( i, Descriptive.max( rowAsList ) );
160                break;
161             }
162             case MEAN: {
163                criteria.set( i, DescriptiveWithMissing.mean( rowAsList ) );
164                break;
165             }
166             case MEDIAN: {
167                criteria.set( i, DescriptiveWithMissing.median( rowAsList ) );
168                break;
169             }
170             case RANGE: {
171                criteria.set( i, Stats.range( rowAsList ) );
172                break;
173             }
174             case CV: {
175                criteria.set( i, Stats.cv( rowAsList ) );
176                break;
177             }
178             default: {
179                break;
180             }
181          }
182       }
183 
184       DoubleArrayList sortedCriteria = criteria.copy();
185       sortedCriteria.sort();
186 
187       double realLowCut = -Double.MAX_VALUE;
188       double realHighCut = Double.MAX_VALUE;
189       int consideredRows = numRows;
190       int startIndex = 0;
191       if ( removeAllNegative ) {
192          consideredRows = numRows - numAllNeg;
193          startIndex = numAllNeg;
194       }
195 
196       if ( useHighAsFraction ) {
197          if ( !Stats.isValidFraction( highCut ) ) {
198             throw new IllegalStateException(
199                   "High level cut must be a fraction between 0 and 1" );
200          }
201          int thresholdIndex = 0;
202          thresholdIndex = ( int ) Math
203                .ceil( consideredRows * ( 1.0 - highCut ) ) - 1;
204 
205          thresholdIndex = Math.max( 0, thresholdIndex );
206          realHighCut = sortedCriteria.get( thresholdIndex );
207       } else {
208          realHighCut = highCut;
209       }
210 
211       if ( useLowAsFraction ) {
212          if ( !Stats.isValidFraction( lowCut ) ) {
213             throw new IllegalStateException(
214                   "Low level cut must be a fraction between 0 and 1" );
215          }
216 
217          int thresholdIndex = 0;
218          thresholdIndex = startIndex
219                + ( int ) Math.floor( consideredRows * lowCut );
220          thresholdIndex = Math.min( numRows - 1, thresholdIndex );
221          realLowCut = sortedCriteria.get( thresholdIndex );
222       } else {
223          realLowCut = lowCut;
224       }
225 
226       // go back over the data now using the cutpoints. This is not optimally
227       // efficient.
228       int kept = 0;
229       Vector rowsToKeep = new Vector();
230       Vector rowNames = new Vector();
231 
232       for ( int i = 0; i < numRows; i++ ) {
233          if ( criteria.get( i ) >= realLowCut
234                && criteria.get( i ) <= realHighCut ) {
235             kept++;
236             rowsToKeep.add( data.getRowObj( i ) );
237             rowNames.add( data.getRowName( i ) );
238          }
239       }
240 
241       DenseDoubleMatrix2DNamed returnval = new DenseDoubleMatrix2DNamed(
242             rowsToKeep.size(), numCols );
243       for ( int i = 0; i < kept; i++ ) {
244          Double[] row = ( Double[] ) rowsToKeep.get( i );
245          for ( int j = 0; j < numCols; j++ ) {
246             returnval.set( i, j, row[j].doubleValue() );
247          }
248       }
249       returnval.setColumnNames( data.getColNames() );
250       returnval.setRowNames( rowNames );
251 
252       log.info( "There are " + kept + " rows left after filtering." );
253 
254       return ( returnval );
255 
256    }
257 }