View Javadoc

1   package baseCode.dataFilter;
2   
3   import java.util.Vector;
4   
5   import baseCode.dataStructure.matrix.NamedMatrix;
6   import cern.colt.list.IntArrayList;
7   
8   /***
9    * Remove rows from a matrix that are missing too many points.
10   * <p>
11   * Copyright (c) 2004 Columbia University
12   * </p>
13   * 
14   * @author Paul Pavlidis
15   * @version $Id: RowMissingFilter.java,v 1.4 2004/07/27 03:18:58 pavlidis Exp $
16   */
17  public class RowMissingFilter extends AbstractFilter implements Filter {
18  
19     private int minPresentCount = 5;
20     private static final int ABSOLUTEMINPRESENT = 1;
21     private double maxFractionRemoved = 0.0;
22     private double minPresentFraction = 1.0;
23     private boolean maxFractionRemovedIsSet = false;
24     private boolean minPresentFractionIsSet = false;
25     private boolean minPresentIsSet = false;
26  
27     /***
28      * Set the minimum number of values that must be present in each row. The default value is 5. This is always
29      * overridden by a hard-coded value (currently 2) that must be present for a row to be kept; but this value is in
30      * turn overridden by the maxfractionRemoved.
31      * 
32      * @param m int
33      */
34     public void setMinPresentCount( int m ) {
35        if ( m < 0 ) {
36           throw new IllegalArgumentException(
37                 "Minimum present count must be > 0." );
38        }
39        minPresentIsSet = true;
40        minPresentCount = m;
41     }
42  
43     /***
44      * @param k double the fraction of values to be removed.
45      */
46     public void setMinPresentFraction( double k ) {
47        if ( k < 0.0 || k > 1.0 )
48              throw new IllegalArgumentException(
49                    "Min present fraction must be between 0 and 1, got " + k );
50        minPresentFractionIsSet = true;
51        minPresentFraction = k;
52     }
53  
54     /***
55      * Set the maximum fraction of rows which will be removed from the data set. The default value is 0.3 Set it to 1.0
56      * to remove this restriction.
57      * 
58      * @param f double
59      */
60     public void setMaxFractionRemoved( double f ) {
61        if ( f < 0.0 || f > 1.0 )
62              throw new IllegalArgumentException(
63                    "Max fraction removed must be between 0 and 1, got " + f );
64        maxFractionRemovedIsSet = true;
65        maxFractionRemoved = f;
66     }
67  
68     public NamedMatrix filter( NamedMatrix data ) {
69        Vector MTemp = new Vector();
70        Vector rowNames = new Vector();
71        int numRows = data.rows();
72        int numCols = data.columns();
73        IntArrayList present = new IntArrayList( numRows );
74  
75        int kept = 0;
76  
77        if ( minPresentFractionIsSet ) {
78           setMinPresentCount( ( int ) Math.ceil( minPresentFraction * numCols ) );
79        }
80  
81        if ( minPresentCount > numCols ) {
82           throw new IllegalStateException( "Minimum present count is set to "
83                 + minPresentCount + " but there are only " + numCols
84                 + " columns in the matrix." );
85        }
86  
87        if ( !minPresentIsSet ) {
88           log.info( "No filtering was requested" );
89           return data;
90        }
91  
92        /* first pass - determine how many missing values there are per row */
93        for ( int i = 0; i < numRows; i++ ) {
94           int missingCount = 0;
95           for ( int j = 0; j < numCols; j++ ) {
96              if ( !data.isMissing( i, j ) ) {
97                 missingCount++;
98              }
99           }
100          present.add( missingCount );
101          if ( missingCount >= ABSOLUTEMINPRESENT
102                && missingCount >= minPresentCount ) {
103             kept++;
104             MTemp.add( data.getRowObj( i ) );
105          }
106       }
107 
108       /* decide whether we need to invoke the 'too many removed' clause */
109       if ( kept < numRows * ( 1.0 - maxFractionRemoved )
110             && maxFractionRemoved != 0.0 ) {
111          IntArrayList sortedPresent = new IntArrayList( numRows );
112          sortedPresent = present.copy();
113          sortedPresent.sort();
114          sortedPresent.reverse();
115 
116          log
117                .info( "There are "
118                      + kept
119                      + " rows that meet criterion of at least "
120                      + minPresentCount
121                      + " non-missing values, but that's too many given the max fraction of "
122                      + maxFractionRemoved
123                      + "; minpresent adjusted to "
124                      + sortedPresent
125                            .get( ( int ) ( numRows * ( maxFractionRemoved ) ) ) );
126          minPresentCount = sortedPresent
127                .get( ( int ) ( numRows * ( maxFractionRemoved ) ) );
128 
129          // Do another pass to add rows we missed before.
130          kept = 0;
131          MTemp.clear();
132          for ( int i = 0; i < numRows; i++ ) {
133             if ( present.get( i ) >= minPresentCount
134                   && present.get( i ) >= ABSOLUTEMINPRESENT ) {
135                kept++;
136                MTemp.add( data.getRowObj( i ) );
137             }
138          }
139 
140       }
141 
142       NamedMatrix returnval = getOutputMatrix( data, MTemp.size(), numCols );
143 
144       // Finally fill in the return value.
145       for ( int i = 0; i < MTemp.size(); i++ ) {
146          for ( int j = 0; j < numCols; j++ ) {
147             returnval.set( i, j, ( ( Object[] ) MTemp.get( i ) )[j] );
148          }
149       }
150       returnval.setColumnNames( data.getColNames() );
151       returnval.setRowNames( rowNames );
152 
153       log.info( "There are " + kept
154             + " rows after removing rows which have fewer than "
155             + minPresentCount + " values (or fewer than " + ABSOLUTEMINPRESENT
156             + ")" );
157 
158       return ( returnval );
159 
160    }
161 }