1 package baseCode.dataFilter;
2
3 import java.util.Vector;
4
5 import baseCode.dataStructure.matrix.NamedMatrix;
6 import cern.colt.list.IntArrayList;
7
8 /***
9 * Remove rows from a matrix that are missing too many points.
10 * <p>
11 * Copyright (c) 2004 Columbia University
12 * </p>
13 *
14 * @author Paul Pavlidis
15 * @version $Id: RowMissingFilter.java,v 1.4 2004/07/27 03:18:58 pavlidis Exp $
16 */
17 public class RowMissingFilter extends AbstractFilter implements Filter {
18
19 private int minPresentCount = 5;
20 private static final int ABSOLUTEMINPRESENT = 1;
21 private double maxFractionRemoved = 0.0;
22 private double minPresentFraction = 1.0;
23 private boolean maxFractionRemovedIsSet = false;
24 private boolean minPresentFractionIsSet = false;
25 private boolean minPresentIsSet = false;
26
27 /***
28 * Set the minimum number of values that must be present in each row. The default value is 5. This is always
29 * overridden by a hard-coded value (currently 2) that must be present for a row to be kept; but this value is in
30 * turn overridden by the maxfractionRemoved.
31 *
32 * @param m int
33 */
34 public void setMinPresentCount( int m ) {
35 if ( m < 0 ) {
36 throw new IllegalArgumentException(
37 "Minimum present count must be > 0." );
38 }
39 minPresentIsSet = true;
40 minPresentCount = m;
41 }
42
43 /***
44 * @param k double the fraction of values to be removed.
45 */
46 public void setMinPresentFraction( double k ) {
47 if ( k < 0.0 || k > 1.0 )
48 throw new IllegalArgumentException(
49 "Min present fraction must be between 0 and 1, got " + k );
50 minPresentFractionIsSet = true;
51 minPresentFraction = k;
52 }
53
54 /***
55 * Set the maximum fraction of rows which will be removed from the data set. The default value is 0.3 Set it to 1.0
56 * to remove this restriction.
57 *
58 * @param f double
59 */
60 public void setMaxFractionRemoved( double f ) {
61 if ( f < 0.0 || f > 1.0 )
62 throw new IllegalArgumentException(
63 "Max fraction removed must be between 0 and 1, got " + f );
64 maxFractionRemovedIsSet = true;
65 maxFractionRemoved = f;
66 }
67
68 public NamedMatrix filter( NamedMatrix data ) {
69 Vector MTemp = new Vector();
70 Vector rowNames = new Vector();
71 int numRows = data.rows();
72 int numCols = data.columns();
73 IntArrayList present = new IntArrayList( numRows );
74
75 int kept = 0;
76
77 if ( minPresentFractionIsSet ) {
78 setMinPresentCount( ( int ) Math.ceil( minPresentFraction * numCols ) );
79 }
80
81 if ( minPresentCount > numCols ) {
82 throw new IllegalStateException( "Minimum present count is set to "
83 + minPresentCount + " but there are only " + numCols
84 + " columns in the matrix." );
85 }
86
87 if ( !minPresentIsSet ) {
88 log.info( "No filtering was requested" );
89 return data;
90 }
91
92
93 for ( int i = 0; i < numRows; i++ ) {
94 int missingCount = 0;
95 for ( int j = 0; j < numCols; j++ ) {
96 if ( !data.isMissing( i, j ) ) {
97 missingCount++;
98 }
99 }
100 present.add( missingCount );
101 if ( missingCount >= ABSOLUTEMINPRESENT
102 && missingCount >= minPresentCount ) {
103 kept++;
104 MTemp.add( data.getRowObj( i ) );
105 }
106 }
107
108
109 if ( kept < numRows * ( 1.0 - maxFractionRemoved )
110 && maxFractionRemoved != 0.0 ) {
111 IntArrayList sortedPresent = new IntArrayList( numRows );
112 sortedPresent = present.copy();
113 sortedPresent.sort();
114 sortedPresent.reverse();
115
116 log
117 .info( "There are "
118 + kept
119 + " rows that meet criterion of at least "
120 + minPresentCount
121 + " non-missing values, but that's too many given the max fraction of "
122 + maxFractionRemoved
123 + "; minpresent adjusted to "
124 + sortedPresent
125 .get( ( int ) ( numRows * ( maxFractionRemoved ) ) ) );
126 minPresentCount = sortedPresent
127 .get( ( int ) ( numRows * ( maxFractionRemoved ) ) );
128
129
130 kept = 0;
131 MTemp.clear();
132 for ( int i = 0; i < numRows; i++ ) {
133 if ( present.get( i ) >= minPresentCount
134 && present.get( i ) >= ABSOLUTEMINPRESENT ) {
135 kept++;
136 MTemp.add( data.getRowObj( i ) );
137 }
138 }
139
140 }
141
142 NamedMatrix returnval = getOutputMatrix( data, MTemp.size(), numCols );
143
144
145 for ( int i = 0; i < MTemp.size(); i++ ) {
146 for ( int j = 0; j < numCols; j++ ) {
147 returnval.set( i, j, ( ( Object[] ) MTemp.get( i ) )[j] );
148 }
149 }
150 returnval.setColumnNames( data.getColNames() );
151 returnval.setRowNames( rowNames );
152
153 log.info( "There are " + kept
154 + " rows after removing rows which have fewer than "
155 + minPresentCount + " values (or fewer than " + ABSOLUTEMINPRESENT
156 + ")" );
157
158 return ( returnval );
159
160 }
161 }