1 package baseCode.bio.geneset;
2
3 import hep.aida.IHistogram1D;
4 import hep.aida.ref.Histogram1D;
5
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.Collections;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.Iterator;
12 import java.util.LinkedHashMap;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Set;
16 import java.util.Vector;
17
18 import baseCode.util.StatusViewer;
19
20 /***
21 * Methods to 'clean' a set of geneSets - to remove redundancies, for example.
22 *
23 * @author Paul Pavlidis
24 * @version $Id: GeneSetMapTools.java,v 1.5 2004/10/13 21:58:46 pavlidis Exp $
25 */
26 public class GeneSetMapTools {
27
28 /***
29 * @param ga
30 * @param countEmpty if false, gene sets that have no members are not counted in the total.
31 * @return The average size of the gene sets.
32 */
33 public static double meanGeneSetSize( GeneAnnotations ga, boolean countEmpty ) {
34 double sum = 0.0;
35 int n = 0;
36
37 Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
38
39 for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
40 .hasNext(); ) {
41 String geneSet = ( String ) iter.next();
42
43 Collection element;
44
45 element = ( Collection ) geneSetToGeneMap.get( geneSet );
46
47 if ( !countEmpty && element.size() == 0 ) {
48 continue;
49 }
50
51 sum += element.size();
52 n++;
53 }
54
55 return sum / n;
56
57 }
58
59 /***
60 * @param sum
61 * @param ga
62 * @param countEmpty if false ,genes that have no gene sets assigned to them are not counted in the total.
63 * @return The average number of gene sets per gene (per probe actually). This is a measure of gene set overlap. If
64 * the value is 1, it means that each gene is (on average) in only one set. Large values indicate larger
65 * amounts of overelap between gene sets.
66 */
67 public static double meanSetsPerGene( GeneAnnotations ga, boolean countEmpty ) {
68 double sum = 0.0;
69 int n = 0;
70
71 Map probeToSetMap = ga.getProbeToGeneSetMap();
72
73 for ( Iterator iter = probeToSetMap.keySet().iterator(); iter.hasNext(); ) {
74 String probe = ( String ) iter.next();
75
76 Collection element;
77
78 element = ( Collection ) probeToSetMap.get( probe );
79
80 if ( !countEmpty && element.size() == 0 ) {
81 continue;
82 }
83
84 sum += element.size();
85 n++;
86
87 }
88
89 return sum / n;
90
91 }
92
93 public static IHistogram1D geneSetSizeDistribution( GeneAnnotations ga,
94 int numBins, int minSize, int maxSize ) {
95 Histogram1D hist = new Histogram1D( "Distribution of gene set sizes",
96 numBins, minSize, maxSize );
97
98 Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
99
100 for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
101 .hasNext(); ) {
102 String geneSet = ( String ) iter.next();
103
104 Collection element;
105
106 element = ( Collection ) geneSetToGeneMap.get( geneSet );
107 hist.fill( element.size() );
108 }
109
110 return hist;
111
112 }
113
114 /***
115 * @param ga
116 * @param gon
117 * @param messenger
118 * @param aspect
119 */
120 public static void removeAspect( GeneAnnotations ga, GONames gon,
121 StatusViewer messenger, String aspect ) {
122 if ( !( aspect.equals( "molecular_function" )
123 || aspect.equals( "biological_process" ) || aspect
124 .equals( "cellular_component" ) ) ) {
125 throw new IllegalArgumentException( "Unknown aspect requested" );
126 }
127
128 Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
129
130 Set removeUs = new HashSet();
131 for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
132 .hasNext(); ) {
133 String geneSet = ( String ) iter.next();
134
135 if ( gon.getAspectForId( geneSet ).equals( aspect ) ) {
136 removeUs.add( geneSet );
137 }
138
139 }
140
141 for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
142 String geneSet = ( String ) iter.next();
143 ga.removeClassFromMaps( geneSet );
144 }
145
146 ga.resetSelectedSets();
147 ga.sortGeneSets();
148
149 if ( messenger != null ) {
150 messenger.setStatus( "There are now " + ga.numGeneSets()
151 + " sets remaining after removing aspect " + aspect );
152 }
153
154 }
155
156 /***
157 * Remove gene sets that don't meet certain criteria.
158 *
159 * @param ga
160 * @param messenger
161 * @param minClassSize
162 * @param maxClassSize
163 */
164 public static void removeBySize( GeneAnnotations ga, StatusViewer messenger,
165 int minClassSize, int maxClassSize ) {
166
167 Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
168
169 Set removeUs = new HashSet();
170 for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
171 .hasNext(); ) {
172 String geneSet = ( String ) iter.next();
173
174 Set element;
175 element = ( Set ) geneSetToGeneMap.get( geneSet );
176 if ( element.size() < minClassSize || element.size() > maxClassSize ) {
177 removeUs.add( geneSet );
178 }
179
180 }
181
182 for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
183 String geneSet = ( String ) iter.next();
184 ga.removeClassFromMaps( geneSet );
185 }
186
187 ga.resetSelectedSets();
188 ga.sortGeneSets();
189
190 if ( messenger != null ) {
191 messenger.setStatus( "There are now " + ga.numGeneSets()
192 + " sets remaining after removing sets with excluded sizes." );
193 }
194
195 }
196
197 /***
198 * <p>
199 * Remove classes which are too similar to some other class. In addition, the user can select a penalty for large
200 * gene sets. Thus when two gene sets are found to be similar, the decision of which one to keep can be tuned based
201 * on the size penalty. We find it useful to penalize large gene sets so we tend to keep smaller ones (but not too
202 * small). Useful values of the penalty are above 1 (a value of 1 will result in the larger class always being
203 * retained).
204 * </p>
205 * <p>
206 * The amount of similarity to be tolerated is set by the parameter fractionSameThreshold, representing the fraction
207 * of genes in the smaller class which are also found in the larger class. Thus, setting this threshold to be 0.0
208 * means that no overlap is tolerated. Setting it to 1 means that classes will never be discarded.
209 * </p>
210 *
211 * @param fractionSameThreshold A value between 0 and 1, indicating how similar a class must be before it gets
212 * ditched.
213 * @param ga
214 * @param messenger For updating a log.
215 * @param maxClassSize Large class considered. (that doesn't mean they are removed)
216 * @param minClassSize Smallest class considered. (that doesn't mean they are removed)
217 * @param bigClassPenalty A value greater or equal to one, indicating the cost of retaining a larger class in favor
218 * of a smaller one. The penalty is scaled with the difference in sizes of the two classes being considered,
219 * so very large classes are more heavily penalized.
220 */
221 public static void ignoreSimilar( double fractionSameThreshold,
222 GeneAnnotations ga, StatusViewer messenger, int maxClassSize,
223 int minClassSize, double bigClassPenalty ) {
224
225 Map classesToSimilarMap = new LinkedHashMap();
226 Set seenit = new HashSet();
227 Set deleteUs = new HashSet();
228
229 if ( messenger != null ) {
230 messenger.setStatus( "...Highly (" + fractionSameThreshold * 100
231 + "%) similar classes are being removed..." + ga.numGeneSets()
232 + " to start..." );
233 }
234
235
236
237 List sortedList = new ArrayList( ga.getGeneSetToGeneMap().keySet() );
238 Collections.shuffle( sortedList );
239
240
241 for ( Iterator iter = sortedList.iterator(); iter.hasNext(); ) {
242 String queryClassId = ( String ) iter.next();
243 Set queryClass = ( Set ) ga.getGeneSetToGeneMap().get( queryClassId );
244
245 int querySize = queryClass.size();
246
247 if ( seenit.contains( queryClassId ) || querySize > maxClassSize
248 || querySize < minClassSize ) {
249 continue;
250 }
251
252 seenit.add( queryClassId );
253
254
255 for ( Iterator iterb = sortedList.iterator(); iterb.hasNext(); ) {
256 String targetClassId = ( String ) iterb.next();
257
258
259 if ( seenit.contains( targetClassId )
260 || targetClassId.equals( queryClassId ) ) {
261 continue;
262 }
263
264 Set targetClass = ( Set ) ga.getGeneSetToGeneMap().get(
265 targetClassId );
266
267 int targetSize = targetClass.size();
268 if ( targetSize < querySize || targetSize > maxClassSize
269 || targetSize < minClassSize ) {
270 continue;
271 }
272
273 double sizeScore;
274
275 if ( areSimilarClasses( targetClass, queryClass,
276 fractionSameThreshold, bigClassPenalty ) ) {
277
278 sizeScore = ( ( double ) targetClass.size() / ( double ) queryClass
279 .size() )
280 / bigClassPenalty;
281
282 if ( sizeScore < 1.0 ) {
283 deleteUs.add( targetClassId );
284 seenit.add( targetClassId );
285 } else {
286 deleteUs.add( queryClassId );
287 seenit.add( queryClassId );
288 break;
289 }
290
291 storeSimilarSets( classesToSimilarMap, queryClassId,
292 targetClassId );
293 }
294
295 }
296 }
297
298
299
300 Iterator itrd = deleteUs.iterator();
301 while ( itrd.hasNext() ) {
302 String deleteMe = ( String ) itrd.next();
303 ga.removeClassFromMaps( deleteMe );
304 }
305
306 ga.resetSelectedSets();
307 ga.sortGeneSets();
308
309 if ( messenger != null ) {
310 messenger.setStatus( "There are now " + ga.numGeneSets()
311 + " classes represented on the chip (" + deleteUs.size()
312 + " were ignored)" );
313 }
314 }
315
316
317
318 /***
319 * @param classesToSimilarMap
320 * @param queryClassId
321 * @param targetClassId
322 */
323 private static void storeSimilarSets( Map classesToSimilarMap,
324 String queryClassId, String targetClassId ) {
325 if ( !classesToSimilarMap.containsKey( targetClassId ) ) {
326 classesToSimilarMap.put( targetClassId, new HashSet() );
327 }
328 if ( !classesToSimilarMap.containsKey( queryClassId ) ) {
329 classesToSimilarMap.put( queryClassId, new HashSet() );
330
331 }
332 ( ( HashSet ) classesToSimilarMap.get( queryClassId ) )
333 .add( targetClassId );
334 ( ( HashSet ) classesToSimilarMap.get( targetClassId ) )
335 .add( queryClassId );
336 }
337
338 /***
339 * Helper function for ignoreSimilar.
340 */
341 private static boolean areSimilarClasses( Set biggerClass, Set smallerClass,
342 double fractionSameThreshold, double bigClassPenalty ) {
343
344 if ( biggerClass.size() < smallerClass.size() ) {
345 throw new IllegalArgumentException( "Invalid sizes" );
346 }
347
348
349
350
351
352 int notInThresh = ( int ) Math.ceil( fractionSameThreshold
353 * smallerClass.size() );
354
355 int notin = 0;
356
357 int overlap = 0;
358 for ( Iterator iter = smallerClass.iterator(); iter.hasNext(); ) {
359
360 String gene = ( String ) iter.next();
361 if ( !biggerClass.contains( gene ) ) {
362 notin++;
363 } else {
364 overlap++;
365 }
366 if ( notin > notInThresh ) {
367
368 }
369 }
370
371 if ( ( double ) overlap / ( double ) smallerClass.size() > fractionSameThreshold ) {
372
373
374
375 return true;
376 }
377
378
379
380 return false;
381
382 }
383
384 /***
385 * Identify classes which are absoluely identical to others. This isn't superfast, because it doesn't know which
386 * classes are actually relevant in the data.
387 */
388 public static void collapseGeneSets( GeneAnnotations geneData,
389 StatusViewer messenger ) {
390 Map setToGeneMap = geneData.getGeneSetToGeneMap();
391 Map classesToRedundantMap = geneData.geneSetToRedundantMap();
392 LinkedHashMap seenClasses = new LinkedHashMap();
393 LinkedHashMap sigs = new LinkedHashMap();
394
395 HashMap seenit = new HashMap();
396
397 if ( messenger != null ) {
398 messenger
399 .setStatus( "There are "
400 + geneData.numGeneSets()
401 + " classes represented on the chip (of any size). Redundant classes are being removed..." );
402 }
403
404
405 int ignored = 0;
406 for ( Iterator iter = setToGeneMap.keySet().iterator(); iter.hasNext(); ) {
407 String classId = ( String ) iter.next();
408 Set classMembers = ( Set ) setToGeneMap.get( classId );
409
410 if ( classMembers.contains( null ) ) {
411 classMembers.remove( null );
412
413 }
414
415
416
417
418
419
420
421
422 if ( classMembers == null || classMembers.size() > 250
423 || classMembers.size() < 2 ) {
424 continue;
425 }
426
427 Vector cls = new Vector( classMembers );
428
429 if ( cls == null ) continue;
430
431 Collections.sort( cls );
432 String signature = "";
433 seenit.clear();
434 Iterator classit = cls.iterator();
435 while ( classit.hasNext() ) {
436 String probeid = ( String ) classit.next();
437 if ( !seenit.containsKey( probeid ) ) {
438 signature = signature + "__" + probeid;
439 seenit.put( probeid, new Boolean( true ) );
440 }
441 }
442 sigs.put( classId, signature );
443 }
444
445
446 for ( Iterator iter = sigs.keySet().iterator(); iter.hasNext(); ) {
447 String classId = ( String ) iter.next();
448 String signature = ( String ) sigs.get( classId );
449
450
451
452 if ( seenClasses.containsKey( signature ) ) {
453 if ( !classesToRedundantMap.containsKey( seenClasses
454 .get( signature ) ) ) {
455 classesToRedundantMap.put( seenClasses.get( signature ),
456 new ArrayList() );
457
458 }
459 ( ( ArrayList ) classesToRedundantMap.get( seenClasses
460 .get( signature ) ) ).add( classId );
461 ignored++;
462 geneData.removeClassFromMaps( classId );
463
464 } else {
465
466 seenClasses.put( signature, classId );
467 }
468 }
469
470 geneData.resetSelectedSets();
471 geneData.sortGeneSets();
472
473 if ( messenger != null ) {
474 messenger.setStatus( "There are now " + geneData.numGeneSets()
475 + " classes represented on the chip (" + ignored
476 + " were removed)" );
477 }
478 }
479
480 /***
481 * @param classId
482 * @param classesToRedundantMap
483 * @return
484 */
485 public static ArrayList getRedundancies( String classId,
486 Map classesToRedundantMap ) {
487 if ( classesToRedundantMap != null
488 && classesToRedundantMap.containsKey( classId ) ) {
489 return ( ArrayList ) classesToRedundantMap.get( classId );
490 }
491 return null;
492
493 }
494
495 /***
496 * @param classId
497 * @param classesToSimilarMap
498 * @return
499 */
500 public static ArrayList getSimilarities( String classId,
501 Map classesToSimilarMap ) {
502 if ( classesToSimilarMap != null
503 && classesToSimilarMap.containsKey( classId ) ) {
504 return ( ArrayList ) classesToSimilarMap.get( classId );
505 }
506 return null;
507 }
508
509 /***
510 * @param classId
511 * @param classesToRedundantMap
512 * @return
513 */
514 public String getRedundanciesString( String classId,
515 Map classesToRedundantMap ) {
516 if ( classesToRedundantMap != null
517 && classesToRedundantMap.containsKey( classId ) ) {
518 ArrayList redundant = ( ArrayList ) classesToRedundantMap
519 .get( classId );
520 Iterator it = redundant.iterator();
521 String returnValue = "";
522 while ( it.hasNext() ) {
523 returnValue = returnValue + ", " + it.next();
524 }
525 return returnValue;
526 }
527 return "";
528 }
529
530 }