View Javadoc

1   package baseCode.bio.geneset;
2   
3   import hep.aida.IHistogram1D;
4   import hep.aida.ref.Histogram1D;
5   
6   import java.util.ArrayList;
7   import java.util.Collection;
8   import java.util.Collections;
9   import java.util.HashMap;
10  import java.util.HashSet;
11  import java.util.Iterator;
12  import java.util.LinkedHashMap;
13  import java.util.List;
14  import java.util.Map;
15  import java.util.Set;
16  import java.util.Vector;
17  
18  import baseCode.util.StatusViewer;
19  
20  /***
21   * Methods to 'clean' a set of geneSets - to remove redundancies, for example.
22   * 
23   * @author Paul Pavlidis
24   * @version $Id: GeneSetMapTools.java,v 1.5 2004/10/13 21:58:46 pavlidis Exp $
25   */
26  public class GeneSetMapTools {
27  
28     /***
29      * @param ga
30      * @param countEmpty if false, gene sets that have no members are not counted in the total.
31      * @return The average size of the gene sets.
32      */
33     public static double meanGeneSetSize( GeneAnnotations ga, boolean countEmpty ) {
34        double sum = 0.0;
35        int n = 0;
36  
37        Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
38  
39        for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
40              .hasNext(); ) {
41           String geneSet = ( String ) iter.next();
42  
43           Collection element;
44  
45           element = ( Collection ) geneSetToGeneMap.get( geneSet );
46  
47           if ( !countEmpty && element.size() == 0 ) {
48              continue;
49           }
50  
51           sum += element.size();
52           n++;
53        }
54  
55        return sum / n;
56  
57     }
58  
59     /***
60      * @param sum
61      * @param ga
62      * @param countEmpty if false ,genes that have no gene sets assigned to them are not counted in the total.
63      * @return The average number of gene sets per gene (per probe actually). This is a measure of gene set overlap. If
64      *         the value is 1, it means that each gene is (on average) in only one set. Large values indicate larger
65      *         amounts of overelap between gene sets.
66      */
67     public static double meanSetsPerGene( GeneAnnotations ga, boolean countEmpty ) {
68        double sum = 0.0;
69        int n = 0;
70  
71        Map probeToSetMap = ga.getProbeToGeneSetMap();
72  
73        for ( Iterator iter = probeToSetMap.keySet().iterator(); iter.hasNext(); ) {
74           String probe = ( String ) iter.next();
75  
76           Collection element;
77  
78           element = ( Collection ) probeToSetMap.get( probe );
79  
80           if ( !countEmpty && element.size() == 0 ) {
81              continue;
82           }
83  
84           sum += element.size();
85           n++;
86  
87        }
88  
89        return sum / n;
90  
91     }
92  
93     public static IHistogram1D geneSetSizeDistribution( GeneAnnotations ga,
94           int numBins, int minSize, int maxSize ) {
95        Histogram1D hist = new Histogram1D( "Distribution of gene set sizes",
96              numBins, minSize, maxSize );
97  
98        Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
99  
100       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
101             .hasNext(); ) {
102          String geneSet = ( String ) iter.next();
103 
104          Collection element;
105 
106          element = ( Collection ) geneSetToGeneMap.get( geneSet );
107          hist.fill( element.size() );
108       }
109 
110       return hist;
111 
112    }
113 
114    /***
115     * @param ga
116     * @param gon
117     * @param messenger
118     * @param aspect
119     */
120    public static void removeAspect( GeneAnnotations ga, GONames gon,
121          StatusViewer messenger, String aspect ) {
122       if ( !( aspect.equals( "molecular_function" )
123             || aspect.equals( "biological_process" ) || aspect
124             .equals( "cellular_component" ) ) ) {
125          throw new IllegalArgumentException( "Unknown aspect requested" );
126       }
127 
128       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
129 
130       Set removeUs = new HashSet();
131       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
132             .hasNext(); ) {
133          String geneSet = ( String ) iter.next();
134 
135          if ( gon.getAspectForId( geneSet ).equals( aspect ) ) {
136             removeUs.add( geneSet );
137          }
138 
139       }
140 
141       for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
142          String geneSet = ( String ) iter.next();
143          ga.removeClassFromMaps( geneSet );
144       }
145 
146       ga.resetSelectedSets();
147       ga.sortGeneSets();
148 
149       if ( messenger != null ) {
150          messenger.setStatus( "There are now " + ga.numGeneSets()
151                + " sets remaining after removing aspect " + aspect );
152       }
153 
154    }
155 
156    /***
157     * Remove gene sets that don't meet certain criteria.
158     * 
159     * @param ga
160     * @param messenger
161     * @param minClassSize
162     * @param maxClassSize
163     */
164    public static void removeBySize( GeneAnnotations ga, StatusViewer messenger,
165          int minClassSize, int maxClassSize ) {
166 
167       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
168 
169       Set removeUs = new HashSet();
170       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
171             .hasNext(); ) {
172          String geneSet = ( String ) iter.next();
173 
174          Set element;
175          element = ( Set ) geneSetToGeneMap.get( geneSet );
176          if ( element.size() < minClassSize || element.size() > maxClassSize ) {
177             removeUs.add( geneSet );
178          }
179 
180       }
181 
182       for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
183          String geneSet = ( String ) iter.next();
184          ga.removeClassFromMaps( geneSet );
185       }
186 
187       ga.resetSelectedSets();
188       ga.sortGeneSets();
189 
190       if ( messenger != null ) {
191          messenger.setStatus( "There are now " + ga.numGeneSets()
192                + " sets remaining after removing sets with excluded sizes." );
193       }
194 
195    }
196 
197    /***
198     * <p>
199     * Remove classes which are too similar to some other class. In addition, the user can select a penalty for large
200     * gene sets. Thus when two gene sets are found to be similar, the decision of which one to keep can be tuned based
201     * on the size penalty. We find it useful to penalize large gene sets so we tend to keep smaller ones (but not too
202     * small). Useful values of the penalty are above 1 (a value of 1 will result in the larger class always being
203     * retained).
204     * </p>
205     * <p>
206     * The amount of similarity to be tolerated is set by the parameter fractionSameThreshold, representing the fraction
207     * of genes in the smaller class which are also found in the larger class. Thus, setting this threshold to be 0.0
208     * means that no overlap is tolerated. Setting it to 1 means that classes will never be discarded.
209     * </p>
210     * 
211     * @param fractionSameThreshold A value between 0 and 1, indicating how similar a class must be before it gets
212     *        ditched.
213     * @param ga
214     * @param messenger For updating a log.
215     * @param maxClassSize Large class considered. (that doesn't mean they are removed)
216     * @param minClassSize Smallest class considered. (that doesn't mean they are removed)
217     * @param bigClassPenalty A value greater or equal to one, indicating the cost of retaining a larger class in favor
218     *        of a smaller one. The penalty is scaled with the difference in sizes of the two classes being considered,
219     *        so very large classes are more heavily penalized.
220     */
221    public static void ignoreSimilar( double fractionSameThreshold,
222          GeneAnnotations ga, StatusViewer messenger, int maxClassSize,
223          int minClassSize, double bigClassPenalty ) {
224 
225       Map classesToSimilarMap = new LinkedHashMap();
226       Set seenit = new HashSet();
227       Set deleteUs = new HashSet();
228 
229       if ( messenger != null ) {
230          messenger.setStatus( "...Highly (" + fractionSameThreshold * 100
231                + "%)  similar classes are being removed..." + ga.numGeneSets()
232                + " to start..." );
233       }
234 
235       // iterate over all the classes, starting from the smallest one.
236       //      List sortedList = ga.sortGeneSetsBySize();
237       List sortedList = new ArrayList( ga.getGeneSetToGeneMap().keySet() );
238       Collections.shuffle( sortedList );
239 
240       // OUTER - compare all classes to each other.
241       for ( Iterator iter = sortedList.iterator(); iter.hasNext(); ) {
242          String queryClassId = ( String ) iter.next();
243          Set queryClass = ( Set ) ga.getGeneSetToGeneMap().get( queryClassId );
244 
245          int querySize = queryClass.size();
246 
247          if ( seenit.contains( queryClassId ) || querySize > maxClassSize
248                || querySize < minClassSize ) {
249             continue;
250          }
251 
252          seenit.add( queryClassId );
253 
254          // INNER
255          for ( Iterator iterb = sortedList.iterator(); iterb.hasNext(); ) {
256             String targetClassId = ( String ) iterb.next();
257 
258             /// skip self comparisons and also symmetric comparisons.
259             if ( seenit.contains( targetClassId )
260                   || targetClassId.equals( queryClassId ) ) {
261                continue;
262             }
263 
264             Set targetClass = ( Set ) ga.getGeneSetToGeneMap().get(
265                   targetClassId );
266 
267             int targetSize = targetClass.size();
268             if ( targetSize < querySize || targetSize > maxClassSize
269                   || targetSize < minClassSize ) {
270                continue;
271             }
272 
273             double sizeScore;
274 
275             if ( areSimilarClasses( targetClass, queryClass,
276                   fractionSameThreshold, bigClassPenalty ) ) {
277 
278                sizeScore = ( ( double ) targetClass.size() / ( double ) queryClass
279                      .size() )
280                      / bigClassPenalty;
281 
282                if ( sizeScore < 1.0 ) { // delete the larget class.
283                   deleteUs.add( targetClassId );
284                   seenit.add( targetClassId );
285                } else {
286                   deleteUs.add( queryClassId );
287                   seenit.add( queryClassId );
288                   break; // query is no longer relevant, go to the next one.
289                }
290 
291                storeSimilarSets( classesToSimilarMap, queryClassId,
292                      targetClassId );
293             }
294 
295          } /* inner while */
296       }
297       /* end while ... */
298 
299       /* remove the ones we don't want to keep */
300       Iterator itrd = deleteUs.iterator();
301       while ( itrd.hasNext() ) {
302          String deleteMe = ( String ) itrd.next();
303          ga.removeClassFromMaps( deleteMe );
304       }
305 
306       ga.resetSelectedSets();
307       ga.sortGeneSets();
308 
309       if ( messenger != null ) {
310          messenger.setStatus( "There are now " + ga.numGeneSets()
311                + " classes represented on the chip (" + deleteUs.size()
312                + " were ignored)" );
313       }
314    }
315 
316    /* ignoreSimilar */
317 
318    /***
319     * @param classesToSimilarMap
320     * @param queryClassId
321     * @param targetClassId
322     */
323    private static void storeSimilarSets( Map classesToSimilarMap,
324          String queryClassId, String targetClassId ) {
325       if ( !classesToSimilarMap.containsKey( targetClassId ) ) {
326          classesToSimilarMap.put( targetClassId, new HashSet() );
327       }
328       if ( !classesToSimilarMap.containsKey( queryClassId ) ) {
329          classesToSimilarMap.put( queryClassId, new HashSet() );
330 
331       }
332       ( ( HashSet ) classesToSimilarMap.get( queryClassId ) )
333             .add( targetClassId );
334       ( ( HashSet ) classesToSimilarMap.get( targetClassId ) )
335             .add( queryClassId );
336    }
337 
338    /***
339     * Helper function for ignoreSimilar.
340     */
341    private static boolean areSimilarClasses( Set biggerClass, Set smallerClass,
342          double fractionSameThreshold, double bigClassPenalty ) {
343 
344       if ( biggerClass.size() < smallerClass.size() ) {
345          throw new IllegalArgumentException( "Invalid sizes" );
346       }
347 
348       /*
349        * Threshold of how many items from the smaller class must NOT be in the bigger class, before we consider the
350        * classes different.
351        */
352       int notInThresh = ( int ) Math.ceil( fractionSameThreshold
353             * smallerClass.size() );
354 
355       int notin = 0;
356 
357       int overlap = 0;
358       for ( Iterator iter = smallerClass.iterator(); iter.hasNext(); ) {
359 
360          String gene = ( String ) iter.next();
361          if ( !biggerClass.contains( gene ) ) {
362             notin++;
363          } else {
364             overlap++;
365          }
366          if ( notin > notInThresh ) {
367             // return false;
368          }
369       }
370 
371       if ( ( double ) overlap / ( double ) smallerClass.size() > fractionSameThreshold ) {
372          //         System.err.println( "Small class of size " + smallerClass.size()
373          //               + " too much contained (overlap = " + overlap
374          //               + ") in large class of size " + biggerClass.size() );
375          return true;
376       }
377 
378       /* return true is the count is high enough */
379       //   return true;
380       return false;
381 
382    }
383 
384    /***
385     * Identify classes which are absoluely identical to others. This isn't superfast, because it doesn't know which
386     * classes are actually relevant in the data.
387     */
388    public static void collapseGeneSets( GeneAnnotations geneData,
389          StatusViewer messenger ) {
390       Map setToGeneMap = geneData.getGeneSetToGeneMap();
391       Map classesToRedundantMap = geneData.geneSetToRedundantMap();
392       LinkedHashMap seenClasses = new LinkedHashMap();
393       LinkedHashMap sigs = new LinkedHashMap();
394 
395       HashMap seenit = new HashMap();
396 
397       if ( messenger != null ) {
398          messenger
399                .setStatus( "There are "
400                      + geneData.numGeneSets()
401                      + " classes represented on the chip (of any size). Redundant classes are being removed..." );
402       }
403 
404       // sort each arraylist in for each go and create a string that is a signature for this class.
405       int ignored = 0;
406       for ( Iterator iter = setToGeneMap.keySet().iterator(); iter.hasNext(); ) {
407          String classId = ( String ) iter.next();
408          Set classMembers = ( Set ) setToGeneMap.get( classId );
409 
410          if ( classMembers.contains( null ) ) {
411             classMembers.remove( null ); // FIXME why do we need to do this?
412             //          throw new IllegalStateException(classId + " contains null.");
413          }
414 
415          // @todo - hack : Skip classes that are huge. It's too slow
416          // otherwise. This is a total heuristic. Note that this
417          // doesn't mean the class won't get analyzed, it just
418          // means we don't bother looking for redundancies. Big
419          // classes are less likely to be identical to others,
420          // anyway. In tests, the range shown below has no effect
421          // on the results, but it _could_ matter.
422          if ( classMembers == null || classMembers.size() > 250
423                || classMembers.size() < 2 ) {
424             continue;
425          }
426 
427          Vector cls = new Vector( classMembers );
428 
429          if ( cls == null ) continue;
430 
431          Collections.sort( cls );
432          String signature = "";
433          seenit.clear();
434          Iterator classit = cls.iterator();
435          while ( classit.hasNext() ) {
436             String probeid = ( String ) classit.next();
437             if ( !seenit.containsKey( probeid ) ) {
438                signature = signature + "__" + probeid;
439                seenit.put( probeid, new Boolean( true ) );
440             }
441          }
442          sigs.put( classId, signature );
443       }
444 
445       // look at the signatures for repeats.
446       for ( Iterator iter = sigs.keySet().iterator(); iter.hasNext(); ) {
447          String classId = ( String ) iter.next();
448          String signature = ( String ) sigs.get( classId );
449 
450          // if the signature has already been seen, add it to the redundant
451          // list, and remove this class from the classToProbeMap.
452          if ( seenClasses.containsKey( signature ) ) {
453             if ( !classesToRedundantMap.containsKey( seenClasses
454                   .get( signature ) ) ) {
455                classesToRedundantMap.put( seenClasses.get( signature ),
456                      new ArrayList() );
457 
458             }
459             ( ( ArrayList ) classesToRedundantMap.get( seenClasses
460                   .get( signature ) ) ).add( classId );
461             ignored++;
462             geneData.removeClassFromMaps( classId );
463             //		System.err.println(classId + " is the same as an existing class, " + seenClasses.get(signature));
464          } else {
465             // add string to hash
466             seenClasses.put( signature, classId );
467          }
468       }
469 
470       geneData.resetSelectedSets();
471       geneData.sortGeneSets();
472 
473       if ( messenger != null ) {
474          messenger.setStatus( "There are now " + geneData.numGeneSets()
475                + " classes represented on the chip (" + ignored
476                + " were removed)" );
477       }
478    }
479 
480    /***
481     * @param classId
482     * @param classesToRedundantMap
483     * @return
484     */
485    public static ArrayList getRedundancies( String classId,
486          Map classesToRedundantMap ) {
487       if ( classesToRedundantMap != null
488             && classesToRedundantMap.containsKey( classId ) ) {
489          return ( ArrayList ) classesToRedundantMap.get( classId );
490       }
491       return null;
492 
493    }
494 
495    /***
496     * @param classId
497     * @param classesToSimilarMap
498     * @return
499     */
500    public static ArrayList getSimilarities( String classId,
501          Map classesToSimilarMap ) {
502       if ( classesToSimilarMap != null
503             && classesToSimilarMap.containsKey( classId ) ) {
504          return ( ArrayList ) classesToSimilarMap.get( classId );
505       }
506       return null;
507    }
508 
509    /***
510     * @param classId
511     * @param classesToRedundantMap
512     * @return
513     */
514    public String getRedundanciesString( String classId,
515          Map classesToRedundantMap ) {
516       if ( classesToRedundantMap != null
517             && classesToRedundantMap.containsKey( classId ) ) {
518          ArrayList redundant = ( ArrayList ) classesToRedundantMap
519                .get( classId );
520          Iterator it = redundant.iterator();
521          String returnValue = "";
522          while ( it.hasNext() ) {
523             returnValue = returnValue + ", " + it.next();
524          }
525          return returnValue;
526       }
527       return "";
528    }
529 
530 } // end of class