libStatGen Software  1
SamFilter.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 
20 
21 #include "SamFilter.h"
22 
23 #include "SamQuerySeqWithRefHelper.h"
24 #include "BaseUtilities.h"
25 #include "SamFlag.h"
26 
28  GenomeSequence& refSequence,
29  double mismatchThreshold)
30 {
31  // Read & clip from the left & right.
32  SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
33  SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
34 
35  SamSingleBaseMatchInfo baseMatchInfo;
36 
37  int32_t readLength = record.getReadLength();
38  // Init last front clip to be prior to the lastFront index (0).
39  const int32_t initialLastFrontClipPos = -1;
40  int32_t lastFrontClipPos = initialLastFrontClipPos;
41  // Init first back clip to be past the last index (readLength).
42  int32_t firstBackClipPos = readLength;
43 
44  bool fromFrontComplete = false;
45  bool fromBackComplete = false;
46  int32_t numBasesFromFront = 0;
47  int32_t numBasesFromBack = 0;
48  int32_t numMismatchFromFront = 0;
49  int32_t numMismatchFromBack = 0;
50 
51  //////////////////////////////////////////////////////////
52  // Determining the clip positions.
53  while(!fromFrontComplete || !fromBackComplete)
54  {
55  // Read from the front (left to right) of the read until
56  // more have been read from that direction than the opposite direction.
57  while(!fromFrontComplete &&
58  ((numBasesFromFront <= numBasesFromBack) ||
59  (fromBackComplete)))
60  {
61  if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
62  {
63  // Nothing more to read in this direction.
64  fromFrontComplete = true;
65  break;
66  }
67  // Got a read. Check to see if it is to or past the last clip.
68  if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
69  {
70  // This base is past where we are clipping, so we
71  // are done reading in this direction.
72  fromFrontComplete = true;
73  break;
74  }
75  // This is an actual base read from the left to the
76  // right, so up the counter and determine if it was a mismatch.
77  ++numBasesFromFront;
78 
79  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
80  {
81  // Mismatch
82  ++numMismatchFromFront;
83  // Check to see if it is over the threshold.
84  double mismatchPercent =
85  (double)numMismatchFromFront / numBasesFromFront;
86  if(mismatchPercent > mismatchThreshold)
87  {
88  // Need to clip.
89  lastFrontClipPos = baseMatchInfo.getQueryIndex();
90  // Reset the counters.
91  numBasesFromFront = 0;
92  numMismatchFromFront = 0;
93  }
94  }
95  }
96 
97  // Now, read from right to left until more have been read
98  // from the back than from the front.
99  while(!fromBackComplete &&
100  ((numBasesFromBack <= numBasesFromFront) ||
101  (fromFrontComplete)))
102  {
103  if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
104  {
105  // Nothing more to read in this direction.
106  fromBackComplete = true;
107  break;
108  }
109  // Got a read. Check to see if it is to or past the first clip.
110  if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
111  {
112  // This base is past where we are clipping, so we
113  // are done reading in this direction.
114  fromBackComplete = true;
115  break;
116  }
117  // This is an actual base read from the right to the
118  // left, so up the counter and determine if it was a mismatch.
119  ++numBasesFromBack;
120 
121  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
122  {
123  // Mismatch
124  ++numMismatchFromBack;
125  // Check to see if it is over the threshold.
126  double mismatchPercent =
127  (double)numMismatchFromBack / numBasesFromBack;
128  if(mismatchPercent > mismatchThreshold)
129  {
130  // Need to clip.
131  firstBackClipPos = baseMatchInfo.getQueryIndex();
132  // Reset the counters.
133  numBasesFromBack = 0;
134  numMismatchFromBack = 0;
135  }
136  }
137  }
138  }
139 
140  //////////////////////////////////////////////////////////
141  // Done determining the clip positions, so clip.
142  // To determine the number of clips from the front, add 1 to the
143  // lastFrontClipPos since the index starts at 0.
144  // To determine the number of clips from the back, subtract the
145  // firstBackClipPos from the readLength.
146  // Example:
147  // Pos: 012345
148  // Read: AAAAAA
149  // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
150  return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
151 }
152 
153 
154 // Soft clip the record from the front and/or the back.
156  int32_t numFrontClips,
157  int32_t numBackClips)
158 {
159  //////////////////////////////////////////////////////////
160  Cigar* cigar = record.getCigarInfo();
161  FilterStatus status = NONE;
162  int32_t startPos = record.get0BasedPosition();
163  CigarRoller updatedCigar;
164 
165  status = softClip(*cigar, numFrontClips, numBackClips,
166  startPos, updatedCigar);
167 
168  if(status == FILTERED)
169  {
170  /////////////////////////////
171  // The entire read is clipped, so rather than clipping it,
172  // filter it out.
173  filterRead(record);
174  return(FILTERED);
175  }
176  else if(status == CLIPPED)
177  {
178  // Part of the read was clipped, and now that we have
179  // an updated cigar, update the read.
180  record.setCigar(updatedCigar);
181 
182  // Update the starting position.
183  record.set0BasedPosition(startPos);
184  }
185  return(status);
186 }
187 
188 
189 // Soft clip the cigar from the front and/or the back, writing the value
190 // into the new cigar.
192  int32_t numFrontClips,
193  int32_t numBackClips,
194  int32_t& startPos,
195  CigarRoller& updatedCigar)
196 {
197  int32_t readLength = oldCigar.getExpectedQueryBaseCount();
198  int32_t endClipPos = readLength - numBackClips;
199  FilterStatus status = NONE;
200 
201  if((numFrontClips != 0) || (numBackClips != 0))
202  {
203  // Clipping from front and/or from the back.
204 
205  // Check to see if the entire read was clipped.
206  int32_t totalClips = numFrontClips + numBackClips;
207  if(totalClips >= readLength)
208  {
209  /////////////////////////////
210  // The entire read is clipped, so rather than clipping it,
211  // filter it out.
212  return(FILTERED);
213  }
214 
215  // Part of the read was clipped.
216  status = CLIPPED;
217 
218  // Loop through, creating an updated cigar.
219  int origCigarOpIndex = 0;
220 
221  // Track how many read positions are covered up to this
222  // point by the cigar to determine up to up to what
223  // point in the cigar is affected by this clipping.
224  int32_t numPositions = 0;
225 
226  // Track if any non-clips are in the new cigar.
227  bool onlyClips = true;
228 
229  const Cigar::CigarOperator* op = NULL;
230 
231  //////////////////
232  // Clip from front
233  while((origCigarOpIndex < oldCigar.size()) &&
234  (numPositions < numFrontClips))
235  {
236  op = &(oldCigar.getOperator(origCigarOpIndex));
237  switch(op->operation)
238  {
239  case Cigar::hardClip:
240  // Keep this operation as the new clips do not
241  // affect other clips.
242  updatedCigar += *op;
243  break;
244  case Cigar::del:
245  case Cigar::skip:
246  // Skip and delete are going to be dropped, and
247  // are not in the read, so the read index doesn't
248  // need to be updated
249  break;
250  case Cigar::insert:
251  case Cigar::match:
252  case Cigar::mismatch:
253  case Cigar::softClip:
254  // Update the read index as these types
255  // are found in the read.
256  numPositions += op->count;
257  break;
258  case Cigar::none:
259  default:
260  // Nothing to do for none.
261  break;
262  };
263  ++origCigarOpIndex;
264  }
265 
266  // If bases were clipped from the front, add the clip and
267  // any partial cigar operation as necessary.
268  if(numFrontClips != 0)
269  {
270  // Add the softclip to the front of the read.
271  updatedCigar.Add(Cigar::softClip, numFrontClips);
272 
273  // Add the rest of the last Cigar operation if
274  // it is not entirely clipped.
275  int32_t newCount = numPositions - numFrontClips;
276  if(newCount > 0)
277  {
278  // Before adding it, check to see if the same
279  // operation is clipped from the end.
280  // numPositions greater than the endClipPos
281  // means that it is equal or past that position,
282  // so shorten the number of positions.
283  if(numPositions > endClipPos)
284  {
285  newCount -= (numPositions - endClipPos);
286  }
287  if(newCount > 0)
288  {
289  updatedCigar.Add(op->operation, newCount);
290  if(!Cigar::isClip(op->operation))
291  {
292  onlyClips = false;
293  }
294  }
295  }
296  }
297 
298  // Add operations until the point of the end clip is reached.
299  // For example...
300  // 2M1D3M = MMDMMM readLength = 5
301  // readIndex: 01 234
302  // at cigarOpIndex 0 (2M), numPositions = 2.
303  // at cigarOpIndex 1 (1D), numPositions = 2.
304  // at cigarOpIndex 2 (3M), numPositions = 5.
305  // if endClipPos = 2, we still want to consume the 1D, so
306  // need to keep looping until numPositions > endClipPos
307  while((origCigarOpIndex < oldCigar.size()) &&
308  (numPositions <= endClipPos))
309  {
310  op = &(oldCigar.getOperator(origCigarOpIndex));
311 
312  // Update the numPositions count if the operations indicates
313  // bases within the read.
314  if(!Cigar::foundInQuery(op->operation))
315  {
316  // This operation is not in the query read sequence,
317  // so it is not yet to the endClipPos, just add the
318  // operation do not increment the number of positions.
319  updatedCigar += *op;
320  if(!Cigar::isClip(op->operation))
321  {
322  onlyClips = false;
323  }
324  }
325  else
326  {
327  // This operation appears in the query sequence, so
328  // check to see if the clip occurs in this operation.
329 
330  // endClipPos is 0 based & numPositions is a count.
331  // If endClipPos is 4, then it is the 5th position.
332  // If 4 positions are covered so far (numPositions = 4),
333  // then we are right at endCLipPos: 4-4 = 0, none of
334  // this operation should be kept.
335  // If only 3 positions were covered, then we are at offset
336  // 3, so offset 3 should be added: 4-3 = 1.
337  uint32_t numPosTilClip = endClipPos - numPositions;
338 
339  if(numPosTilClip < op->count)
340  {
341  // this operation is partially clipped, write the part
342  // that was not clipped if it is not all clipped.
343  if(numPosTilClip != 0)
344  {
345  updatedCigar.Add(op->operation,
346  numPosTilClip);
347  if(!Cigar::isClip(op->operation))
348  {
349  onlyClips = false;
350  }
351  }
352  }
353  else
354  {
355  // This operation is not clipped, so add it
356  updatedCigar += *op;
357  if(!Cigar::isClip(op->operation))
358  {
359  onlyClips = false;
360  }
361  }
362  // This operation occurs in the query sequence, so
363  // increment the number of positions covered.
364  numPositions += op->count;
365  }
366 
367  // Move to the next cigar position.
368  ++origCigarOpIndex;
369  }
370 
371  //////////////////
372  // Add the softclip to the back.
373  if(numBackClips != 0)
374  {
375  // Add the softclip to the end
376  updatedCigar.Add(Cigar::softClip, numBackClips);
377  }
378 
379  //////////////////
380  // Add any hardclips remaining in the original cigar to the back.
381  while(origCigarOpIndex < oldCigar.size())
382  {
383  op = &(oldCigar.getOperator(origCigarOpIndex));
384  if(op->operation == Cigar::hardClip)
385  {
386  // Keep this operation as the new clips do not
387  // affect other clips.
388  updatedCigar += *op;
389  }
390  ++origCigarOpIndex;
391  }
392 
393  // Check to see if the new cigar is only clips.
394  if(onlyClips)
395  {
396  // Only clips in the new cigar, so mark the read as filtered
397  // instead of updating the cigar.
398  /////////////////////////////
399  // The entire read was clipped.
400  status = FILTERED;
401  }
402  else
403  {
404  // Part of the read was clipped.
405  // Update the starting position if a clip was added to
406  // the front.
407  if(numFrontClips > 0)
408  {
409  // Convert from query index to reference position (from the
410  // old cigar)
411  // Get the position for the last front clipped position by
412  // getting the position associated with the clipped base on
413  // the reference. Then add one to get to the first
414  // non-clipped position.
415  int32_t lastFrontClipPos = numFrontClips - 1;
416  int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
417  startPos);
418  if(newStartPos != Cigar::INDEX_NA)
419  {
420  // Add one to get first non-clipped position.
421  startPos = newStartPos + 1;
422  }
423  }
424  }
425  }
426  return(status);
427 }
428 
429 
431  GenomeSequence& refSequence,
432  uint32_t qualityThreshold,
433  uint8_t defaultQualityInt)
434 {
435  uint32_t totalMismatchQuality =
436  sumMismatchQuality(record, refSequence, defaultQualityInt);
437 
438  // If the total mismatch quality is over the threshold,
439  // filter the read.
440  if(totalMismatchQuality > qualityThreshold)
441  {
442  filterRead(record);
443  return(FILTERED);
444  }
445  return(NONE);
446 }
447 
448 
449 // NOTE: Only positions where the reference and read both have bases that
450 // are different and not 'N' are considered mismatches.
452  GenomeSequence& refSequence,
453  uint8_t defaultQualityInt)
454 {
455  // Track the mismatch info.
456  int mismatchQual = 0;
457  int numMismatch = 0;
458 
459  SamQuerySeqWithRefIter sequenceIter(record, refSequence);
460 
461  SamSingleBaseMatchInfo baseMatchInfo;
462  while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
463  {
464  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
465  {
466  // Got a mismatch, get the associated quality.
467  char readQualityChar =
468  record.getQuality(baseMatchInfo.getQueryIndex());
469  uint8_t readQualityInt =
470  BaseUtilities::getPhredBaseQuality(readQualityChar);
471 
472  if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
473  {
474  // Quality was not specified, so use the configured setting.
475  readQualityInt = defaultQualityInt;
476  }
477  mismatchQual += readQualityInt;
478  ++numMismatch;
479  }
480  }
481 
482  return(mismatchQual);
483 }
484 
485 
487 {
488  // Filter the read by marking it as unmapped.
489  uint16_t flag = record.getFlag();
490  SamFlag::setUnmapped(flag);
491  // Clear N/A flags.
492  flag &= ~SamFlag::PROPER_PAIR;
493  flag &= ~SamFlag::SECONDARY_ALIGNMENT;
494  flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
495  record.setFlag(flag);
496  // Clear Cigar
497  record.setCigar("*");
498  // Clear mapping quality
499  record.setMapQuality(0);
500 }
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition: Cigar.cpp:217
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
Filtering clipped the read.
Definition: SamFilter.h:31
static uint32_t sumMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt)
Get the sum of the qualities of all mismatches in the record.
Definition: SamFilter.cpp:451
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
no operation has been set.
Definition: Cigar.h:88
The filter did not affect the read.
Definition: SamFilter.h:30
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
static void setUnmapped(uint16_t &flag)
Mark the passed in flag as unmapped.
Definition: SamFlag.h:104
Type getType()
Get the type (match/mismatch/unknown) for this object.
Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H".
Definition: Cigar.h:95
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1379
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch...
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
Iterates through the query and compare with reference.
static void filterRead(SamRecord &record)
Filter the read by marking it as unmapped.
Definition: SamFilter.cpp:486
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1626
static FilterStatus filterOnMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt)
Filter the read based on the specified quality threshold.
Definition: SamFilter.cpp:430
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
Filtering caused the read to be modified to unmapped.
Definition: SamFilter.h:32
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)...
Definition: Cigar.h:94
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
Class for extracting information from a SAM Flag.
Definition: SamFlag.h:28
int32_t getQueryIndex()
Get the query index for this object.
static const uint8_t UNKNOWN_QUALITY_INT
Int value used when the quality is unknown.
Definition: BaseUtilities.h:51
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
FilterStatus
Enum describing what sort of filtering was done.
Definition: SamFilter.h:29
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
static FilterStatus clipOnMismatchThreshold(SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold)
Clip the read based on the specified mismatch threshold.
Definition: SamFilter.cpp:27
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
Definition: SamRecord.h:51
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:242
static FilterStatus softClip(SamRecord &record, int32_t numFrontClips, int32_t numBackClips)
Soft clip the record from the front and/or the back.
Definition: SamFilter.cpp:155
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object...
Definition: CigarRoller.h:66
This class contains the match/mismatch information between the reference and a read for a single base...
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307