libStatGen Software  1
SamValidation.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 
20 #include "SamValidation.h"
21 #include "CigarRoller.h"
22 #include "SamTags.h"
23 
24 const char* SamValidationError::enumSeverityString[] = {
25  "WARNING", "ERROR"};
26 
27 const char* SamValidationError::enumTypeString[] = {
28  "INVALID_QNAME",
29  "INVALID_REF_ID",
30  "INVALID_RNAME",
31  "INVALID_POS",
32  "INVALID_MAPQ",
33  "INVALID_CIGAR",
34  "INVALID_MRNM",
35  "INVALID_QUAL",
36  "INVALID_TAG"
37 };
38 
40 {
41  return(enumTypeString[type]);
42 }
43 
44 
46  std::string message)
47 {
48  myType = type;
49  mySeverity = severity;
50  myMessage = message;
51 }
52 
53 
55 {
56  return(myType);
57 }
58 
59 
61 {
62  return(mySeverity);
63 }
64 
65 
66 const char* SamValidationError::getMessage() const
67 {
68  return(myMessage.c_str());
69 }
70 
71 
73 {
74  return(enumTypeString[myType]);
75 }
76 
77 
79 {
80  return(enumSeverityString[mySeverity]);
81 }
82 
83 
84 void SamValidationError::getErrorString(std::string& errorString) const
85 {
86  errorString = getTypeString();
87  errorString += " (";
88  errorString += getSeverityString();
89  errorString += ") : ";
90  errorString += getMessage();
91  errorString += "\n";
92 }
93 
94 
96 {
97  std::cerr << this;
98 }
99 
100 
101 
102 // Constructor.
104  : myValidationErrors()
105 {
106  myErrorIter = myValidationErrors.begin();
107 }
108 
109 
110 // Destructor
112 {
113  clear();
114 }
115 
116 
118 {
119  // Clear the errors.
120  std::list<const SamValidationError*>::iterator errorIter;
121  for(errorIter = myValidationErrors.begin();
122  errorIter != myValidationErrors.end(); ++errorIter)
123  {
124  delete *errorIter;
125  *errorIter = NULL;
126  }
127  myValidationErrors.clear();
128  myErrorIter = myValidationErrors.end();
129 }
130 
131 
133  SamValidationError::Severity newSeverity,
134  const char* newMessage)
135 {
136  myValidationErrors.push_back(new SamValidationError(newType,
137  newSeverity,
138  newMessage));
139 
140  // If this is the first element in the list, set the iterator.
141  if(myValidationErrors.size() == 1)
142  {
143  // set the iterator to the first element.
144  myErrorIter = myValidationErrors.begin();
145  }
146 }
147 
148 
149 
150 // Return the number of validation errors that are contained in this object.
152 {
153  return(myValidationErrors.size());
154 }
155 
156 
157 // Return a pointer to the next error. It does not remove it from the list.
158 // Returns null once all errors have been retrieved until resetErrorIter
159 // is called.
161 {
162  if(myErrorIter == myValidationErrors.end())
163  {
164  // at the end of the list, return null.
165  return(NULL);
166  }
167  // Not at the end of the list, return the last element and increment.
168  return(*myErrorIter++);
169 }
170 
171 
172 // Resets the iterator to the begining of the errors.
174 {
175  myErrorIter = myValidationErrors.begin();
176 }
177 
178 
179 // Appends the error messages to the passed in string.
180 void SamValidationErrors::getErrorString(std::string& errorString) const
181 {
182  for(std::list<const SamValidationError*>::
183  const_iterator validationErrorIter =
184  myValidationErrors.begin();
185  validationErrorIter != myValidationErrors.end();
186  validationErrorIter++)
187  {
188  std::string error = "";
189  (*validationErrorIter)->getErrorString(error);
190  errorString += error;
191  }
192 }
193 
194 
195 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord,
196  SamValidationErrors& validationErrors)
197 {
198  bool status = true;
199  status &= isValidQname(samRecord.getReadName(),
200  samRecord.getReadNameLength(),
201  validationErrors);
202 
203  status &= isValidFlag(samRecord.getFlag(),
204  validationErrors);
205 
206  // Validate the RName including validating it against the header.
207  status &= isValidRname(samHeader,
208  samRecord.getReferenceName(),
209  validationErrors);
210 
211  status &= isValidRefID(samRecord.getReferenceID(),
212  samHeader.getReferenceInfo(),
213  validationErrors);
214 
215  status &= isValid1BasedPos(samRecord.get1BasedPosition(),
216  validationErrors);
217 
218  status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
219 
220  status &= isValidSequence(samRecord, validationErrors);
221 
222  status &= isValidCigar(samRecord, validationErrors);
223 
224  status &= isValidQuality(samRecord, validationErrors);
225 
226  status &= isValidTags(samRecord, validationErrors);
227 
228  return(status);
229 }
230 
231 
232 // qname is the query (read) name - result of SamRecord::getReadName().
233 // readNameLen is the length of the read name including the null (the result
234 // of SamRecord::getReadNameLength()).
235 // For some invalid records, the getReadNameLength may be different than the
236 // length of qname.
237 // NOTE: Query Name and Read Name both refer to the same field.
238 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen,
239  SamValidationErrors& validationErrors)
240 {
241  // Validation for QNAME is:
242  // a) length of the qname string is the same as the read name length
243  // b) length is between 1 and 254.
244  // c) [ \t\n\r] are not allowed in the name.
245 
246  bool status = true;
247 
248  // Get the length of the qname string.
249  int32_t qnameLenNull = strlen(qname) + 1;
250 
251  ////////////////////////////////////
252  // a) length of the qname string is the same as the read name length
253  if(qnameLenNull != readNameLen)
254  {
255  // This results from a poorly formatted bam file, where the null
256  // terminated read_name field is not the same length as specified by
257  // read_name_len.
258  String message = "Invalid Query Name - the string length (";
259  message += qnameLenNull;
260  message += ") does not match the specified query name length (";
261  message += readNameLen;
262  message += ").";
263 
266  message.c_str());
267  status = false;
268  }
269 
270  ////////////////////////////////////
271  // b) length is between 1 and 254
272  // The length with the terminating null must be between 2 & 255,
273  if((qnameLenNull < 2) || (qnameLenNull > 255))
274  {
275  String message = "Invalid Query Name (QNAME) length: ";
276  message += qnameLenNull;
277  message += ". Length with the terminating null must be between 2 & 255.";
278 
281  message.c_str());
282  status = false;
283  }
284 
285  ////////////////////////////////////
286  // Loop through and validate they all characters are valid.
287  // c) [ \t\n\r] are not allowed in the name.
288  String message;
289  for(int i = 0; i < qnameLenNull; ++i)
290  {
291  switch(qname[i])
292  {
293  case ' ':
294  // Invalid character.
295  message = "Invalid character in the Query Name (QNAME): ' ' at position ";
296  message += i;
297  message += ".";
300  message.c_str());
301  status = false;
302  break;
303  case '\t':
304  // Invalid character.
305  message = "Invalid character in the Query Name (QNAME): '\t' at position ";
306  message += i;
307  message += ".";
310  message.c_str());
311  status = false;
312  break;
313  case '\n':
314  // Invalid character.
315  message = "Invalid character in the Query Name (QNAME): '\n' at position ";
316  message += i;
317  message += ".";
320  message.c_str());
321  status = false;
322  break;
323  case '\r':
324  // Invalid character.
325  message = "Invalid character in the Query Name (QNAME): '\r' at position ";
326  message += i;
327  message += ".";
330  message.c_str());
331  status = false;
332  break;
333  }
334  }
335 
336  return(status);
337 }
338 
339 
340 bool SamValidator::isValidFlag(uint16_t flag,
341  SamValidationErrors& validationErrors)
342 {
343  // All values in a uint16_t are valid, so return true.
344  return(true);
345 }
346 
347 
349  const char* rname,
350  SamValidationErrors& validationErrors)
351 {
352  bool status = true;
353 
354  // Cross validate the rname and the header.
355  // If the rname is not '*'
356  // AND there are any SQ records in the header,
357  // Then the rname must be in one of them.
358  if((strcmp(rname, "*") != 0) &&
359  (samHeader.getNumSQs() != 0) &&
360  (samHeader.getSQ(rname) == NULL))
361  {
362  // There are SQ fields, but the ref name is not in it.
363  status = false;
364  std::string message = "RNAME, ";
365  message += rname;
366  message += ", was not found in a SAM Header SQ record";
369  message.c_str());
370  }
371  status &= isValidRname(rname, validationErrors);
372  return(status);
373 }
374 
375 
376 bool SamValidator::isValidRname(const char* rname,
377  SamValidationErrors& validationErrors)
378 {
379  // Validation for RNAME is:
380  // a) cannot be 0 length.
381  // b) [ \t\n\r@=] are not allowed in the name.
382 
383  bool status = true;
384 
385  // Get the length of the rname string.
386  int32_t rnameLen = strlen(rname);
387 
388  String message;
389 
390  if(rnameLen == 0)
391  {
394  "Reference Sequence Name (RNAME) cannot have 0 length.");
395  status = false;
396  }
397 
398  ////////////////////////////////////
399  ////////////////////////////////////
400  // Loop through and validate they all characters are valid.
401  // b) [ \t\n\r] are not allowed in the name.
402  for(int i = 0; i < rnameLen; ++i)
403  {
404  switch(rname[i])
405  {
406  case ' ':
407  // Invalid character.
408  message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
409  message += i;
410  message += ".";
413  message.c_str());
414  status = false;
415  break;
416  case '\t':
417  // Invalid character.
418  message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
419  message += i;
420  message += ".";
423  message.c_str());
424  status = false;
425  break;
426  case '\n':
427  // Invalid character.
428  message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
429  message += i;
430  message += ".";
433  message.c_str());
434  status = false;
435  break;
436  case '\r':
437  // Invalid character.
438  message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
439  message += i;
440  message += ".";
443  message.c_str());
444  status = false;
445  break;
446  case '@':
447  // Invalid character.
448  message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
449  message += i;
450  message += ".";
453  message.c_str());
454  status = false;
455  break;
456  case '=':
457  // Invalid character.
458  message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
459  message += i;
460  message += ".";
463  message.c_str());
464  status = false;
465  break;
466  default:
467  // Allowed character.
468  break;
469  }
470  }
471 
472  return(status);
473 }
474 
475 
476 bool SamValidator::isValidRefID(int32_t refID,
477  const SamReferenceInfo& refInfo,
478  SamValidationErrors& validationErrors)
479 {
480  // Validation for rID is:
481  // a) must be between -1 and the number of refInfo.
482  // -1 is allowed, and otherwise it must properly index into the array.
483 
484  bool status = true;
485  if((refID < -1) || (refID >= refInfo.getNumEntries()))
486  {
487  // Reference ID is too large or too small.
488  String message = "Invalid Reference ID, out of range (";
489  message += refID;
490  message += ") must be between -1 and ";
491  message += refInfo.getNumEntries() - 1;
492  message += ".";
493 
496  message.c_str());
497  status = false;
498  }
499 
500  return(status);
501 }
502 
503 
505  SamValidationErrors& validationErrors)
506 {
507  // Validation for pos is:
508  // a) must be between 0 and (2^29)-1.
509 
510  bool status = true;
511 
512  if((pos < 0) || (pos > 536870911))
513  {
514  String message = "POS out of range (";
515  message += pos;
516  message += ") must be between 0 and (2^29)-1.";
517 
518  validationErrors.addError(SamValidationError::INVALID_POS,
520  message.c_str());
521  status = false;
522  }
523 
524  return(status);
525 }
526 
527 
528 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
529  SamValidationErrors& validationErrors)
530 {
531  // All values in a uint8_t are valid, so return true.
532  return(true);
533 }
534 
535 
537  SamValidationErrors& validationErrors)
538 {
539  return(true);
540 }
541 
542 
544  SamValidationErrors& validationErrors)
545 {
546  return(isValidCigar(samRecord.getCigar(),
547  samRecord.getReadLength(),
548  validationErrors));
549 }
550 
551 bool SamValidator::isValidCigar(const char* cigar,
552  const char* sequence,
553  SamValidationErrors& validationErrors)
554 {
555  return(isValidCigar(cigar, strlen(sequence), validationErrors));
556 }
557 
558 bool SamValidator::isValidCigar(const char* cigar,
559  int seqLen,
560  SamValidationErrors& validationErrors)
561 {
562  // Validation for CIGAR is:
563  // a) cannot be 0 length.
564  // if not "*", validate the following:
565  // b) must have an integer length for each operator (if not "*"). TODO
566  // c) all operators must be valid (if not "*"). TODO
567  // d) evaluates to the same read length as the sequence string.
568  bool status = true;
569  String message;
570 
571  int32_t cigarLen = strlen(cigar);
572 
573  // a) cannot be 0 length.
574  if(cigarLen == 0)
575  {
578  "Cigar must not be blank.");
579  status = false;
580  }
581 
582  if(strcmp(cigar, "*") != 0)
583  {
584  // The cigar is not "*", so validate it.
585  CigarRoller cigarRoller(cigar);
586 
587  // b) must have an integer length for each operator.
588  // TODO
589  // c) all operators must be valid.
590  // TODO
591 
592  // d) is the same length as the sequence string.
593  int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
594  if(cigarSeqLen != seqLen)
595  {
596  message = "CIGAR does not evaluate to the same length as SEQ, (";
597  message += cigarSeqLen;
598  message += " != ";
599  message += seqLen;
600  message += ").";
603  message.c_str());
604  status = false;
605  }
606  }
607  return(status);
608 }
609 
610 
612  SamValidationErrors& validationErrors)
613 {
614  return(isValidQuality(samRecord.getQuality(),
615  samRecord.getReadLength(),
616  validationErrors));
617 }
618 
619 
620 bool SamValidator::isValidQuality(const char* quality,
621  const char* sequence,
622  SamValidationErrors& validationErrors)
623 {
624  // Determine the length of the sequence.
625  int seqLen = strlen(sequence);
626 
627  // Check if the sequence is '*' since then the seqLength is 0.
628  if(strcmp(sequence, "*") == 0)
629  {
630  seqLen = 0;
631  }
632  return(isValidQuality(quality, seqLen, validationErrors));
633 }
634 
635 
636 bool SamValidator::isValidQuality(const char* quality,
637  int seqLength,
638  SamValidationErrors& validationErrors)
639 {
640  bool status = true;
641 
642  // If the quality or the sequence are non-"*", validate that the quality
643  // and sequence have the same length.
644  if((seqLength != 0) && (strcmp(quality, "*") != 0))
645  {
646  int qualLen = strlen(quality);
647  // Both the sequence and the quality are not "*", so validate
648  // that they are the same length.
649  if(seqLength != qualLen)
650  {
651  // Both fields are specified but are different lengths.
652 
653  String message = "QUAL is not the same length as SEQ, (";
654  message += qualLen;
655  message += " != ";
656  message += seqLength;
657  message += ").";
658 
661  message.c_str());
662  status = false;
663  }
664  }
665  return(status);
666 }
667 
668 
670  SamValidationErrors& validationErrors)
671 {
672  bool status = true;
673 
674  GenomeSequence* reference = samRecord.getReference();
675  // If the reference is not null, check the MD tag.
676  if(reference != NULL)
677  {
678  const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG);
679  if(recordMD != NULL)
680  {
681  // The record has an MD tag so check to see if it is correct.
682  if(!SamTags::isMDTagCorrect(samRecord, *reference))
683  {
684  // Invalid MD tags.
685  String correctMD;
686  if(!SamTags::createMDTag(correctMD, samRecord, *reference))
687  {
688  // Failed to get the MD tag, so indicate that it is unknown.
689  correctMD = "UNKNOWN";
690  }
691  String message = "Incorrect MD Tag, ";
692  message += *recordMD;
693  message += ", should be ";
694  message += correctMD;
695  message += ".";
696 
697  validationErrors.addError(SamValidationError::INVALID_TAG,
699  message.c_str());
700 
701  status = false;
702  }
703  }
704  }
705 
706  return(status);
707 }
static bool isValidSequence(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the sequence, but not against the cigar or quality string.
The SamValidationError class describes a validation error that occured, containing the error type...
Definition: SamValidation.h:34
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1328
static bool createMDTag(String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
Create the MD tag for the specified input record and the genome.
Definition: SamTags.cpp:34
Severity getSeverity() const
Return the severity enum of this validation error object.
static bool isValidTags(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the tags.
SamValidationErrors()
Constructor.
static bool isValidFlag(uint16_t flag, SamValidationErrors &validationErrors)
Determines whether or not the flag is valid.
unsigned int numErrors()
Return the number of validation errors contained in this object.
const char * getMessage() const
Return the error message of this validation error object.
const char * getTypeString() const
Return the string representing this object&#39;s type of validation error.
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2168
Invalid reference name.
Definition: SamValidation.h:51
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
Type
Type of the error.
Definition: SamValidation.h:47
Invalid base quality.
Definition: SamValidation.h:56
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1530
void clear()
Remove all the errors from the container.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1293
int getNumSQs()
Get the number of SQ objects.
void getErrorString(std::string &errorString) const
Get the error string representing this object&#39;s error.
const char * getSeverityString() const
Return the string representing this object&#39;s severity of validation error.
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1379
Error is used if parsing could not succeed.
Definition: SamValidation.h:41
static bool isValidMapQuality(uint8_t mapQuality, SamValidationErrors &validationErrors)
Validate the mapping quality.
Class for tracking the reference information mapping between the reference ids and the reference name...
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1314
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1300
Type getType() const
Return the type enum of this validation error object.
const SamValidationError * getNextError()
Return a pointer to the next error without removing it from the container, and returning null once al...
static bool isValidQname(const char *qname, uint8_t qnameLen, SamValidationErrors &validationErrors)
Determines whether or not the specified qname is valid.
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1626
void resetErrorIter()
Reset the iterator to the begining of the errors.
static bool isValidRefID(int32_t refID, const SamReferenceInfo &refInfo, SamValidationErrors &validationErrors)
Validate whether or not the specified reference id is valid.
static bool isValidRname(SamFileHeader &samHeader, const char *rname, SamValidationErrors &validationErrors)
Validate the reference name including validating against the header.
int32_t getNumEntries() const
Get the number of entries contained here.
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:34
static bool isValid1BasedPos(int32_t pos, SamValidationErrors &validationErrors)
Validate the refeference position.
static bool isMDTagCorrect(SamRecord &inputRec, GenomeSequence &genome)
Check to see if the MD tag in the record is accurate.
Definition: SamTags.cpp:126
Severity
Severity of the error.
Definition: SamValidation.h:38
Warning is used if it is just an invalid value.
Definition: SamValidation.h:40
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations...
The SamValidationErrors class is a container class that holds SamValidationError Objects, allowing a validation method to return all of the invalid errors rather than just one.
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Invalid read/query name.
Definition: SamValidation.h:49
SamValidationError(Type type, Severity severity, std::string Message)
Constructor that sets the type, severity, and message for the validation error.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
Definition: SamRecord.h:51
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1911
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
static bool isValidQuality(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the base quality.
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object...
Definition: CigarRoller.h:66
void addError(SamValidationError::Type newType, SamValidationError::Severity newSeverity, const char *newMessage)
Add the specified error to this container.
static bool isValidCigar(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the cigar.
void printError() const
Print a formatted output of the error to cerr.
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1543
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286
~SamValidationErrors()
Destructor.