libStatGen Software  1
SamRecord.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdlib.h>
19 #include <limits>
20 #include <stdexcept>
21 
22 #include "bam.h"
23 
24 #include "SamRecord.h"
25 #include "SamValidation.h"
26 
27 #include "BaseUtilities.h"
28 #include "SamQuerySeqWithRefHelper.h"
29 
30 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN";
31 const char* SamRecord::FIELD_ABSENT_STRING = "=";
32 int SamRecord::myNumWarns = 0;
33 
35  : myStatus(),
36  myRefPtr(NULL),
37  mySequenceTranslation(NONE)
38 {
39  int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
40 
41  myRecordPtr =
42  (bamRecordStruct *) malloc(defaultAllocSize);
43 
44  myCigarTempBuffer = NULL;
45  myCigarTempBufferAllocatedSize = 0;
46 
47  allocatedSize = defaultAllocSize;
48 
49  resetRecord();
50 }
51 
52 
54  : myStatus(errorHandlingType),
55  myRefPtr(NULL),
56  mySequenceTranslation(NONE)
57 {
58  int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
59 
60  myRecordPtr =
61  (bamRecordStruct *) malloc(defaultAllocSize);
62 
63  myCigarTempBuffer = NULL;
64  myCigarTempBufferAllocatedSize = 0;
65 
66  allocatedSize = defaultAllocSize;
67 
68  resetRecord();
69 }
70 
71 
73 {
74  resetRecord();
75 
76  if(myRecordPtr != NULL)
77  {
78  free(myRecordPtr);
79  myRecordPtr = NULL;
80  }
81  if(myCigarTempBuffer != NULL)
82  {
83  free(myCigarTempBuffer);
84  myCigarTempBuffer = NULL;
85  myCigarTempBufferAllocatedSize = 0;
86  }
87 }
88 
89 
90 // Resets the fields of the record to a default value.
92 {
93  myIsBufferSynced = true;
94 
95  myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
96  myRecordPtr->myReferenceID = -1;
97  myRecordPtr->myPosition = -1;
98  myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
99  myRecordPtr->myMapQuality = 0;
100  myRecordPtr->myBin = DEFAULT_BIN;
101  myRecordPtr->myCigarLength = 0;
102  myRecordPtr->myFlag = 0;
103  myRecordPtr->myReadLength = 0;
104  myRecordPtr->myMateReferenceID = -1;
105  myRecordPtr->myMatePosition = -1;
106  myRecordPtr->myInsertSize = 0;
107 
108  // Set the sam values for the variable length fields.
109  // TODO - one way to speed this up might be to not set to "*" and just
110  // clear them, and write out a '*' for SAM if it is empty.
111  myReadName = DEFAULT_READ_NAME;
112  myReferenceName = "*";
113  myMateReferenceName = "*";
114  myCigar = "*";
115  mySequence = "*";
116  mySeqWithEq.clear();
117  mySeqWithoutEq.clear();
118  myQuality = "*";
119  myNeedToSetTagsFromBuffer = false;
120  myNeedToSetTagsInBuffer = false;
121 
122  // Initialize the calculated alignment info to the uncalculated value.
123  myAlignmentLength = -1;
124  myUnclippedStartOffset = -1;
125  myUnclippedEndOffset = -1;
126 
127  clearTags();
128 
129  // Set the bam values for the variable length fields.
130  // Only the read name needs to be set, the others are a length of 0.
131  // Set the read name. The min size of myRecordPtr includes the size for
132  // the default read name.
133  memcpy(&(myRecordPtr->myData), myReadName.c_str(),
134  myRecordPtr->myReadNameLength);
135 
136  // Set that the variable length buffer fields are valid.
137  myIsReadNameBufferValid = true;
138  myIsCigarBufferValid = true;
139  myPackedSequence =
140  (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength +
141  myRecordPtr->myCigarLength * sizeof(int);
142  myIsSequenceBufferValid = true;
143  myBufferSequenceTranslation = NONE;
144 
145  myPackedQuality = myPackedSequence;
146  myIsQualityBufferValid = true;
147  myIsTagsBufferValid = true;
148  myIsBinValid = true;
149 
150  myCigarTempBufferLength = -1;
151 
152  myStatus = SamStatus::SUCCESS;
153 
154  NOT_FOUND_TAG_STRING = "";
155  NOT_FOUND_TAG_INT = -1; // TODO - deprecate
156 }
157 
158 
159 // Returns whether or not the record is valid.
160 // Header is needed to perform some validation against it.
162 {
163  myStatus = SamStatus::SUCCESS;
164  SamValidationErrors invalidSamErrors;
165  if(!SamValidator::isValid(header, *this, invalidSamErrors))
166  {
167  // The record is not valid.
168  std::string errorMessage = "";
169  invalidSamErrors.getErrorString(errorMessage);
170  myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str());
171  return(false);
172  }
173  // The record is valid.
174  return(true);
175 }
176 
177 
179 {
180  myRefPtr = reference;
181 }
182 
183 
184 // Set the type of sequence translation to use when getting
185 // the sequence. The default type (if this method is never called) is
186 // NONE (the sequence is left as-is). This is used
188 {
189  mySequenceTranslation = translation;
190 }
191 
192 
193 bool SamRecord::setReadName(const char* readName)
194 {
195  myReadName = readName;
196  myIsBufferSynced = false;
197  myIsReadNameBufferValid = false;
198  myStatus = SamStatus::SUCCESS;
199 
200  // The read name must at least have some length, otherwise this is a parsing
201  // error.
202  if(myReadName.Length() == 0)
203  {
204  // Invalid - reset ReadName return false.
205  myReadName = DEFAULT_READ_NAME;
206  myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
207  myStatus.setStatus(SamStatus::INVALID, "0 length Query Name.");
208  return(false);
209  }
210 
211  return true;
212 }
213 
214 
215 bool SamRecord::setFlag(uint16_t flag)
216 {
217  myStatus = SamStatus::SUCCESS;
218  myRecordPtr->myFlag = flag;
219  return true;
220 }
221 
222 
224  const char* referenceName)
225 {
226  myStatus = SamStatus::SUCCESS;
227 
228  myReferenceName = referenceName;
229  // If the reference ID does not already exist, add it (pass true)
230  myRecordPtr->myReferenceID = header.getReferenceID(referenceName, true);
231 
232  return true;
233 }
234 
235 
236 bool SamRecord::set1BasedPosition(int32_t position)
237 {
238  return(set0BasedPosition(position - 1));
239 }
240 
241 
242 bool SamRecord::set0BasedPosition(int32_t position)
243 {
244  myStatus = SamStatus::SUCCESS;
245  myRecordPtr->myPosition = position;
246  myIsBinValid = false;
247  return true;
248 }
249 
250 
251 bool SamRecord::setMapQuality(uint8_t mapQuality)
252 {
253  myStatus = SamStatus::SUCCESS;
254  myRecordPtr->myMapQuality = mapQuality;
255  return true;
256 }
257 
258 
259 bool SamRecord::setCigar(const char* cigar)
260 {
261  myStatus = SamStatus::SUCCESS;
262  myCigar = cigar;
263 
264  myIsBufferSynced = false;
265  myIsCigarBufferValid = false;
266  myCigarTempBufferLength = -1;
267  myIsBinValid = false;
268 
269  // Initialize the calculated alignment info to the uncalculated value.
270  myAlignmentLength = -1;
271  myUnclippedStartOffset = -1;
272  myUnclippedEndOffset = -1;
273 
274  return true;
275 }
276 
277 
278 bool SamRecord::setCigar(const Cigar& cigar)
279 {
280  myStatus = SamStatus::SUCCESS;
281  cigar.getCigarString(myCigar);
282 
283  myIsBufferSynced = false;
284  myIsCigarBufferValid = false;
285  myCigarTempBufferLength = -1;
286  myIsBinValid = false;
287 
288  // Initialize the calculated alignment info to the uncalculated value.
289  myAlignmentLength = -1;
290  myUnclippedStartOffset = -1;
291  myUnclippedEndOffset = -1;
292 
293  return true;
294 }
295 
296 
298  const char* mateReferenceName)
299 {
300  myStatus = SamStatus::SUCCESS;
301  // Set the mate reference, if it is "=", set it to be equal
302  // to myReferenceName. This assumes that myReferenceName has already
303  // been called.
304  if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
305  {
306  myMateReferenceName = myReferenceName;
307  }
308  else
309  {
310  myMateReferenceName = mateReferenceName;
311  }
312 
313  // Set the Mate Reference ID.
314  // If the reference ID does not already exist, add it (pass true)
315  myRecordPtr->myMateReferenceID =
316  header.getReferenceID(myMateReferenceName, true);
317 
318  return true;
319 }
320 
321 
322 bool SamRecord::set1BasedMatePosition(int32_t matePosition)
323 {
324  return(set0BasedMatePosition(matePosition - 1));
325 }
326 
327 
328 bool SamRecord::set0BasedMatePosition(int32_t matePosition)
329 {
330  myStatus = SamStatus::SUCCESS;
331  myRecordPtr->myMatePosition = matePosition;
332  return true;
333 }
334 
335 
336 bool SamRecord::setInsertSize(int32_t insertSize)
337 {
338  myStatus = SamStatus::SUCCESS;
339  myRecordPtr->myInsertSize = insertSize;
340  return true;
341 }
342 
343 
344 bool SamRecord::setSequence(const char* seq)
345 {
346  myStatus = SamStatus::SUCCESS;
347  mySequence = seq;
348  mySeqWithEq.clear();
349  mySeqWithoutEq.clear();
350 
351  myIsBufferSynced = false;
352  myIsSequenceBufferValid = false;
353  return true;
354 }
355 
356 
357 bool SamRecord::setQuality(const char* quality)
358 {
359  myStatus = SamStatus::SUCCESS;
360  myQuality = quality;
361  myIsBufferSynced = false;
362  myIsQualityBufferValid = false;
363  return true;
364 }
365 
366 
367 //Shift indels to the left
369 {
370  // Check to see whether or not the Cigar has already been
371  // set - this is determined by checking if alignment length
372  // is set since alignment length and the cigar are set
373  // at the same time.
374  if(myAlignmentLength == -1)
375  {
376  // Not been set, so calculate it.
377  parseCigar();
378  }
379 
380  // Track whether or not there was a shift.
381  bool shifted = false;
382 
383  // Cigar is set, so now myCigarRoller can be used.
384  // Track where in the read we are.
385  uint32_t currentPos = 0;
386 
387  // Since the loop starts at 1 because the first operation can't be shifted,
388  // increment the currentPos past the first operation.
389  if(Cigar::foundInQuery(myCigarRoller[0]))
390  {
391  // This op was found in the read, increment the current position.
392  currentPos += myCigarRoller[0].count;
393  }
394 
395  int numOps = myCigarRoller.size();
396 
397  // Loop through the cigar operations from the 2nd operation since
398  // the first operation is already on the end and can't shift.
399  for(int currentOp = 1; currentOp < numOps; currentOp++)
400  {
401  if(myCigarRoller[currentOp].operation == Cigar::insert)
402  {
403  // For now, only shift a max of 1 operation.
404  int prevOpIndex = currentOp-1;
405  // Track the next op for seeing if it is the same as the
406  // previous for merging reasons.
407  int nextOpIndex = currentOp+1;
408  if(nextOpIndex == numOps)
409  {
410  // There is no next op, so set it equal to the current one.
411  nextOpIndex = currentOp;
412  }
413  // The start of the previous operation, so we know when we hit it
414  // so we don't shift past it.
415  uint32_t prevOpStart =
416  currentPos - myCigarRoller[prevOpIndex].count;
417 
418  // We can only shift if the previous operation
419  if(!Cigar::isMatchOrMismatch(myCigarRoller[prevOpIndex]))
420  {
421  // TODO - shift past pads
422  // An insert is in the read, so increment the position.
423  currentPos += myCigarRoller[currentOp].count;
424  // Not a match/mismatch, so can't shift into it.
425  continue;
426  }
427 
428  // It is a match or mismatch, so check to see if we can
429  // shift into it.
430 
431  // The end of the insert is calculated by adding the size
432  // of this insert minus 1 to the start of the insert.
433  uint32_t insertEndPos =
434  currentPos + myCigarRoller[currentOp].count - 1;
435 
436  // The insert starts at the current position.
437  uint32_t insertStartPos = currentPos;
438 
439  // Loop as long as the position before the insert start
440  // matches the last character in the insert. If they match,
441  // the insert can be shifted one index left because the
442  // implied reference will not change. If they do not match,
443  // we can't shift because the implied reference would change.
444  // Stop loop when insertStartPos = prevOpStart, because we
445  // don't want to move past that.
446  while((insertStartPos > prevOpStart) &&
447  (getSequence(insertEndPos,BASES) ==
448  getSequence(insertStartPos - 1, BASES)))
449  {
450  // We can shift, so move the insert start & end one left.
451  --insertEndPos;
452  --insertStartPos;
453  }
454 
455  // Determine if a shift has occurred.
456  int shiftLen = currentPos - insertStartPos;
457  if(shiftLen > 0)
458  {
459  // Shift occured, so adjust the cigar if the cigar will
460  // not become more operations.
461  // If the next operation is the same as the previous or
462  // if the insert and the previous operation switch positions
463  // then the cigar has the same number of operations.
464  // If the next operation is different, and the shift splits
465  // the previous operation in 2, then the cigar would
466  // become longer, so we do not want to shift.
467  if(myCigarRoller[nextOpIndex].operation ==
468  myCigarRoller[prevOpIndex].operation)
469  {
470  // The operations are the same, so merge them by adding
471  // the length of the shift to the next operation.
472  myCigarRoller.IncrementCount(nextOpIndex, shiftLen);
473  myCigarRoller.IncrementCount(prevOpIndex, -shiftLen);
474 
475  // If the previous op length is 0, just remove that
476  // operation.
477  if(myCigarRoller[prevOpIndex].count == 0)
478  {
479  myCigarRoller.Remove(prevOpIndex);
480  }
481  shifted = true;
482  }
483  else
484  {
485  // Can only shift if the insert shifts past the
486  // entire previous operation, otherwise an operation
487  // would need to be added.
488  if(insertStartPos == prevOpStart)
489  {
490  // Swap the positions of the insert and the
491  // previous operation.
492  myCigarRoller.Update(currentOp,
493  myCigarRoller[prevOpIndex].operation,
494  myCigarRoller[prevOpIndex].count);
495  // Size of the previous op is the entire
496  // shift length.
497  myCigarRoller.Update(prevOpIndex,
499  shiftLen);
500  shifted = true;
501  }
502  }
503  }
504  // An insert is in the read, so increment the position.
505  currentPos += myCigarRoller[currentOp].count;
506  }
507  else if(Cigar::foundInQuery(myCigarRoller[currentOp]))
508  {
509  // This op was found in the read, increment the current position.
510  currentPos += myCigarRoller[currentOp].count;
511  }
512  }
513  if(shifted)
514  {
515  // TODO - setCigar is currently inefficient because later the cigar
516  // roller will be recalculated, but for now it will work.
517  setCigar(myCigarRoller);
518  }
519  return(shifted);
520 }
521 
522 
523 // Set the BAM record from the passeed in buffer of the specified size.
524 // Note: The size includes the block size.
526  uint32_t fromBufferSize,
527  SamFileHeader& header)
528 {
529  myStatus = SamStatus::SUCCESS;
530  if((fromBuffer == NULL) || (fromBufferSize == 0))
531  {
532  // Buffer is empty.
534  "Cannot parse an empty file.");
535  return(SamStatus::FAIL_PARSE);
536  }
537 
538  // Clear the record.
539  resetRecord();
540 
541  // allocate space for the record size.
542  if(!allocateRecordStructure(fromBufferSize))
543  {
544  // Failed to allocate space.
545  return(SamStatus::FAIL_MEM);
546  }
547 
548  memcpy(myRecordPtr, fromBuffer, fromBufferSize);
549 
550  setVariablesForNewBuffer(header);
551 
552  // Return the status of the record.
553  return(SamStatus::SUCCESS);
554 }
555 
556 
557 // Read the BAM record from a file.
559  SamFileHeader& header)
560 {
561  myStatus = SamStatus::SUCCESS;
562  if((filePtr == NULL) || (filePtr->isOpen() == false))
563  {
564  // File is not open, return failure.
566  "Can't read from an unopened file.");
567  return(SamStatus::FAIL_ORDER);
568  }
569 
570  // Clear the record.
571  resetRecord();
572 
573  // read the record size.
574  int numBytes =
575  ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t));
576 
577  // Check to see if the end of the file was hit and no bytes were read.
578  if(ifeof(filePtr) && (numBytes == 0))
579  {
580  // End of file, nothing was read, no more records.
582  "No more records left to read.");
583  return(SamStatus::NO_MORE_RECS);
584  }
585 
586  if(numBytes != sizeof(int32_t))
587  {
588  // Failed to read the entire block size. Either the end of the file
589  // was reached early or there was an error.
590  if(ifeof(filePtr))
591  {
592  // Error: end of the file reached prior to reading the rest of the
593  // record.
595  "EOF reached in the middle of a record.");
596  return(SamStatus::FAIL_PARSE);
597  }
598  else
599  {
600  // Error reading.
601  myStatus.setStatus(SamStatus::FAIL_IO,
602  "Failed to read the record size.");
603  return(SamStatus::FAIL_IO);
604  }
605  }
606 
607  // allocate space for the record size.
608  if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t)))
609  {
610  // Failed to allocate space.
611  // Status is set by allocateRecordStructure.
612  return(SamStatus::FAIL_MEM);
613  }
614 
615  // Read the rest of the alignment block, starting at the reference id.
616  if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
617  != (unsigned int)myRecordPtr->myBlockSize)
618  {
619  // Error reading the record. Reset it and return failure.
620  resetRecord();
621  myStatus.setStatus(SamStatus::FAIL_IO,
622  "Failed to read the record");
623  return(SamStatus::FAIL_IO);
624  }
625 
626  setVariablesForNewBuffer(header);
627 
628  // Return the status of the record.
629  return(SamStatus::SUCCESS);
630 }
631 
632 
633 // Add the specified tag to the record.
634 // Returns true if the tag was successfully added, false otherwise.
635 bool SamRecord::addIntTag(const char* tag, int32_t value)
636 {
637  myStatus = SamStatus::SUCCESS;
638  int key = 0;
639  int index = 0;
640  char bamvtype;
641 
642  int tagBufferSize = 0;
643 
644  // First check to see if the tags need to be synced to the buffer.
645  if(myNeedToSetTagsFromBuffer)
646  {
647  if(!setTagsFromBuffer())
648  {
649  // Failed to read tags from the buffer, so cannot add new ones.
650  return(false);
651  }
652  }
653 
654  // Ints come in as int. But it can be represented in fewer bits.
655  // So determine a more specific type that is in line with the
656  // types for BAM files.
657  // First check to see if it is a negative.
658  if(value < 0)
659  {
660  // The int is negative, so it will need to use a signed type.
661  // See if it is greater than the min value for a char.
662  if(value > ((std::numeric_limits<char>::min)()))
663  {
664  // It can be stored in a signed char.
665  bamvtype = 'c';
666  tagBufferSize += 4;
667  }
668  else if(value > ((std::numeric_limits<short>::min)()))
669  {
670  // It fits in a signed short.
671  bamvtype = 's';
672  tagBufferSize += 5;
673  }
674  else
675  {
676  // Just store it as a signed int.
677  bamvtype = 'i';
678  tagBufferSize += 7;
679  }
680  }
681  else
682  {
683  // It is positive, so an unsigned type can be used.
684  if(value < ((std::numeric_limits<unsigned char>::max)()))
685  {
686  // It is under the max of an unsigned char.
687  bamvtype = 'C';
688  tagBufferSize += 4;
689  }
690  else if(value < ((std::numeric_limits<unsigned short>::max)()))
691  {
692  // It is under the max of an unsigned short.
693  bamvtype = 'S';
694  tagBufferSize += 5;
695  }
696  else
697  {
698  // Just store it as an unsigned int.
699  bamvtype = 'I';
700  tagBufferSize += 7;
701  }
702  }
703 
704  // Check to see if the tag is already there.
705  key = MAKEKEY(tag[0], tag[1], bamvtype);
706  unsigned int hashIndex = extras.Find(key);
707  if(hashIndex != LH_NOTFOUND)
708  {
709  // Tag was already found.
710  index = extras[hashIndex];
711 
712  // Since the tagBufferSize was already updated with the new value,
713  // subtract the size for the previous tag (even if they are the same).
714  switch(intType[index])
715  {
716  case 'c':
717  case 'C':
718  case 'A':
719  tagBufferSize -= 4;
720  break;
721  case 's':
722  case 'S':
723  tagBufferSize -= 5;
724  break;
725  case 'i':
726  case 'I':
727  tagBufferSize -= 7;
728  break;
729  default:
730  myStatus.setStatus(SamStatus::INVALID,
731  "unknown tag inttype type found.\n");
732  return(false);
733  }
734 
735  // Tag already existed, print message about overwriting.
736  // WARN about dropping duplicate tags.
737  if(myNumWarns++ < myMaxWarns)
738  {
739  String newVal;
740  String origVal;
741  appendIntArrayValue(index, origVal);
742  appendIntArrayValue(bamvtype, value, newVal);
743  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
744  tag[0], tag[1], intType[index], origVal.c_str(), tag[0], tag[1], bamvtype, newVal.c_str());
745  if(myNumWarns == myMaxWarns)
746  {
747  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
748  }
749  }
750 
751  // Update the integer value and type.
752  integers[index] = value;
753  intType[index] = bamvtype;
754  }
755  else
756  {
757  // Tag is not already there, so add it.
758  index = integers.Length();
759 
760  integers.Push(value);
761  intType.push_back(bamvtype);
762 
763  extras.Add(key, index);
764  }
765 
766  // The buffer tags are now out of sync.
767  myNeedToSetTagsInBuffer = true;
768  myIsTagsBufferValid = false;
769  myIsBufferSynced = false;
770  myTagBufferSize += tagBufferSize;
771 
772  return(true);
773 }
774 
775 
776 // Add the specified tag to the record, replacing it if it is already there and
777 // is different from the previous value.
778 // Returns true if the tag was successfully added (or was already there), false otherwise.
779 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr)
780 {
781  if(vtype == 'i')
782  {
783  // integer type. Call addIntTag to handle it.
784  int intVal = atoi(valuePtr);
785  return(addIntTag(tag, intVal));
786  }
787 
788  // Non-int type.
789  myStatus = SamStatus::SUCCESS;
790  bool status = true; // default to successful.
791  int key = 0;
792  int index = 0;
793 
794  int tagBufferSize = 0;
795 
796  // First check to see if the tags need to be synced to the buffer.
797  if(myNeedToSetTagsFromBuffer)
798  {
799  if(!setTagsFromBuffer())
800  {
801  // Failed to read tags from the buffer, so cannot add new ones.
802  return(false);
803  }
804  }
805 
806  // First check to see if the tag is already there.
807  key = MAKEKEY(tag[0], tag[1], vtype);
808  unsigned int hashIndex = extras.Find(key);
809  if(hashIndex != LH_NOTFOUND)
810  {
811  // The key was found in the hash, so get the lookup index.
812  index = extras[hashIndex];
813 
814  String origTag;
815  char origType = vtype;
816 
817  // Adjust the currently pointed to value to the new setting.
818  switch (vtype)
819  {
820  case 'A' :
821  // First check to see if the value changed.
822  if((integers[index] == (const int)*(valuePtr)) &&
823  (intType[index] == vtype))
824  {
825  // The value & type has not changed, so do nothing.
826  return(true);
827  }
828  else
829  {
830  // Tag buffer size changes if type changes, so subtract & add.
831  origType = intType[index];
832  appendIntArrayValue(index, origTag);
833  tagBufferSize -= getNumericTagTypeSize(intType[index]);
834  tagBufferSize += getNumericTagTypeSize(vtype);
835  integers[index] = (const int)*(valuePtr);
836  intType[index] = vtype;
837  }
838  break;
839  case 'Z' :
840  // First check to see if the value changed.
841  if(strings[index] == valuePtr)
842  {
843  // The value has not changed, so do nothing.
844  return(true);
845  }
846  else
847  {
848  // Adjust the tagBufferSize by removing the size of the old string.
849  origTag = strings[index];
850  tagBufferSize -= strings[index].Length();
851  strings[index] = valuePtr;
852  // Adjust the tagBufferSize by adding the size of the new string.
853  tagBufferSize += strings[index].Length();
854  }
855  break;
856  case 'B' :
857  // First check to see if the value changed.
858  if(strings[index] == valuePtr)
859  {
860  // The value has not changed, so do nothing.
861  return(true);
862  }
863  else
864  {
865  // Adjust the tagBufferSize by removing the size of the old field.
866  origTag = strings[index];
867  tagBufferSize -= getBtagBufferSize(strings[index]);
868  strings[index] = valuePtr;
869  // Adjust the tagBufferSize by adding the size of the new field.
870  tagBufferSize += getBtagBufferSize(strings[index]);
871  }
872  break;
873  case 'f' :
874  // First check to see if the value changed.
875  if(floats[index] == (float)atof(valuePtr))
876  {
877  // The value has not changed, so do nothing.
878  return(true);
879  }
880  else
881  {
882  // Tag buffer size doesn't change between different 'f' entries.
883  origTag.appendFullFloat(floats[index]);
884  floats[index] = (float)atof(valuePtr);
885  }
886  break;
887  default :
888  fprintf(stderr,
889  "samRecord::addTag() - Unknown custom field of type %c\n",
890  vtype);
892  "Unknown custom field in a tag");
893  status = false;
894  break;
895  }
896 
897  // Duplicate tag in this record.
898  // Tag already existed, print message about overwriting.
899  // WARN about dropping duplicate tags.
900  if(myNumWarns++ < myMaxWarns)
901  {
902  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
903  tag[0], tag[1], origType, origTag.c_str(), tag[0], tag[1], vtype, valuePtr);
904  if(myNumWarns == myMaxWarns)
905  {
906  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
907  }
908  }
909  }
910  else
911  {
912  // The key was not found in the hash, so add it.
913  switch (vtype)
914  {
915  case 'A' :
916  index = integers.Length();
917  integers.Push((const int)*(valuePtr));
918  intType.push_back(vtype);
919  tagBufferSize += 4;
920  break;
921  case 'Z' :
922  index = strings.Length();
923  strings.Push(valuePtr);
924  tagBufferSize += 4 + strings.Last().Length();
925  break;
926  case 'B' :
927  index = strings.Length();
928  strings.Push(valuePtr);
929  tagBufferSize += 3 + getBtagBufferSize(strings[index]);
930  break;
931  case 'f' :
932  index = floats.size();
933  floats.push_back((float)atof(valuePtr));
934  tagBufferSize += 7;
935  break;
936  default :
937  fprintf(stderr,
938  "samRecord::addTag() - Unknown custom field of type %c\n",
939  vtype);
941  "Unknown custom field in a tag");
942  status = false;
943  break;
944  }
945  if(status)
946  {
947  // If successful, add the key to extras.
948  extras.Add(key, index);
949  }
950  }
951 
952  // Only add the tag if it has so far been successfully processed.
953  if(status)
954  {
955  // The buffer tags are now out of sync.
956  myNeedToSetTagsInBuffer = true;
957  myIsTagsBufferValid = false;
958  myIsBufferSynced = false;
959  myTagBufferSize += tagBufferSize;
960  }
961  return(status);
962 }
963 
964 
966 {
967  if(extras.Entries() != 0)
968  {
969  extras.Clear();
970  }
971  strings.Clear();
972  integers.Clear();
973  intType.clear();
974  floats.clear();
975  myTagBufferSize = 0;
976  resetTagIter();
977 }
978 
979 
980 bool SamRecord::rmTag(const char* tag, char type)
981 {
982  // Check the length of tag.
983  if(strlen(tag) != 2)
984  {
985  // Tag is the wrong length.
986  myStatus.setStatus(SamStatus::INVALID,
987  "rmTag called with tag that is not 2 characters\n");
988  return(false);
989  }
990 
991  myStatus = SamStatus::SUCCESS;
992  if(myNeedToSetTagsFromBuffer)
993  {
994  if(!setTagsFromBuffer())
995  {
996  // Failed to read the tags from the buffer, so cannot
997  // get tags.
998  return(false);
999  }
1000  }
1001 
1002  // Construct the key.
1003  int key = MAKEKEY(tag[0], tag[1], type);
1004  // Look to see if the key exsists in the hash.
1005  int offset = extras.Find(key);
1006 
1007  if(offset < 0)
1008  {
1009  // Not found, so return true, successfully removed since
1010  // it is not in tag.
1011  return(true);
1012  }
1013 
1014  // Offset is set, so the key was found.
1015  // First if it is an integer, determine the actual type of the int.
1016  char vtype;
1017  getTypeFromKey(key, vtype);
1018  if(vtype == 'i')
1019  {
1020  vtype = getIntegerType(offset);
1021  }
1022 
1023  // Offset is set, so recalculate the buffer size without this entry.
1024  // Do NOT remove from strings, integers, or floats because then
1025  // extras would need to be updated for all entries with the new indexes
1026  // into those variables.
1027  int rmBuffSize = 0;
1028  switch(vtype)
1029  {
1030  case 'A':
1031  case 'c':
1032  case 'C':
1033  rmBuffSize = 4;
1034  break;
1035  case 's':
1036  case 'S':
1037  rmBuffSize = 5;
1038  break;
1039  case 'i':
1040  case 'I':
1041  rmBuffSize = 7;
1042  break;
1043  case 'f':
1044  rmBuffSize = 7;
1045  break;
1046  case 'Z':
1047  rmBuffSize = 4 + getString(offset).Length();
1048  break;
1049  case 'B':
1050  rmBuffSize = 3 + getBtagBufferSize(getString(offset));
1051  break;
1052  default:
1053  myStatus.setStatus(SamStatus::INVALID,
1054  "rmTag called with unknown type.\n");
1055  return(false);
1056  break;
1057  };
1058 
1059  // The buffer tags are now out of sync.
1060  myNeedToSetTagsInBuffer = true;
1061  myIsTagsBufferValid = false;
1062  myIsBufferSynced = false;
1063  myTagBufferSize -= rmBuffSize;
1064 
1065  // Remove from the hash.
1066  extras.Delete(offset);
1067  return(true);
1068 }
1069 
1070 
1071 bool SamRecord::rmTags(const char* tags)
1072 {
1073  const char* currentTagPtr = tags;
1074 
1075  myStatus = SamStatus::SUCCESS;
1076  if(myNeedToSetTagsFromBuffer)
1077  {
1078  if(!setTagsFromBuffer())
1079  {
1080  // Failed to read the tags from the buffer, so cannot
1081  // get tags.
1082  return(false);
1083  }
1084  }
1085 
1086  bool returnStatus = true;
1087 
1088  int rmBuffSize = 0;
1089  while(*currentTagPtr != '\0')
1090  {
1091 
1092  // Tags are formatted as: XY:Z
1093  // Where X is [A-Za-z], Y is [A-Za-z], and
1094  // Z is A,i,f,Z,H (cCsSI are also excepted)
1095  if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
1096  (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
1097  {
1098  myStatus.setStatus(SamStatus::INVALID,
1099  "rmTags called with improperly formatted tags.\n");
1100  returnStatus = false;
1101  break;
1102  }
1103 
1104  // Construct the key.
1105  int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
1106  currentTagPtr[3]);
1107  // Look to see if the key exsists in the hash.
1108  int offset = extras.Find(key);
1109 
1110  if(offset >= 0)
1111  {
1112  // Offset is set, so the key was found.
1113  // First if it is an integer, determine the actual type of the int.
1114  char vtype;
1115  getTypeFromKey(key, vtype);
1116  if(vtype == 'i')
1117  {
1118  vtype = getIntegerType(offset);
1119  }
1120 
1121  // Offset is set, so recalculate the buffer size without this entry.
1122  // Do NOT remove from strings, integers, or floats because then
1123  // extras would need to be updated for all entries with the new indexes
1124  // into those variables.
1125  switch(vtype)
1126  {
1127  case 'A':
1128  case 'c':
1129  case 'C':
1130  rmBuffSize += 4;
1131  break;
1132  case 's':
1133  case 'S':
1134  rmBuffSize += 5;
1135  break;
1136  case 'i':
1137  case 'I':
1138  rmBuffSize += 7;
1139  break;
1140  case 'f':
1141  rmBuffSize += 7;
1142  break;
1143  case 'Z':
1144  rmBuffSize += 4 + getString(offset).Length();
1145  break;
1146  case 'B':
1147  rmBuffSize += 3 + getBtagBufferSize(getString(offset));
1148  break;
1149  default:
1150  myStatus.setStatus(SamStatus::INVALID,
1151  "rmTag called with unknown type.\n");
1152  returnStatus = false;
1153  break;
1154  };
1155 
1156  // Remove from the hash.
1157  extras.Delete(offset);
1158  }
1159  // Increment to the next tag.
1160  if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ','))
1161  {
1162  // Increment once more.
1163  currentTagPtr += 5;
1164  }
1165  else if(currentTagPtr[4] != '\0')
1166  {
1167  // Invalid tag format.
1168  myStatus.setStatus(SamStatus::INVALID,
1169  "rmTags called with improperly formatted tags.\n");
1170  returnStatus = false;
1171  break;
1172  }
1173  else
1174  {
1175  // Last Tag.
1176  currentTagPtr += 4;
1177  }
1178  }
1179 
1180  // The buffer tags are now out of sync.
1181  myNeedToSetTagsInBuffer = true;
1182  myIsTagsBufferValid = false;
1183  myIsBufferSynced = false;
1184  myTagBufferSize -= rmBuffSize;
1185 
1186 
1187  return(returnStatus);
1188 }
1189 
1190 
1191 // Get methods for record fields.
1193 {
1194  return(getRecordBuffer(mySequenceTranslation));
1195 }
1196 
1197 
1198 // Get methods for record fields.
1200 {
1201  myStatus = SamStatus::SUCCESS;
1202  bool status = true;
1203  // If the buffer is not synced or the sequence in the buffer is not
1204  // properly translated, fix the buffer.
1205  if((myIsBufferSynced == false) ||
1206  (myBufferSequenceTranslation != translation))
1207  {
1208  status &= fixBuffer(translation);
1209  }
1210  // If the buffer is synced, check to see if the tags need to be synced.
1211  if(myNeedToSetTagsInBuffer)
1212  {
1213  status &= setTagsInBuffer();
1214  }
1215  if(!status)
1216  {
1217  return(NULL);
1218  }
1219  return (const void *)myRecordPtr;
1220 }
1221 
1222 
1223 // Write the record as a buffer into the file using the class's
1224 // sequence translation setting.
1226 {
1227  return(writeRecordBuffer(filePtr, mySequenceTranslation));
1228 }
1229 
1230 
1231 // Write the record as a buffer into the file using the specified translation.
1233  SequenceTranslation translation)
1234 {
1235  myStatus = SamStatus::SUCCESS;
1236  if((filePtr == NULL) || (filePtr->isOpen() == false))
1237  {
1238  // File is not open, return failure.
1240  "Can't write to an unopened file.");
1241  return(SamStatus::FAIL_ORDER);
1242  }
1243 
1244  if((myIsBufferSynced == false) ||
1245  (myBufferSequenceTranslation != translation))
1246  {
1247  if(!fixBuffer(translation))
1248  {
1249  return(myStatus.getStatus());
1250  }
1251  }
1252 
1253  // Write the record.
1254  unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
1255  unsigned int numBytesWritten =
1256  ifwrite(filePtr, myRecordPtr, numBytesToWrite);
1257 
1258  // Return status based on if the correct number of bytes were written.
1259  if(numBytesToWrite == numBytesWritten)
1260  {
1261  return(SamStatus::SUCCESS);
1262  }
1263  // The correct number of bytes were not written.
1264  myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record.");
1265  return(SamStatus::FAIL_IO);
1266 }
1267 
1268 
1270 {
1271  myStatus = SamStatus::SUCCESS;
1272  // If the buffer isn't synced, sync the buffer to determine the
1273  // block size.
1274  if(myIsBufferSynced == false)
1275  {
1276  // Since this just returns the block size, the translation of
1277  // the sequence does not matter, so just use the currently set
1278  // value.
1279  fixBuffer(myBufferSequenceTranslation);
1280  }
1281  return myRecordPtr->myBlockSize;
1282 }
1283 
1284 
1285 // This method returns the reference name.
1287 {
1288  myStatus = SamStatus::SUCCESS;
1289  return myReferenceName.c_str();
1290 }
1291 
1292 
1294 {
1295  myStatus = SamStatus::SUCCESS;
1296  return myRecordPtr->myReferenceID;
1297 }
1298 
1299 
1301 {
1302  myStatus = SamStatus::SUCCESS;
1303  return (myRecordPtr->myPosition + 1);
1304 }
1305 
1306 
1308 {
1309  myStatus = SamStatus::SUCCESS;
1310  return myRecordPtr->myPosition;
1311 }
1312 
1313 
1315 {
1316  myStatus = SamStatus::SUCCESS;
1317  // If the buffer is valid, return the size from there, otherwise get the
1318  // size from the string length + 1 (ending null).
1319  if(myIsReadNameBufferValid)
1320  {
1321  return(myRecordPtr->myReadNameLength);
1322  }
1323 
1324  return(myReadName.Length() + 1);
1325 }
1326 
1327 
1329 {
1330  myStatus = SamStatus::SUCCESS;
1331  return myRecordPtr->myMapQuality;
1332 }
1333 
1334 
1336 {
1337  myStatus = SamStatus::SUCCESS;
1338  if(!myIsBinValid)
1339  {
1340  // The bin that is set in the record is not valid, so
1341  // reset it.
1342  myRecordPtr->myBin =
1343  bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());
1344  myIsBinValid = true;
1345  }
1346  return(myRecordPtr->myBin);
1347 }
1348 
1349 
1351 {
1352  myStatus = SamStatus::SUCCESS;
1353  // If the cigar buffer is valid
1354  // then get the length from there.
1355  if(myIsCigarBufferValid)
1356  {
1357  return myRecordPtr->myCigarLength;
1358  }
1359 
1360  if(myCigarTempBufferLength == -1)
1361  {
1362  // The cigar buffer is not valid and the cigar temp buffer is not set,
1363  // so parse the string.
1364  parseCigarString();
1365  }
1366 
1367  // The temp buffer is now set, so return the size.
1368  return(myCigarTempBufferLength);
1369 }
1370 
1371 
1373 {
1374  myStatus = SamStatus::SUCCESS;
1375  return myRecordPtr->myFlag;
1376 }
1377 
1378 
1380 {
1381  myStatus = SamStatus::SUCCESS;
1382  if(myIsSequenceBufferValid == false)
1383  {
1384  // If the sequence is "*", then return 0.
1385  if((mySequence.Length() == 1) && (mySequence[0] == '*'))
1386  {
1387  return(0);
1388  }
1389  // Do not add 1 since it is not null terminated.
1390  return(mySequence.Length());
1391  }
1392  return(myRecordPtr->myReadLength);
1393 }
1394 
1395 
1396 // This method returns the mate reference name. If it is equal to the
1397 // reference name, it still returns the reference name.
1399 {
1400  myStatus = SamStatus::SUCCESS;
1401  return myMateReferenceName.c_str();
1402 }
1403 
1404 
1405 // This method returns the mate reference name. If it is equal to the
1406 // reference name, it returns "=", unless they are both "*" in which case
1407 // "*" is returned.
1409 {
1410  myStatus = SamStatus::SUCCESS;
1411  if(myMateReferenceName == "*")
1412  {
1413  return(myMateReferenceName);
1414  }
1415  if(myMateReferenceName == getReferenceName())
1416  {
1417  return(FIELD_ABSENT_STRING);
1418  }
1419  else
1420  {
1421  return(myMateReferenceName);
1422  }
1423 }
1424 
1425 
1427 {
1428  myStatus = SamStatus::SUCCESS;
1429  return myRecordPtr->myMateReferenceID;
1430 }
1431 
1432 
1434 {
1435  myStatus = SamStatus::SUCCESS;
1436  return (myRecordPtr->myMatePosition + 1);
1437 }
1438 
1439 
1441 {
1442  myStatus = SamStatus::SUCCESS;
1443  return myRecordPtr->myMatePosition;
1444 }
1445 
1446 
1448 {
1449  myStatus = SamStatus::SUCCESS;
1450  return myRecordPtr->myInsertSize;
1451 }
1452 
1453 
1454 // Returns the inclusive rightmost position of the clipped sequence.
1456 {
1457  myStatus = SamStatus::SUCCESS;
1458  if(myAlignmentLength == -1)
1459  {
1460  // Alignment end has not been set, so calculate it.
1461  parseCigar();
1462  }
1463  // If alignment length > 0, subtract 1 from it to get the end.
1464  if(myAlignmentLength == 0)
1465  {
1466  // Length is 0, just return the start position.
1467  return(myRecordPtr->myPosition);
1468  }
1469  return(myRecordPtr->myPosition + myAlignmentLength - 1);
1470 }
1471 
1472 
1473 // Returns the inclusive rightmost position of the clipped sequence.
1475 {
1476  return(get0BasedAlignmentEnd() + 1);
1477 }
1478 
1479 
1480 // Return the length of the alignment.
1482 {
1483  myStatus = SamStatus::SUCCESS;
1484  if(myAlignmentLength == -1)
1485  {
1486  // Alignment end has not been set, so calculate it.
1487  parseCigar();
1488  }
1489  // Return the alignment length.
1490  return(myAlignmentLength);
1491 }
1492 
1493 // Returns the inclusive left-most position adjust for clipped bases.
1495 {
1496  myStatus = SamStatus::SUCCESS;
1497  if(myUnclippedStartOffset == -1)
1498  {
1499  // Unclipped has not yet been calculated, so parse the cigar to get it
1500  parseCigar();
1501  }
1502  return(myRecordPtr->myPosition - myUnclippedStartOffset);
1503 }
1504 
1505 
1506 // Returns the inclusive left-most position adjust for clipped bases.
1508 {
1509  return(get0BasedUnclippedStart() + 1);
1510 }
1511 
1512 
1513 // Returns the inclusive right-most position adjust for clipped bases.
1515 {
1516  // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the
1517  // cigar has not yet been parsed, so no need to check it here.
1518  return(get0BasedAlignmentEnd() + myUnclippedEndOffset);
1519 }
1520 
1521 
1522 // Returns the inclusive right-most position adjust for clipped bases.
1524 {
1525  return(get0BasedUnclippedEnd() + 1);
1526 }
1527 
1528 
1529 // Get the read name.
1531 {
1532  myStatus = SamStatus::SUCCESS;
1533  if(myReadName.Length() == 0)
1534  {
1535  // 0 Length, means that it is in the buffer, but has not yet
1536  // been synced to the string, so do the sync.
1537  myReadName = (char*)&(myRecordPtr->myData);
1538  }
1539  return myReadName.c_str();
1540 }
1541 
1542 
1543 const char* SamRecord::getCigar()
1544 {
1545  myStatus = SamStatus::SUCCESS;
1546  if(myCigar.Length() == 0)
1547  {
1548  // 0 Length, means that it is in the buffer, but has not yet
1549  // been synced to the string, so do the sync.
1550  parseCigarBinary();
1551  }
1552  return myCigar.c_str();
1553 }
1554 
1555 
1557 {
1558  return(getSequence(mySequenceTranslation));
1559 }
1560 
1561 
1563 {
1564  myStatus = SamStatus::SUCCESS;
1565  if(mySequence.Length() == 0)
1566  {
1567  // 0 Length, means that it is in the buffer, but has not yet
1568  // been synced to the string, so do the sync.
1569  setSequenceAndQualityFromBuffer();
1570  }
1571 
1572  // Determine if translation needs to be done.
1573  if((translation == NONE) || (myRefPtr == NULL))
1574  {
1575  return mySequence.c_str();
1576  }
1577  else if(translation == EQUAL)
1578  {
1579  if(mySeqWithEq.length() == 0)
1580  {
1581  // Check to see if the sequence is defined.
1582  if(mySequence == "*")
1583  {
1584  // Sequence is undefined, so no translation necessary.
1585  mySeqWithEq = '*';
1586  }
1587  else
1588  {
1589  // Sequence defined, so translate it.
1590  SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
1591  myRecordPtr->myPosition,
1592  *(getCigarInfo()),
1593  getReferenceName(),
1594  *myRefPtr,
1595  mySeqWithEq);
1596  }
1597  }
1598  return(mySeqWithEq.c_str());
1599  }
1600  else
1601  {
1602  // translation == BASES
1603  if(mySeqWithoutEq.length() == 0)
1604  {
1605  if(mySequence == "*")
1606  {
1607  // Sequence is undefined, so no translation necessary.
1608  mySeqWithoutEq = '*';
1609  }
1610  else
1611  {
1612  // Sequence defined, so translate it.
1613  SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
1614  myRecordPtr->myPosition,
1615  *(getCigarInfo()),
1616  getReferenceName(),
1617  *myRefPtr,
1618  mySeqWithoutEq);
1619  }
1620  }
1621  return(mySeqWithoutEq.c_str());
1622  }
1623 }
1624 
1625 
1627 {
1628  myStatus = SamStatus::SUCCESS;
1629  if(myQuality.Length() == 0)
1630  {
1631  // 0 Length, means that it is in the buffer, but has not yet
1632  // been synced to the string, so do the sync.
1633  setSequenceAndQualityFromBuffer();
1634  }
1635  return myQuality.c_str();
1636 }
1637 
1638 
1639 char SamRecord::getSequence(int index)
1640 {
1641  return(getSequence(index, mySequenceTranslation));
1642 }
1643 
1644 
1645 char SamRecord::getSequence(int index, SequenceTranslation translation)
1646 {
1647  static const char * asciiBases = "=AC.G...T......N";
1648 
1649  // Determine the read length.
1650  int32_t readLen = getReadLength();
1651 
1652  // If the read length is 0, this method should not be called.
1653  if(readLen == 0)
1654  {
1655  String exceptionString = "SamRecord::getSequence(";
1656  exceptionString += index;
1657  exceptionString += ") is not allowed since sequence = '*'";
1658  throw std::runtime_error(exceptionString.c_str());
1659  }
1660  else if((index < 0) || (index >= readLen))
1661  {
1662  // Only get here if the index was out of range, so thow an exception.
1663  String exceptionString = "SamRecord::getSequence(";
1664  exceptionString += index;
1665  exceptionString += ") is out of range. Index must be between 0 and ";
1666  exceptionString += (readLen - 1);
1667  throw std::runtime_error(exceptionString.c_str());
1668  }
1669 
1670  // Determine if translation needs to be done.
1671  if((translation == NONE) || (myRefPtr == NULL))
1672  {
1673  // No translation needs to be done.
1674  if(mySequence.Length() == 0)
1675  {
1676  // Parse BAM sequence.
1677  if(myIsSequenceBufferValid)
1678  {
1679  return(index & 1 ?
1680  asciiBases[myPackedSequence[index / 2] & 0xF] :
1681  asciiBases[myPackedSequence[index / 2] >> 4]);
1682  }
1683  else
1684  {
1685  String exceptionString = "SamRecord::getSequence(";
1686  exceptionString += index;
1687  exceptionString += ") called with no sequence set";
1688  throw std::runtime_error(exceptionString.c_str());
1689  }
1690  }
1691  // Already have string.
1692  return(mySequence[index]);
1693  }
1694  else
1695  {
1696  // Need to translate the sequence either to have '=' or to not
1697  // have it.
1698  // First check to see if the sequence has been set.
1699  if(mySequence.Length() == 0)
1700  {
1701  // 0 Length, means that it is in the buffer, but has not yet
1702  // been synced to the string, so do the sync.
1703  setSequenceAndQualityFromBuffer();
1704  }
1705 
1706  // Check the type of translation.
1707  if(translation == EQUAL)
1708  {
1709  // Check whether or not the string has already been
1710  // retrieved that has the '=' in it.
1711  if(mySeqWithEq.length() == 0)
1712  {
1713  // The string with '=' has not yet been determined,
1714  // so get the string.
1715  // Check to see if the sequence is defined.
1716  if(mySequence == "*")
1717  {
1718  // Sequence is undefined, so no translation necessary.
1719  mySeqWithEq = '*';
1720  }
1721  else
1722  {
1723  // Sequence defined, so translate it.
1724  SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(),
1725  myRecordPtr->myPosition,
1726  *(getCigarInfo()),
1727  getReferenceName(),
1728  *myRefPtr,
1729  mySeqWithEq);
1730  }
1731  }
1732  // Sequence is set, so return it.
1733  return(mySeqWithEq[index]);
1734  }
1735  else
1736  {
1737  // translation == BASES
1738  // Check whether or not the string has already been
1739  // retrieved that does not have the '=' in it.
1740  if(mySeqWithoutEq.length() == 0)
1741  {
1742  // The string with '=' has not yet been determined,
1743  // so get the string.
1744  // Check to see if the sequence is defined.
1745  if(mySequence == "*")
1746  {
1747  // Sequence is undefined, so no translation necessary.
1748  mySeqWithoutEq = '*';
1749  }
1750  else
1751  {
1752  // Sequence defined, so translate it.
1753  // The string without '=' has not yet been determined,
1754  // so get the string.
1755  SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(),
1756  myRecordPtr->myPosition,
1757  *(getCigarInfo()),
1758  getReferenceName(),
1759  *myRefPtr,
1760  mySeqWithoutEq);
1761  }
1762  }
1763  // Sequence is set, so return it.
1764  return(mySeqWithoutEq[index]);
1765  }
1766  }
1767 }
1768 
1769 
1770 char SamRecord::getQuality(int index)
1771 {
1772  // Determine the read length.
1773  int32_t readLen = getReadLength();
1774 
1775  // If the read length is 0, return ' ' whose ascii code is below
1776  // the minimum ascii code for qualities.
1777  if(readLen == 0)
1778  {
1780  }
1781  else if((index < 0) || (index >= readLen))
1782  {
1783  // Only get here if the index was out of range, so thow an exception.
1784  String exceptionString = "SamRecord::getQuality(";
1785  exceptionString += index;
1786  exceptionString += ") is out of range. Index must be between 0 and ";
1787  exceptionString += (readLen - 1);
1788  throw std::runtime_error(exceptionString.c_str());
1789  }
1790 
1791  if(myQuality.Length() == 0)
1792  {
1793  // Parse BAM Quality.
1794  // Know that myPackedQuality is correct since readLen != 0.
1795  return(myPackedQuality[index] + 33);
1796  }
1797  else
1798  {
1799  // Already have string.
1800  if((myQuality.Length() == 1) && (myQuality[0] == '*'))
1801  {
1802  // Return the unknown quality character.
1804  }
1805  else if(index >= myQuality.Length())
1806  {
1807  // Only get here if the index was out of range, so thow an exception.
1808  // Technically the myQuality string is not guaranteed to be the same length
1809  // as the sequence, so this catches that error.
1810  String exceptionString = "SamRecord::getQuality(";
1811  exceptionString += index;
1812  exceptionString += ") is out of range. Index must be between 0 and ";
1813  exceptionString += (myQuality.Length() - 1);
1814  throw std::runtime_error(exceptionString.c_str());
1815  }
1816  else
1817  {
1818  return(myQuality[index]);
1819  }
1820  }
1821 }
1822 
1823 
1825 {
1826  // Check to see whether or not the Cigar has already been
1827  // set - this is determined by checking if alignment length
1828  // is set since alignment length and the cigar are set
1829  // at the same time.
1830  if(myAlignmentLength == -1)
1831  {
1832  // Not been set, so calculate it.
1833  parseCigar();
1834  }
1835  return(&myCigarRoller);
1836 }
1837 
1838 
1839 // Return the number of bases in this read that overlap the passed in
1840 // region. (start & end are 0-based)
1841 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
1842 {
1843  // Determine whether or not the cigar has been parsed, which sets up
1844  // the cigar roller. This is determined by checking the alignment length.
1845  if(myAlignmentLength == -1)
1846  {
1847  parseCigar();
1848  }
1849  return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
1850 }
1851 
1852 
1853 // Returns the values of all fields except the tags.
1854 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName,
1855  String& cigar, String& sequence, String& quality)
1856 {
1857  return(getFields(recStruct, readName, cigar, sequence, quality,
1858  mySequenceTranslation));
1859 }
1860 
1861 
1862 // Returns the values of all fields except the tags.
1863 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName,
1864  String& cigar, String& sequence, String& quality,
1865  SequenceTranslation translation)
1866 {
1867  myStatus = SamStatus::SUCCESS;
1868  if(myIsBufferSynced == false)
1869  {
1870  if(!fixBuffer(translation))
1871  {
1872  // failed to set the buffer, return false.
1873  return(false);
1874  }
1875  }
1876  memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct));
1877 
1878  readName = getReadName();
1879  // Check the status.
1880  if(myStatus != SamStatus::SUCCESS)
1881  {
1882  // Failed to set the fields, return false.
1883  return(false);
1884  }
1885  cigar = getCigar();
1886  // Check the status.
1887  if(myStatus != SamStatus::SUCCESS)
1888  {
1889  // Failed to set the fields, return false.
1890  return(false);
1891  }
1892  sequence = getSequence(translation);
1893  // Check the status.
1894  if(myStatus != SamStatus::SUCCESS)
1895  {
1896  // Failed to set the fields, return false.
1897  return(false);
1898  }
1899  quality = getQuality();
1900  // Check the status.
1901  if(myStatus != SamStatus::SUCCESS)
1902  {
1903  // Failed to set the fields, return false.
1904  return(false);
1905  }
1906  return(true);
1907 }
1908 
1909 
1910 // Returns the reference pointer.
1912 {
1913  return(myRefPtr);
1914 }
1915 
1916 
1918 {
1919  myStatus = SamStatus::SUCCESS;
1920  if(myNeedToSetTagsFromBuffer)
1921  {
1922  // Tags are only set in the buffer, so the size of the tags is
1923  // the length of the record minus the starting location of the tags.
1924  unsigned char * tagStart =
1925  (unsigned char *)myRecordPtr->myData
1926  + myRecordPtr->myReadNameLength
1927  + myRecordPtr->myCigarLength * sizeof(int)
1928  + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
1929 
1930  // The non-tags take up from the start of the record to the tag start.
1931  // Do not include the block size part of the record since it is not
1932  // included in the size.
1933  uint32_t nonTagSize =
1934  tagStart - (unsigned char*)&(myRecordPtr->myReferenceID);
1935  // Tags take up the size of the block minus the non-tag section.
1936  uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
1937  return(tagSize);
1938  }
1939 
1940  // Tags are stored outside the buffer, so myTagBufferSize is set.
1941  return(myTagBufferSize);
1942 }
1943 
1944 
1945 // Returns true if there is another tag and sets tag and vtype to the
1946 // appropriate values, and returns a pointer to the value.
1947 // Sets the Status to SUCCESS when a tag is successfully returned or
1948 // when there are no more tags. Otherwise the status is set to describe
1949 // why it failed (parsing, etc).
1950 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value)
1951 {
1952  myStatus = SamStatus::SUCCESS;
1953  if(myNeedToSetTagsFromBuffer)
1954  {
1955  if(!setTagsFromBuffer())
1956  {
1957  // Failed to read the tags from the buffer, so cannot
1958  // get tags.
1959  return(false);
1960  }
1961  }
1962 
1963  // Increment the tag index to start looking at the next tag.
1964  // At the beginning, it is set to -1.
1965  myLastTagIndex++;
1966  int maxTagIndex = extras.Capacity();
1967  if(myLastTagIndex >= maxTagIndex)
1968  {
1969  // Hit the end of the tags, return false, no more tags.
1970  // Status is still success since this is not an error,
1971  // it is just the end of the list.
1972  return(false);
1973  }
1974 
1975  bool tagFound = false;
1976  // Loop until a tag is found or the end of extras is hit.
1977  while((tagFound == false) && (myLastTagIndex < maxTagIndex))
1978  {
1979  if(extras.SlotInUse(myLastTagIndex))
1980  {
1981  // Found a slot to use.
1982  int key = extras.GetKey(myLastTagIndex);
1983  getTag(key, tag);
1984  getTypeFromKey(key, vtype);
1985  tagFound = true;
1986  // Get the value associated with the key based on the vtype.
1987  switch (vtype)
1988  {
1989  case 'f' :
1990  *value = getFloatPtr(myLastTagIndex);
1991  break;
1992  case 'i' :
1993  *value = getIntegerPtr(myLastTagIndex, vtype);
1994  if(vtype != 'A')
1995  {
1996  // Convert all int types to 'i'
1997  vtype = 'i';
1998  }
1999  break;
2000  case 'Z' :
2001  case 'B' :
2002  *value = getStringPtr(myLastTagIndex);
2003  break;
2004  default:
2006  "Unknown tag type");
2007  tagFound = false;
2008  break;
2009  }
2010  }
2011  if(!tagFound)
2012  {
2013  // Increment the index since a tag was not found.
2014  myLastTagIndex++;
2015  }
2016  }
2017  return(tagFound);
2018 }
2019 
2020 
2021 // Reset the tag iterator to the beginning of the tags.
2023 {
2024  myLastTagIndex = -1;
2025 }
2026 
2027 
2029 {
2030  if((vtype == 'c') || (vtype == 'C') ||
2031  (vtype == 's') || (vtype == 'S') ||
2032  (vtype == 'i') || (vtype == 'I'))
2033  {
2034  return(true);
2035  }
2036  return(false);
2037 }
2038 
2039 
2040 bool SamRecord::isFloatType(char vtype)
2041 {
2042  if(vtype == 'f')
2043  {
2044  return(true);
2045  }
2046  return(false);
2047 }
2048 
2049 
2050 bool SamRecord::isCharType(char vtype)
2051 {
2052  if(vtype == 'A')
2053  {
2054  return(true);
2055  }
2056  return(false);
2057 }
2058 
2059 
2060 bool SamRecord::isStringType(char vtype)
2061 {
2062  if((vtype == 'Z') || (vtype == 'B'))
2063  {
2064  return(true);
2065  }
2066  return(false);
2067 }
2068 
2069 
2070 bool SamRecord::getTagsString(const char* tags, String& returnString, char delim)
2071 {
2072  const char* currentTagPtr = tags;
2073 
2074  returnString.Clear();
2075  myStatus = SamStatus::SUCCESS;
2076  if(myNeedToSetTagsFromBuffer)
2077  {
2078  if(!setTagsFromBuffer())
2079  {
2080  // Failed to read the tags from the buffer, so cannot
2081  // get tags.
2082  return(false);
2083  }
2084  }
2085 
2086  bool returnStatus = true;
2087 
2088  while(*currentTagPtr != '\0')
2089  {
2090  // Tags are formatted as: XY:Z
2091  // Where X is [A-Za-z], Y is [A-Za-z], and
2092  // Z is A,i,f,Z,H (cCsSI are also excepted)
2093  if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
2094  (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
2095  {
2096  myStatus.setStatus(SamStatus::INVALID,
2097  "getTagsString called with improperly formatted tags.\n");
2098  returnStatus = false;
2099  break;
2100  }
2101 
2102  // Construct the key.
2103  int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
2104  currentTagPtr[3]);
2105  // Look to see if the key exsists in the hash.
2106  int offset = extras.Find(key);
2107 
2108  if(offset >= 0)
2109  {
2110  // Offset is set, so the key was found.
2111  if(!returnString.IsEmpty())
2112  {
2113  returnString += delim;
2114  }
2115  returnString += currentTagPtr[0];
2116  returnString += currentTagPtr[1];
2117  returnString += ':';
2118  returnString += currentTagPtr[3];
2119  returnString += ':';
2120 
2121  // First if it is an integer, determine the actual type of the int.
2122  char vtype;
2123  getTypeFromKey(key, vtype);
2124 
2125  switch(vtype)
2126  {
2127  case 'i':
2128  returnString += *(int*)getIntegerPtr(offset, vtype);
2129  break;
2130  case 'f':
2131  returnString += *(float*)getFloatPtr(offset);
2132  break;
2133  case 'Z':
2134  case 'B':
2135  returnString += *(String*)getStringPtr(offset);
2136  break;
2137  default:
2138  myStatus.setStatus(SamStatus::INVALID,
2139  "rmTag called with unknown type.\n");
2140  returnStatus = false;
2141  break;
2142  };
2143  }
2144  // Increment to the next tag.
2145  if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ','))
2146  {
2147  // Increment once more.
2148  currentTagPtr += 5;
2149  }
2150  else if(currentTagPtr[4] != '\0')
2151  {
2152  // Invalid tag format.
2153  myStatus.setStatus(SamStatus::INVALID,
2154  "rmTags called with improperly formatted tags.\n");
2155  returnStatus = false;
2156  break;
2157  }
2158  else
2159  {
2160  // Last Tag.
2161  currentTagPtr += 4;
2162  }
2163  }
2164  return(returnStatus);
2165 }
2166 
2167 
2168 const String* SamRecord::getStringTag(const char * tag)
2169 {
2170  // Parse the buffer if necessary.
2171  if(myNeedToSetTagsFromBuffer)
2172  {
2173  if(!setTagsFromBuffer())
2174  {
2175  // Failed to read the tags from the buffer, so cannot
2176  // get tags. setTagsFromBuffer set the errors,
2177  // so just return null.
2178  return(NULL);
2179  }
2180  }
2181 
2182  int key = MAKEKEY(tag[0], tag[1], 'Z');
2183  int offset = extras.Find(key);
2184 
2185  int value;
2186  if (offset < 0)
2187  {
2188  // Check for 'B' tag.
2189  key = MAKEKEY(tag[0], tag[1], 'B');
2190  offset = extras.Find(key);
2191  if(offset < 0)
2192  {
2193  // Tag not found.
2194  return(NULL);
2195  }
2196  }
2197 
2198  // Offset is valid, so return the tag.
2199  value = extras[offset];
2200  return(&(strings[value]));
2201 }
2202 
2203 
2204 int* SamRecord::getIntegerTag(const char * tag)
2205 {
2206  // Init to success.
2207  myStatus = SamStatus::SUCCESS;
2208  // Parse the buffer if necessary.
2209  if(myNeedToSetTagsFromBuffer)
2210  {
2211  if(!setTagsFromBuffer())
2212  {
2213  // Failed to read the tags from the buffer, so cannot
2214  // get tags. setTagsFromBuffer set the errors,
2215  // so just return NULL.
2216  return(NULL);
2217  }
2218  }
2219 
2220  int key = MAKEKEY(tag[0], tag[1], 'i');
2221  int offset = extras.Find(key);
2222 
2223  int value;
2224  if (offset < 0)
2225  {
2226  // Failed to find the tag.
2227  return(NULL);
2228  }
2229  else
2230  value = extras[offset];
2231 
2232  return(&(integers[value]));
2233 }
2234 
2235 
2236 bool SamRecord::getIntegerTag(const char * tag, int& tagVal)
2237 {
2238  // Init to success.
2239  myStatus = SamStatus::SUCCESS;
2240  // Parse the buffer if necessary.
2241  if(myNeedToSetTagsFromBuffer)
2242  {
2243  if(!setTagsFromBuffer())
2244  {
2245  // Failed to read the tags from the buffer, so cannot
2246  // get tags. setTagsFromBuffer set the errors,
2247  // so just return false.
2248  return(false);
2249  }
2250  }
2251 
2252  int key = MAKEKEY(tag[0], tag[1], 'i');
2253  int offset = extras.Find(key);
2254 
2255  int value;
2256  if (offset < 0)
2257  {
2258  // Failed to find the tag.
2259  return(false);
2260  }
2261  else
2262  value = extras[offset];
2263 
2264  tagVal = integers[value];
2265  return(true);
2266 }
2267 
2268 
2269 bool SamRecord::getFloatTag(const char * tag, float& tagVal)
2270 {
2271  // Init to success.
2272  myStatus = SamStatus::SUCCESS;
2273  // Parse the buffer if necessary.
2274  if(myNeedToSetTagsFromBuffer)
2275  {
2276  if(!setTagsFromBuffer())
2277  {
2278  // Failed to read the tags from the buffer, so cannot
2279  // get tags. setTagsFromBuffer set the errors,
2280  // so just return false.
2281  return(false);
2282  }
2283  }
2284 
2285  int key = MAKEKEY(tag[0], tag[1], 'f');
2286  int offset = extras.Find(key);
2287 
2288  int value;
2289  if (offset < 0)
2290  {
2291  // Failed to find the tag.
2292  return(false);
2293  }
2294  else
2295  value = extras[offset];
2296 
2297  tagVal = floats[value];
2298  return(true);
2299 }
2300 
2301 
2302 const String & SamRecord::getString(const char * tag)
2303 {
2304  // Init to success.
2305  myStatus = SamStatus::SUCCESS;
2306  // Parse the buffer if necessary.
2307  if(myNeedToSetTagsFromBuffer)
2308  {
2309  if(!setTagsFromBuffer())
2310  {
2311  // Failed to read the tags from the buffer, so cannot
2312  // get tags.
2313  // TODO - what do we want to do on failure?
2314  }
2315  }
2316 
2317  int key = MAKEKEY(tag[0], tag[1], 'Z');
2318  int offset = extras.Find(key);
2319 
2320  int value;
2321  if (offset < 0)
2322  {
2323 
2324  key = MAKEKEY(tag[0], tag[1], 'B');
2325  offset = extras.Find(key);
2326  if (offset < 0)
2327  {
2328  // TODO - what do we want to do on failure?
2329  return(NOT_FOUND_TAG_STRING);
2330  }
2331  }
2332  value = extras[offset];
2333 
2334  return strings[value];
2335 }
2336 
2337 
2338 int & SamRecord::getInteger(const char * tag)
2339 {
2340  // Init to success.
2341  myStatus = SamStatus::SUCCESS;
2342  // Parse the buffer if necessary.
2343  if(myNeedToSetTagsFromBuffer)
2344  {
2345  if(!setTagsFromBuffer())
2346  {
2347  // Failed to read the tags from the buffer, so cannot
2348  // get tags. setTagsFromBuffer set the error.
2349  // TODO - what do we want to do on failure?
2350  }
2351  }
2352 
2353  int key = MAKEKEY(tag[0], tag[1], 'i');
2354  int offset = extras.Find(key);
2355 
2356  int value;
2357  if (offset < 0)
2358  {
2359  // TODO - what do we want to do on failure?
2360  return NOT_FOUND_TAG_INT;
2361  }
2362  else
2363  value = extras[offset];
2364 
2365  return integers[value];
2366 }
2367 
2368 
2369 bool SamRecord::checkTag(const char * tag, char type)
2370 {
2371  // Init to success.
2372  myStatus = SamStatus::SUCCESS;
2373  // Parse the buffer if necessary.
2374  if(myNeedToSetTagsFromBuffer)
2375  {
2376  if(!setTagsFromBuffer())
2377  {
2378  // Failed to read the tags from the buffer, so cannot
2379  // get tags. setTagsFromBuffer set the error.
2380  return("");
2381  }
2382  }
2383 
2384  int key = MAKEKEY(tag[0], tag[1], type);
2385 
2386  return (extras.Find(key) != LH_NOTFOUND);
2387 }
2388 
2389 
2390 // Return the error after a failed SamRecord call.
2392 {
2393  return(myStatus);
2394 }
2395 
2396 
2397 // Allocate space for the record - does a realloc.
2398 // The passed in size is the size of the entire record including the
2399 // block size field.
2400 bool SamRecord::allocateRecordStructure(int size)
2401 {
2402  if (allocatedSize < size)
2403  {
2404  bamRecordStruct* tmpRecordPtr =
2405  (bamRecordStruct *)realloc(myRecordPtr, size);
2406  if(tmpRecordPtr == NULL)
2407  {
2408  // FAILED to allocate memory
2409  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2410  myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation.");
2411  return(false);
2412  }
2413  // Successfully allocated memory, so set myRecordPtr.
2414  myRecordPtr = tmpRecordPtr;
2415 
2416  // Reset the pointers into the record.
2417  if(myIsSequenceBufferValid)
2418  {
2419  myPackedSequence = (unsigned char *)myRecordPtr->myData +
2420  myRecordPtr->myReadNameLength +
2421  myRecordPtr->myCigarLength * sizeof(int);
2422  }
2423  if(myIsQualityBufferValid)
2424  {
2425  myPackedQuality = (unsigned char *)myRecordPtr->myData +
2426  myRecordPtr->myReadNameLength +
2427  myRecordPtr->myCigarLength * sizeof(int) +
2428  (myRecordPtr->myReadLength + 1) / 2;
2429  }
2430 
2431  allocatedSize = size;
2432  }
2433  return(true);
2434 }
2435 
2436 
2437 // Index is the index into the strings array.
2438 void* SamRecord::getStringPtr(int index)
2439 {
2440  int value = extras[index];
2441 
2442  return &(strings[value]);
2443 }
2444 
2445 void* SamRecord::getIntegerPtr(int offset, char& type)
2446 {
2447  int value = extras[offset];
2448 
2449  type = intType[value];
2450 
2451  return &(integers[value]);
2452 }
2453 
2454 void* SamRecord::getFloatPtr(int offset)
2455 {
2456  int value = extras[offset];
2457 
2458  return &(floats[value]);
2459 }
2460 
2461 
2462 // Fixes the buffer to match the variable length fields if they are set.
2463 bool SamRecord::fixBuffer(SequenceTranslation translation)
2464 {
2465  // Check to see if the buffer is already synced.
2466  if(myIsBufferSynced &&
2467  (myBufferSequenceTranslation == translation))
2468  {
2469  // Already synced, nothing to do.
2470  return(true);
2471  }
2472 
2473  // Set the bin if necessary.
2474  if(!myIsBinValid)
2475  {
2476  // The bin that is set in the record is not valid, so
2477  // reset it.
2478  myRecordPtr->myBin =
2479  bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());
2480  myIsBinValid = true;
2481  }
2482 
2483  // Not synced.
2484  bool status = true;
2485 
2486  // First determine the size the buffer needs to be.
2487  uint8_t newReadNameLen = getReadNameLength();
2488  uint16_t newCigarLen = getCigarLength();
2489  int32_t newReadLen = getReadLength();
2490  uint32_t newTagLen = getTagLength();
2491  uint32_t bamSequenceLen = (newReadLen+1)/2;
2492 
2493  // The buffer size extends from the start of the record to data
2494  // plus the length of the variable fields,
2495  // Multiply the cigar length by 4 since it is the number of uint32_t fields.
2496  int newBufferSize =
2497  ((unsigned char*)(&(myRecordPtr->myData)) -
2498  (unsigned char*)myRecordPtr) +
2499  newReadNameLen + ((newCigarLen)*4) +
2500  newReadLen + bamSequenceLen + newTagLen;
2501 
2502  if(!allocateRecordStructure(newBufferSize))
2503  {
2504  // Failed to allocate space.
2505  return(false);
2506  }
2507 
2508  // Now that space has been added to the buffer, check to see what if
2509  // any fields need to be extracted from the buffer prior to starting to
2510  // overwrite it. Fields need to be extracted from the buffer if the
2511  // buffer is valid for the field and a previous variable length field has
2512  // changed length.
2513  bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
2514  bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
2515  bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
2516 
2517  // If the tags are still stored in the buffer and any other fields changed
2518  // lengths, they need to be extracted.
2519  if(myIsTagsBufferValid &&
2520  (readNameLenChange | cigarLenChange | readLenChange))
2521  {
2522  status &= setTagsFromBuffer();
2523  // The tag buffer will not be valid once the other fields
2524  // are written, so set it to not valid.
2525  myIsTagsBufferValid = false;
2526  }
2527 
2528  // If the sequence or quality strings are still stored in the buffer, and
2529  // any of the previous fields have changed length, extract it from the
2530  // current buffer.
2531  if((myIsQualityBufferValid | myIsSequenceBufferValid) &&
2532  (readNameLenChange | cigarLenChange | readLenChange))
2533  {
2534  setSequenceAndQualityFromBuffer();
2535  // The quality and sequence buffers will not be valid once the other
2536  // fields are written, so set them to not valid.
2537  myIsQualityBufferValid = false;
2538  myIsSequenceBufferValid = false;
2539  }
2540 
2541  // If the cigar is still stored in the buffer, and any of the
2542  // previous fields have changed length, extract it from the current buffer.
2543  if((myIsCigarBufferValid) &&
2544  (readNameLenChange))
2545  {
2546  status &= parseCigarBinary();
2547  myIsCigarBufferValid = false;
2548  }
2549 
2550  // Set each value in the buffer if it is not already valid.
2551  if(!myIsReadNameBufferValid)
2552  {
2553  memcpy(&(myRecordPtr->myData), myReadName.c_str(),
2554  newReadNameLen);
2555 
2556  // Set the new ReadNameLength.
2557  myRecordPtr->myReadNameLength = newReadNameLen;
2558  myIsReadNameBufferValid = true;
2559  }
2560 
2561  unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
2562  myRecordPtr->myReadNameLength;
2563 
2564  // Set the Cigar. Need to reformat from the string to
2565  unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
2566 
2567  if(!myIsCigarBufferValid)
2568  {
2569  // The cigar was already parsed when it was set, so just copy
2570  // data from the temporary buffer.
2571  myRecordPtr->myCigarLength = newCigarLen;
2572  memcpy(packedCigar, myCigarTempBuffer,
2573  myRecordPtr->myCigarLength * sizeof(uint32_t));
2574 
2575  myIsCigarBufferValid = true;
2576  }
2577 
2578  unsigned char * packedSequence = readNameEnds +
2579  myRecordPtr->myCigarLength * sizeof(int);
2580  unsigned char * packedQuality = packedSequence + bamSequenceLen;
2581 
2582  if(!myIsSequenceBufferValid || !myIsQualityBufferValid ||
2583  (myBufferSequenceTranslation != translation))
2584  {
2585  myRecordPtr->myReadLength = newReadLen;
2586  // Determine if the quality needs to be set and is just a * and needs to
2587  // be set to 0xFF.
2588  bool noQuality = false;
2589  if((myQuality.Length() == 1) && (myQuality[0] == '*'))
2590  {
2591  noQuality = true;
2592  }
2593 
2594  const char* translatedSeq = NULL;
2595  // If the sequence is not valid in the buffer or it is not
2596  // properly translated, get the properly translated sequence
2597  // that needs to be put into the buffer.
2598  if((!myIsSequenceBufferValid) ||
2599  (translation != myBufferSequenceTranslation))
2600  {
2601  translatedSeq = getSequence(translation);
2602  }
2603 
2604  for (int i = 0; i < myRecordPtr->myReadLength; i++)
2605  {
2606  if((!myIsSequenceBufferValid) ||
2607  (translation != myBufferSequenceTranslation))
2608  {
2609  // Sequence buffer is not valid, so set the sequence.
2610  int seqVal = 0;
2611  switch(translatedSeq[i])
2612  {
2613  case '=':
2614  seqVal = 0;
2615  break;
2616  case 'A':
2617  case 'a':
2618  seqVal = 1;
2619  break;
2620  case 'C':
2621  case 'c':
2622  seqVal = 2;
2623  break;
2624  case 'G':
2625  case 'g':
2626  seqVal = 4;
2627  break;
2628  case 'T':
2629  case 't':
2630  seqVal = 8;
2631  break;
2632  case 'N':
2633  case 'n':
2634  case '.':
2635  seqVal = 15;
2636  break;
2637  default:
2639  "Unknown Sequence character found.");
2640  status = false;
2641  break;
2642  };
2643 
2644  if(i & 1)
2645  {
2646  // Odd number i's go in the lower 4 bits, so OR in the
2647  // lower bits
2648  packedSequence[i/2] |= seqVal;
2649  }
2650  else
2651  {
2652  // Even i's go in the upper 4 bits and are always set first.
2653  packedSequence[i/2] = seqVal << 4;
2654  }
2655  }
2656 
2657  if(!myIsQualityBufferValid)
2658  {
2659  // Set the quality.
2660  if((noQuality) || (myQuality.Length() <= i))
2661  {
2662  // No quality or the quality is smaller than the sequence,
2663  // so set it to 0xFF
2664  packedQuality[i] = 0xFF;
2665  }
2666  else
2667  {
2668  // Copy the quality string.
2669  packedQuality[i] = myQuality[i] - 33;
2670  }
2671  }
2672  }
2673  myPackedSequence = (unsigned char *)myRecordPtr->myData +
2674  myRecordPtr->myReadNameLength +
2675  myRecordPtr->myCigarLength * sizeof(int);
2676  myPackedQuality = myPackedSequence +
2677  (myRecordPtr->myReadLength + 1) / 2;
2678  myIsSequenceBufferValid = true;
2679  myIsQualityBufferValid = true;
2680  myBufferSequenceTranslation = translation;
2681  }
2682 
2683  if(!myIsTagsBufferValid)
2684  {
2685  status &= setTagsInBuffer();
2686  }
2687 
2688  // Set the lengths in the buffer.
2689  myRecordPtr->myReadNameLength = newReadNameLen;
2690  myRecordPtr->myCigarLength = newCigarLen;
2691  myRecordPtr->myReadLength = newReadLen;
2692 
2693  // Set the buffer block size to the size of the buffer minus the
2694  // first field.
2695  myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t);
2696 
2697  if(status)
2698  {
2699  myIsBufferSynced = true;
2700  }
2701 
2702  return(status);
2703 }
2704 
2705 
2706 // Sets the Sequence and Quality strings from the buffer.
2707 // They are done together in one method because they require the same
2708 // loop, so might as well be done at the same time.
2709 void SamRecord::setSequenceAndQualityFromBuffer()
2710 {
2711  // NOTE: If the sequence buffer is not valid, do not set the sequence
2712  // string from the buffer.
2713  // NOTE: If the quality buffer is not valid, do not set the quality string
2714  // from the buffer.
2715 
2716  // Extract the sequence if the buffer is valid and the string's length is 0.
2717  bool extractSequence = false;
2718  if(myIsSequenceBufferValid && (mySequence.Length() == 0))
2719  {
2720  extractSequence = true;
2721  }
2722 
2723  // Extract the quality if the buffer is valid and the string's length is 0.
2724  bool extractQuality = false;
2725  if(myIsQualityBufferValid && (myQuality.Length() == 0))
2726  {
2727  extractQuality = true;
2728  }
2729 
2730  // If neither the quality nor the sequence need to be extracted,
2731  // just return.
2732  if(!extractSequence && !extractQuality)
2733  {
2734  return;
2735  }
2736 
2737  // Set the sequence and quality strings..
2738  if(extractSequence)
2739  {
2740  mySequence.SetLength(myRecordPtr->myReadLength);
2741  }
2742  if(extractQuality)
2743  {
2744  myQuality.SetLength(myRecordPtr->myReadLength);
2745  }
2746 
2747  const char * asciiBases = "=AC.G...T......N";
2748 
2749  // Flag to see if the quality is specified - the quality contains a value
2750  // other than 0xFF. If all values are 0xFF, then there is no quality.
2751  bool qualitySpecified = false;
2752 
2753  for (int i = 0; i < myRecordPtr->myReadLength; i++)
2754  {
2755  if(extractSequence)
2756  {
2757  mySequence[i] = i & 1 ?
2758  asciiBases[myPackedSequence[i / 2] & 0xF] :
2759  asciiBases[myPackedSequence[i / 2] >> 4];
2760  }
2761 
2762  if(extractQuality)
2763  {
2764  if(myPackedQuality[i] != 0xFF)
2765  {
2766  // Quality is specified, so mark the flag.
2767  qualitySpecified = true;
2768  }
2769 
2770  myQuality[i] = myPackedQuality[i] + 33;
2771  }
2772  }
2773 
2774  // If the read length is 0, then set the sequence and quality to '*'
2775  if(myRecordPtr->myReadLength == 0)
2776  {
2777  if(extractSequence)
2778  {
2779  mySequence = "*";
2780  }
2781  if(extractQuality)
2782  {
2783  myQuality = "*";
2784  }
2785  }
2786  else if(extractQuality && !qualitySpecified)
2787  {
2788  // No quality was specified, so set it to "*"
2789  myQuality = "*";
2790  }
2791 }
2792 
2793 
2794 // Parse the cigar to calculate the alignment/unclipped end.
2795 bool SamRecord::parseCigar()
2796 {
2797  // Determine if the cigar string or cigar binary needs to be parsed.
2798  if(myCigar.Length() == 0)
2799  {
2800  // The cigar string is not yet set, so parse the binary.
2801  return(parseCigarBinary());
2802  }
2803  return(parseCigarString());
2804 }
2805 
2806 // Parse the cigar to calculate the alignment/unclipped end.
2807 bool SamRecord::parseCigarBinary()
2808 {
2809  // Only need to parse if the string is not already set.
2810  // The length of the cigar string is set to zero when the
2811  // record is read from a file into the buffer.
2812  if(myCigar.Length() != 0)
2813  {
2814  // Already parsed.
2815  return(true);
2816  }
2817 
2818  unsigned char * readNameEnds =
2819  (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
2820 
2821  unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
2822 
2823  myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength);
2824 
2825  myCigarRoller.getCigarString(myCigar);
2826 
2827  myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
2828 
2829  myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
2830  myUnclippedEndOffset = myCigarRoller.getNumEndClips();
2831 
2832  // if the cigar length is 0, then set the cigar string to "*"
2833  if(myRecordPtr->myCigarLength == 0)
2834  {
2835  myCigar = "*";
2836  return(true);
2837  }
2838 
2839  // Copy the cigar into a temporary buffer.
2840  int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t);
2841  if(newBufferSize > myCigarTempBufferAllocatedSize)
2842  {
2843  uint32_t* tempBufferPtr =
2844  (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2845  if(tempBufferPtr == NULL)
2846  {
2847  // Failed to allocate memory.
2848  // Do not parse, just return.
2849  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2850  myStatus.addError(SamStatus::FAIL_MEM,
2851  "Failed to Allocate Memory.");
2852  return(false);
2853  }
2854  myCigarTempBuffer = tempBufferPtr;
2855  myCigarTempBufferAllocatedSize = newBufferSize;
2856  }
2857 
2858  memcpy(myCigarTempBuffer, packedCigar,
2859  myRecordPtr->myCigarLength * sizeof(uint32_t));
2860 
2861  // Set the length of the temp buffer.
2862  myCigarTempBufferLength = myRecordPtr->myCigarLength;
2863 
2864  return(true);
2865 }
2866 
2867 // Parse the cigar string to calculate the cigar length and alignment end.
2868 bool SamRecord::parseCigarString()
2869 {
2870  myCigarTempBufferLength = 0;
2871  if(myCigar == "*")
2872  {
2873  // Cigar is empty, so initialize the variables.
2874  myAlignmentLength = 0;
2875  myUnclippedStartOffset = 0;
2876  myUnclippedEndOffset = 0;
2877  myCigarRoller.clear();
2878  return(true);
2879  }
2880 
2881  myCigarRoller.Set(myCigar);
2882 
2883  myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
2884 
2885  myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
2886  myUnclippedEndOffset = myCigarRoller.getNumEndClips();
2887 
2888  // Check to see if the Temporary Cigar Buffer is large enough to contain
2889  // this cigar. If we make it the size of the length of the cigar string,
2890  // it will be more than large enough.
2891  int newBufferSize = myCigar.Length() * sizeof(uint32_t);
2892  if(newBufferSize > myCigarTempBufferAllocatedSize)
2893  {
2894  uint32_t* tempBufferPtr =
2895  (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2896  if(tempBufferPtr == NULL)
2897  {
2898  // Failed to allocate memory.
2899  // Do not parse, just return.
2900  fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
2901  myStatus.addError(SamStatus::FAIL_MEM,
2902  "Failed to Allocate Memory.");
2903  return(false);
2904  }
2905  myCigarTempBuffer = tempBufferPtr;
2906  myCigarTempBufferAllocatedSize = newBufferSize;
2907  }
2908 
2909  // Track if there were any errors.
2910  bool status = true;
2911 
2912  // Track the index into the cigar string that is being parsed.
2913  char *cigarOp;
2914  const char* cigarEntryStart = myCigar.c_str();
2915  int opLen = 0;
2916  int op = 0;
2917 
2918  unsigned int * packedCigar = myCigarTempBuffer;
2919  // TODO - maybe one day make a cigar list... or maybe make a
2920  // reference cigar string for ease of lookup....
2921  const char* endCigarString = cigarEntryStart + myCigar.Length();
2922  while(cigarEntryStart < endCigarString)
2923  {
2924  bool validCigarEntry = true;
2925  // Get the opLen from the string. cigarOp will then point to
2926  // the operation.
2927  opLen = strtol(cigarEntryStart, &cigarOp, 10);
2928  // Switch on the type of operation.
2929  switch(*cigarOp)
2930  {
2931  case('M'):
2932  op = 0;
2933  break;
2934  case('I'):
2935  // Insert into the reference position, so do not increment the
2936  // reference end position.
2937  op = 1;
2938  break;
2939  case('D'):
2940  op = 2;
2941  break;
2942  case('N'):
2943  op = 3;
2944  break;
2945  case('S'):
2946  op = 4;
2947  break;
2948  case('H'):
2949  op = 5;
2950  break;
2951  case('P'):
2952  op = 6;
2953  break;
2954  default:
2955  fprintf(stderr, "ERROR parsing cigar\n");
2956  validCigarEntry = false;
2957  status = false;
2959  "Unknown operation found when parsing the Cigar.");
2960  break;
2961  };
2962  if(validCigarEntry)
2963  {
2964  // Increment the cigar length.
2965  ++myCigarTempBufferLength;
2966  *packedCigar = (opLen << 4) | op;
2967  packedCigar++;
2968  }
2969  // The next Entry starts at one past the cigar op, so set the start.
2970  cigarEntryStart = ++cigarOp;
2971  }
2972 
2973  // Check clipLength to adjust the end position.
2974  return(status);
2975 }
2976 
2977 
2978 bool SamRecord::setTagsFromBuffer()
2979 {
2980  // If the tags do not need to be set from the buffer, return true.
2981  if(myNeedToSetTagsFromBuffer == false)
2982  {
2983  // Already been set from the buffer.
2984  return(true);
2985  }
2986 
2987  // Mark false, as they are being set now.
2988  myNeedToSetTagsFromBuffer = false;
2989 
2990  unsigned char * extraPtr = myPackedQuality + myRecordPtr->myReadLength;
2991 
2992  // Default to success, will be changed to false on failure.
2993  bool status = true;
2994 
2995  // Clear any previously set tags.
2996  clearTags();
2997  while (myRecordPtr->myBlockSize + 4 -
2998  (extraPtr - (unsigned char *)myRecordPtr) > 0)
2999  {
3000  int key = 0;
3001  int value = 0;
3002  void * content = extraPtr + 3;
3003  int tagBufferSize = 0;
3004 
3005  key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]);
3006 
3007  // First check if the tag already exists.
3008  unsigned int location = extras.Find(key);
3009  int origIndex = 0;
3010  String* duplicate = NULL;
3011  String* origTag = NULL;
3012  if(location != LH_NOTFOUND)
3013  {
3014  duplicate = new String;
3015  origTag = new String;
3016  origIndex = extras[location];
3017 
3018  *duplicate = (char)(extraPtr[0]);
3019  *duplicate += (char)(extraPtr[1]);
3020  *duplicate += ':';
3021 
3022  *origTag = *duplicate;
3023  *duplicate += (char)(extraPtr[2]);
3024  *duplicate += ':';
3025  }
3026 
3027  switch (extraPtr[2])
3028  {
3029  case 'A' :
3030  if(duplicate != NULL)
3031  {
3032  *duplicate += (* (char *) content);
3033  *origTag += intType[origIndex];
3034  *origTag += ':';
3035  appendIntArrayValue(origIndex, *origTag);
3036  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3037  integers[origIndex] = *(char *)content;
3038  intType[origIndex] = extraPtr[2];
3039  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3040  }
3041  else
3042  {
3043  value = integers.Length();
3044  integers.Push(* (char *) content);
3045  intType.push_back(extraPtr[2]);
3046  tagBufferSize += 4;
3047  }
3048  extraPtr += 4;
3049  break;
3050  case 'c' :
3051  if(duplicate != NULL)
3052  {
3053  *duplicate += (* (char *) content);
3054  *origTag += intType[origIndex];
3055  *origTag += ':';
3056  appendIntArrayValue(origIndex, *origTag);
3057  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3058  integers[origIndex] = *(char *)content;
3059  intType[origIndex] = extraPtr[2];
3060  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3061  }
3062  else
3063  {
3064  value = integers.Length();
3065  integers.Push(* (char *) content);
3066  intType.push_back(extraPtr[2]);
3067  tagBufferSize += 4;
3068  }
3069  extraPtr += 4;
3070  break;
3071  case 'C' :
3072  if(duplicate != NULL)
3073  {
3074  *duplicate += (* (unsigned char *) content);
3075  *origTag += intType[origIndex];
3076  *origTag += ':';
3077  appendIntArrayValue(origIndex, *origTag);
3078  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3079  integers[origIndex] = *(unsigned char *)content;
3080  intType[origIndex] = extraPtr[2];
3081  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3082  }
3083  else
3084  {
3085  value = integers.Length();
3086  integers.Push(* (unsigned char *) content);
3087  intType.push_back(extraPtr[2]);
3088  tagBufferSize += 4;
3089  }
3090  extraPtr += 4;
3091  break;
3092  case 's' :
3093  if(duplicate != NULL)
3094  {
3095  *duplicate += (* (short *) content);
3096  *origTag += intType[origIndex];
3097  *origTag += ':';
3098  appendIntArrayValue(origIndex, *origTag);
3099  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3100  integers[origIndex] = *(short *)content;
3101  intType[origIndex] = extraPtr[2];
3102  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3103  }
3104  else
3105  {
3106  value = integers.Length();
3107  integers.Push(* (short *) content);
3108  intType.push_back(extraPtr[2]);
3109  tagBufferSize += 5;
3110  }
3111  extraPtr += 5;
3112  break;
3113  case 'S' :
3114  if(duplicate != NULL)
3115  {
3116  *duplicate += (* (unsigned short *) content);
3117  *origTag += intType[origIndex];
3118  *origTag += ':';
3119  appendIntArrayValue(origIndex, *origTag);
3120  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3121  integers[origIndex] = *(unsigned short *)content;
3122  intType[origIndex] = extraPtr[2];
3123  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3124  }
3125  else
3126  {
3127  value = integers.Length();
3128  integers.Push(* (unsigned short *) content);
3129  intType.push_back(extraPtr[2]);
3130  tagBufferSize += 5;
3131  }
3132  extraPtr += 5;
3133  break;
3134  case 'i' :
3135  if(duplicate != NULL)
3136  {
3137  *duplicate += (* (int *) content);
3138  *origTag += intType[origIndex];
3139  *origTag += ':';
3140  appendIntArrayValue(origIndex, *origTag);
3141  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3142  integers[origIndex] = *(int *)content;
3143  intType[origIndex] = extraPtr[2];
3144  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3145  }
3146  else
3147  {
3148  value = integers.Length();
3149  integers.Push(* (int *) content);
3150  intType.push_back(extraPtr[2]);
3151  tagBufferSize += 7;
3152  }
3153  extraPtr += 7;
3154  break;
3155  case 'I' :
3156  if(duplicate != NULL)
3157  {
3158  *duplicate += (* (unsigned int *) content);
3159  *origTag += intType[origIndex];
3160  *origTag += ':';
3161  appendIntArrayValue(origIndex, *origTag);
3162  tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3163  integers[origIndex] = *(unsigned int *)content;
3164  intType[origIndex] = extraPtr[2];
3165  tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3166  }
3167  else
3168  {
3169  value = integers.Length();
3170  integers.Push((int) * (unsigned int *) content);
3171  intType.push_back(extraPtr[2]);
3172  tagBufferSize += 7;
3173  }
3174  extraPtr += 7;
3175  break;
3176  case 'Z' :
3177  if(duplicate != NULL)
3178  {
3179  *duplicate += ((const char *) content);
3180  *origTag += 'Z';
3181  *origTag += ':';
3182  *origTag += (char*)(strings[origIndex]);
3183  tagBufferSize -= strings[origIndex].Length();
3184  strings[origIndex] = (const char *) content;
3185  extraPtr += 4 + strings[origIndex].Length();
3186  tagBufferSize += strings[origIndex].Length();
3187  }
3188  else
3189  {
3190  value = strings.Length();
3191  strings.Push((const char *) content);
3192  tagBufferSize += 4 + strings.Last().Length();
3193  extraPtr += 4 + strings.Last().Length();
3194  }
3195  break;
3196  case 'B' :
3197  if(duplicate != NULL)
3198  {
3199  *origTag += 'B';
3200  *origTag += ':';
3201  *origTag += (char*)(strings[origIndex]);
3202  tagBufferSize -=
3203  getBtagBufferSize(strings[origIndex]);
3204  int bufferSize =
3205  getStringFromBtagBuffer((unsigned char*)content,
3206  strings[origIndex]);
3207  *duplicate += (char *)(strings[origIndex]);
3208  tagBufferSize += bufferSize;
3209  extraPtr += 3 + bufferSize;
3210  }
3211  else
3212  {
3213  value = strings.Length();
3214  String tempBTag;
3215  int bufferSize =
3216  getStringFromBtagBuffer((unsigned char*)content,
3217  tempBTag);
3218  strings.Push(tempBTag);
3219  tagBufferSize += 3 + bufferSize;
3220  extraPtr += 3 + bufferSize;
3221  }
3222  break;
3223  case 'f' :
3224  if(duplicate != NULL)
3225  {
3226  duplicate->appendFullFloat(* (float *) content);
3227  *origTag += 'f';
3228  *origTag += ':';
3229  origTag->appendFullFloat(floats[origIndex]);
3230  floats[origIndex] = *(float *)content;
3231  }
3232  else
3233  {
3234  value = floats.size();
3235  floats.push_back(* (float *) content);
3236  tagBufferSize += 7;
3237  }
3238  extraPtr += 7;
3239  break;
3240  default :
3241  fprintf(stderr,
3242  "parsing BAM - Unknown custom field of type %c%c:%c\n",
3243  extraPtr[0], extraPtr[1], extraPtr[2]);
3244  fprintf(stderr, "BAM Tags: \n");
3245 
3246  unsigned char* tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3247 
3248  fprintf(stderr, "\n\n");
3249  tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3250  while(myRecordPtr->myBlockSize + 4 -
3251  (tagInfo - (unsigned char *)myRecordPtr) > 0)
3252  {
3253  fprintf(stderr, "%02x",tagInfo[0]);
3254  ++tagInfo;
3255  }
3256  fprintf(stderr, "\n");
3257 
3258  // Failed on read.
3259  // Increment extraPtr just by the size of the 3 known fields
3260  extraPtr += 3;
3262  "Unknown tag type.");
3263  status = false;
3264  }
3265 
3266  if(duplicate != NULL)
3267  {
3268  // Duplicate tag in this record.
3269  // Tag already existed, print message about overwriting.
3270  // WARN about dropping duplicate tags.
3271  if(myNumWarns++ < myMaxWarns)
3272  {
3273  fprintf(stderr, "WARNING: Duplicate Tags, overwritting %s with %s\n",
3274  origTag->c_str(), duplicate->c_str());
3275  if(myNumWarns == myMaxWarns)
3276  {
3277  fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n");
3278  }
3279  }
3280 
3281  continue;
3282  }
3283 
3284  // Only add the tag if it has so far been successfully processed.
3285  if(status)
3286  {
3287  // Add the tag.
3288  extras.Add(key, value);
3289  myTagBufferSize += tagBufferSize;
3290  }
3291  }
3292  return(status);
3293 }
3294 
3295 
3296 bool SamRecord::setTagsInBuffer()
3297 {
3298  // The buffer size extends from the start of the record to data
3299  // plus the length of the variable fields,
3300  // Multiply the cigar length by 4 since it is the number of uint32_t fields.
3301  int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
3302  int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) -
3303  (unsigned char*)myRecordPtr) +
3304  myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
3305  myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
3306 
3307  // Make sure the buffer is big enough.
3308  if(!allocateRecordStructure(newBufferSize))
3309  {
3310  // Failed to allocate space.
3311  return(false);
3312  }
3313 
3314  char * extraPtr = (char*)myPackedQuality + myRecordPtr->myReadLength;
3315 
3316  bool status = true;
3317 
3318  // Set the tags in the buffer.
3319  if (extras.Entries())
3320  {
3321  for (int i = 0; i < extras.Capacity(); i++)
3322  {
3323  if (extras.SlotInUse(i))
3324  {
3325  int key = extras.GetKey(i);
3326  getTag(key, extraPtr);
3327  extraPtr += 2;
3328  char vtype;
3329  getTypeFromKey(key, vtype);
3330 
3331  if(vtype == 'i')
3332  {
3333  vtype = getIntegerType(i);
3334  }
3335 
3336  extraPtr[0] = vtype;
3337 
3338  // increment the pointer to where the value is.
3339  extraPtr += 1;
3340 
3341  switch (vtype)
3342  {
3343  case 'A' :
3344  *(char*)extraPtr = (char)getInteger(i);
3345  // sprintf(extraPtr, "%d", getInteger(i));
3346  extraPtr += 1;
3347  break;
3348  case 'c' :
3349  *(int8_t*)extraPtr = (int8_t)getInteger(i);
3350  // sprintf(extraPtr, "%.4d", getInteger(i));
3351  extraPtr += 1;
3352  break;
3353  case 'C' :
3354  *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
3355  // sprintf(extraPtr, "%.4d", getInteger(i));
3356  extraPtr += 1;
3357  break;
3358  case 's' :
3359  *(int16_t*)extraPtr = (int16_t)getInteger(i);
3360  // sprintf(extraPtr, "%.4d", getInteger(i));
3361  extraPtr += 2;
3362  break;
3363  case 'S' :
3364  *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
3365  // sprintf(extraPtr, "%.4d", getInteger(i));
3366  extraPtr += 2;
3367  break;
3368  case 'i' :
3369  *(int32_t*)extraPtr = (int32_t)getInteger(i);
3370  // sprintf(extraPtr, "%.4d", getInteger(i));
3371  extraPtr += 4;
3372  break;
3373  case 'I' :
3374  *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
3375  // sprintf(extraPtr, "%.4d", getInteger(i));
3376  extraPtr += 4;
3377  break;
3378  case 'Z' :
3379  sprintf(extraPtr, "%s", getString(i).c_str());
3380  extraPtr += getString(i).Length() + 1;
3381  break;
3382  case 'B' :
3383  extraPtr += setBtagBuffer(getString(i), extraPtr);
3384  //--TODO-- Set buffer with correct B tag
3385  //sprintf(extraPtr, "%s", getString(i).c_str());
3386  // extraPtr += getBtagBufferSize(getString(i));
3387  break;
3388  case 'f' :
3389  *(float*)extraPtr = getFloat(i);
3390  extraPtr += 4;
3391  break;
3392  default :
3394  "Unknown tag type.");
3395  status = false;
3396  break;
3397  }
3398  }
3399  }
3400  }
3401 
3402  // Validate that the extra pointer is at the end of the allocated buffer.
3403  // If not then there was a problem.
3404  if(extraPtr != (char*)myRecordPtr + newBufferSize)
3405  {
3406  fprintf(stderr, "ERROR updating the buffer. Incorrect size.");
3408  "ERROR updating the buffer. Incorrect size.");
3409  status = false;
3410  }
3411 
3412 
3413  // The buffer tags are now in sync.
3414  myNeedToSetTagsInBuffer = false;
3415  myIsTagsBufferValid = true;
3416  return(status);
3417 }
3418 
3419 
3420 // Reset the variables for a newly set buffer. The buffer must be set first
3421 // since this looks up the reference ids in the buffer to set the reference
3422 // names.
3423 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header)
3424 {
3425  // Lookup the reference name & mate reference name associated with this
3426  // record.
3427  myReferenceName =
3428  header.getReferenceLabel(myRecordPtr->myReferenceID);
3429  myMateReferenceName =
3430  header.getReferenceLabel(myRecordPtr->myMateReferenceID);
3431 
3432  // Clear the SAM Strings that are now not in-sync with the buffer.
3433  myReadName.SetLength(0);
3434  myCigar.SetLength(0);
3435  mySequence.SetLength(0);
3436  mySeqWithEq.clear();
3437  mySeqWithoutEq.clear();
3438  myQuality.SetLength(0);
3439  myNeedToSetTagsFromBuffer = true;
3440  myNeedToSetTagsInBuffer = false;
3441 
3442  //Set that the buffer is valid.
3443  myIsBufferSynced = true;
3444  // Set that the variable length buffer fields are valid.
3445  myIsReadNameBufferValid = true;
3446  myIsCigarBufferValid = true;
3447  myPackedSequence = (unsigned char *)myRecordPtr->myData +
3448  myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int);
3449  myIsSequenceBufferValid = true;
3450  myBufferSequenceTranslation = NONE;
3451  myPackedQuality = myPackedSequence +
3452  (myRecordPtr->myReadLength + 1) / 2;
3453  myIsQualityBufferValid = true;
3454  myIsTagsBufferValid = true;
3455  myIsBinValid = true;
3456 }
3457 
3458 
3459 // Extract the vtype from the key.
3460 void SamRecord::getTypeFromKey(int key, char& type) const
3461 {
3462  // Extract the type from the key.
3463  type = (key >> 16) & 0xFF;
3464 }
3465 
3466 
3467 // Extract the tag from the key.
3468 void SamRecord::getTag(int key, char* tag) const
3469 {
3470  // Extract the tag from the key.
3471  tag[0] = key & 0xFF;
3472  tag[1] = (key >> 8) & 0xFF;
3473  tag[2] = 0;
3474 }
3475 
3476 
3477 // Index is the index into the strings array.
3478 String & SamRecord::getString(int index)
3479 {
3480  int value = extras[index];
3481 
3482  return strings[value];
3483 }
3484 
3485 int & SamRecord::getInteger(int offset)
3486 {
3487  int value = extras[offset];
3488 
3489  return integers[value];
3490 }
3491 
3492 const char & SamRecord::getIntegerType(int offset) const
3493 {
3494  int value = extras[offset];
3495 
3496  return intType[value];
3497 }
3498 
3499 float & SamRecord::getFloat(int offset)
3500 {
3501  int value = extras[offset];
3502 
3503  return floats[value];
3504  }
3505 
3506 
3507 void SamRecord::appendIntArrayValue(char type, int value, String& strVal) const
3508 {
3509  switch(type)
3510  {
3511  case 'A':
3512  strVal += (char)value;
3513  break;
3514  case 'c':
3515  case 's':
3516  case 'i':
3517  case 'C':
3518  case 'S':
3519  case 'I':
3520  strVal += value;
3521  break;
3522  default:
3523  // Do nothing.
3524  ;
3525  }
3526 }
3527 
3528 
3529 int SamRecord::getBtagBufferSize(String& tagStr)
3530 {
3531  if(tagStr.Length() < 1)
3532  {
3533  // ERROR, needs at least the type.
3534  myStatus.addError(SamStatus::FAIL_PARSE,
3535  "SamRecord::getBtagBufferSize no tag subtype specified");
3536  return(0);
3537  }
3538  char type = tagStr[0];
3539  int elementSize = getNumericTagTypeSize(type);
3540  if(elementSize <= 0)
3541  {
3542  // ERROR, 'B' tag subtype must be numeric, so should be non-zero
3543  String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, ";
3544  errorMsg += type;
3545  myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str());
3546  return(0);
3547  }
3548 
3549  // Separated by ',', so count the number of commas.
3550  int numElements = 0;
3551  int index = tagStr.FastFindChar(',', 0);
3552  while(index > 0)
3553  {
3554  ++numElements;
3555  index = tagStr.FastFindChar(',', index+1);
3556  }
3557  // Add 5 bytes: type & numElements
3558  return(numElements * elementSize + 5);
3559 }
3560 
3561 
3562 int SamRecord::setBtagBuffer(String& tagStr, char* extraPtr)
3563 {
3564  if(tagStr.Length() < 1)
3565  {
3566  // ERROR, needs at least the type.
3567  myStatus.addError(SamStatus::FAIL_PARSE,
3568  "SamRecord::getBtagBufferSize no tag subtype specified");
3569  return(0);
3570  }
3571  char type = tagStr[0];
3572  int elementSize = getNumericTagTypeSize(type);
3573  if(elementSize <= 0)
3574  {
3575  // ERROR, 'B' tag subtype must be numeric, so should be non-zero
3576  String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, ";
3577  errorMsg += type;
3578  myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str());
3579  return(0);
3580  }
3581 
3582  int totalInc = 0;
3583  // Write the type.
3584  *(char*)extraPtr = type;
3585  ++extraPtr;
3586  ++totalInc;
3587 
3588  // Get the number of elements by counting ','s
3589  uint32_t numElements = 0;
3590  int index = tagStr.FastFindChar(',', 0);
3591  while(index > 0)
3592  {
3593  ++numElements;
3594  index = tagStr.FastFindChar(',', index+1);
3595  }
3596  *(uint32_t*)extraPtr = numElements;
3597  extraPtr += 4;
3598  totalInc += 4;
3599 
3600  const char* stringPtr = tagStr.c_str();
3601  const char* endPtr = stringPtr + tagStr.Length();
3602  // increment past the subtype and ','.
3603  stringPtr += 2;
3604 
3605  char* newPtr = NULL;
3606  while(stringPtr < endPtr)
3607  {
3608  switch(type)
3609  {
3610  case 'f':
3611  *(float*)extraPtr = (float)(strtod(stringPtr, &newPtr));
3612  break;
3613  case 'c':
3614  *(int8_t*)extraPtr = (int8_t)strtol(stringPtr, &newPtr, 0);
3615  break;
3616  case 's':
3617  *(int16_t*)extraPtr = (int16_t)strtol(stringPtr, &newPtr, 0);
3618  break;
3619  case 'i':
3620  *(int32_t*)extraPtr = (int32_t)strtol(stringPtr, &newPtr, 0);
3621  break;
3622  case 'C':
3623  *(uint8_t*)extraPtr = (uint8_t)strtoul(stringPtr, &newPtr, 0);
3624  break;
3625  case 'S':
3626  *(uint16_t*)extraPtr = (uint16_t)strtoul(stringPtr, &newPtr, 0);
3627  break;
3628  case 'I':
3629  *(uint32_t*)extraPtr = (uint32_t)strtoul(stringPtr, &newPtr, 0);
3630  break;
3631  default :
3633  "Unknown 'B' tag subtype.");
3634  break;
3635  }
3636  extraPtr += elementSize;
3637  totalInc += elementSize;
3638  stringPtr = newPtr + 1; // skip the ','
3639  }
3640  return(totalInc);
3641 }
3642 
3643 
3644 int SamRecord::getStringFromBtagBuffer(unsigned char* buffer,
3645  String& tagStr)
3646 {
3647  tagStr.Clear();
3648 
3649  int bufferSize = 0;
3650 
3651  // 1st byte is the type.
3652  char type = *buffer;
3653  ++buffer;
3654  ++bufferSize;
3655  tagStr = type;
3656 
3657  // 2nd-5th bytes are the size
3658  unsigned int numEntries = *(unsigned int *)buffer;
3659  buffer += 4;
3660  bufferSize += 4;
3661  // Num Entries is not included in the string.
3662 
3663  int subtypeSize = getNumericTagTypeSize(type);
3664 
3665  for(unsigned int i = 0; i < numEntries; i++)
3666  {
3667  tagStr += ',';
3668  switch(type)
3669  {
3670  case 'f':
3671  tagStr.appendFullFloat(*(float *)buffer);
3672  break;
3673  case 'c':
3674  tagStr += *(int8_t *)buffer;
3675  break;
3676  case 's':
3677  tagStr += *(int16_t *)buffer;
3678  break;
3679  case 'i':
3680  tagStr += *(int32_t *)buffer;
3681  break;
3682  case 'C':
3683  tagStr += *(uint8_t *)buffer;
3684  break;
3685  case 'S':
3686  tagStr += *(uint16_t *)buffer;
3687  break;
3688  case 'I':
3689  tagStr += *(uint32_t *)buffer;
3690  break;
3691  default :
3693  "Unknown 'B' tag subtype.");
3694  break;
3695  }
3696  buffer += subtypeSize;
3697  bufferSize += subtypeSize;
3698  }
3699  return(bufferSize);
3700 }
static bool isMatchOrMismatch(Operation op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:298
static const char UNKNOWN_QUALITY_CHAR
Character used when the quality is unknown.
Definition: BaseUtilities.h:49
void Set(const char *cigarString)
Sets this object to the specified cigarString.
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with &#39;=&#39; in any position where the sequence matches the reference.
bool shiftIndelsLeft()
Shift the indels (if any) to the left by updating the CIGAR.
Definition: SamRecord.cpp:368
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1328
SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader &header)
Read the BAM record from a file.
Definition: SamRecord.cpp:558
int32_t get0BasedUnclippedStart()
Returns the 0-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1494
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1455
void resetRecord()
Reset the fields of the record to a default value.
Definition: SamRecord.cpp:91
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
Definition: SamRecord.cpp:1950
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition: SamRecord.h:57
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition: Cigar.cpp:166
uint16_t getCigarLength()
Get the length of the BAM formatted CIGAR.
Definition: SamRecord.cpp:1350
This class is used to track the status results of some methods in the BAM classes.
Definition: StatGenStatus.h:26
int32_t get0BasedMatePosition()
Get the 0-based(BAM) leftmost mate/next fragment&#39;s position.
Definition: SamRecord.cpp:1440
Translate &#39;=&#39; to the actual base.
Definition: SamRecord.h:60
bool getFloatTag(const char *tag, float &tagVal)
Get the float value for the specified tag.
Definition: SamRecord.cpp:2269
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
int32_t get1BasedAlignmentEnd()
Returns the 1-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1474
bool getFields(bamRecordStruct &recStruct, String &readName, String &cigar, String &sequence, String &quality)
Returns the values of all fields except the tags.
Definition: SamRecord.cpp:1854
int32_t get0BasedUnclippedEnd()
Returns the 0-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1514
int & getInteger(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool...
Definition: SamRecord.cpp:2338
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2168
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
static bool isStringType(char vtype)
Returns whether or not the specified vtype is a string type.
Definition: SamRecord.cpp:2060
bool checkTag(const char *tag, char type)
Check if the specified tag contains a value of the specified vtype.
Definition: SamRecord.cpp:2369
Status getStatus() const
Return the enum for this status object.
bool addIntTag(const char *tag, int32_t value)
Add the specified integer tag to the record.
Definition: SamRecord.cpp:635
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
Definition: SamRecord.cpp:1841
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1530
int32_t get1BasedUnclippedEnd()
Returns the 1-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1523
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1293
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment&#39;s leftmost position (PNEXT) using the specified 1-based (SAM format) value...
Definition: SamRecord.cpp:322
method completed successfully.
Definition: StatGenStatus.h:32
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
const String & getReferenceLabel(int id) const
Return the reference name (chromosome) for the specified reference id.
uint16_t getBin()
Get the BAM bin for the record.
Definition: SamRecord.cpp:1335
bool set0BasedMatePosition(int32_t matePosition)
Set the mate/next fragment&#39;s leftmost position using the specified 0-based (BAM format) value...
Definition: SamRecord.cpp:328
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition: Cigar.cpp:144
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment&#39;s reference sequence name (RNEXT), returning "=" if it is the same as the ...
Definition: SamRecord.cpp:1408
int32_t getAlignmentLength()
Returns the length of the clipped sequence, returning 0 if the cigar is &#39;*&#39;.
Definition: SamRecord.cpp:1481
void clearTags()
Clear the tags in this record.
Definition: SamRecord.cpp:965
static bool isCharType(char vtype)
Returns whether or not the specified vtype is a char type.
Definition: SamRecord.cpp:2050
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition: Cigar.cpp:334
int32_t get1BasedUnclippedStart()
Returns the 1-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1507
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1556
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
Definition: SamRecord.cpp:236
invalid other than for sorting.
Definition: StatGenStatus.h:44
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1379
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
Definition: SamRecord.cpp:1917
int * getIntegerTag(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure...
Definition: SamRecord.cpp:2204
~SamRecord()
Destructor.
Definition: SamRecord.cpp:72
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
Definition: SamRecord.cpp:187
static bool isFloatType(char vtype)
Returns whether or not the specified vtype is a float type.
Definition: SamRecord.cpp:2040
void clear()
Clear this object so that it has no Cigar Operations.
Structure of a BAM record.
Definition: SamRecord.h:33
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
Definition: SamRecord.cpp:178
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object...
Definition: Cigar.cpp:52
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1314
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1300
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
Definition: SamRecord.cpp:1192
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
Definition: SamRecord.cpp:2391
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1626
Class for easily reading/writing files without having to worry about file type (uncompressed, gzip, bgzf) when reading.
Definition: InputFile.h:36
bool rmTags(const char *tags)
Remove tags.
Definition: SamRecord.cpp:1071
Leave the sequence as is.
Definition: SamRecord.h:58
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
HandlingType
This specifies how this class should respond to errors.
Definition: ErrorHandler.h:29
bool getTagsString(const char *tags, String &returnString, char delim='\t')
Get the string representation of the tags from the record, formatted as TAG:TYPE:VALUE<delim>TAG:TYPE...
Definition: SamRecord.cpp:2070
void resetTagIter()
Reset the tag iterator to the beginning of the tags.
Definition: SamRecord.cpp:2022
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment&#39;s reference sequence name (RNEXT) to the specified name, using the header to determine the mate reference id.
Definition: SamRecord.cpp:297
fail a memory allocation.
Definition: StatGenStatus.h:45
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
bool isValid(SamFileHeader &header)
Returns whether or not the record is valid, setting the status to indicate success or failure...
Definition: SamRecord.cpp:161
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
Definition: SamRecord.cpp:223
static bool isIntegerType(char vtype)
Returns whether or not the specified vtype is an integer type.
Definition: SamRecord.cpp:2028
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value, specify a negative value to decrement.
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
Definition: StatGenStatus.h:41
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:34
const String & getString(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2302
int32_t getBlockSize()
Get the block size of the record (BAM format).
Definition: SamRecord.cpp:1269
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
SamStatus::Status setBuffer(const char *fromBuffer, uint32_t fromBufferSize, SamFileHeader &header)
Sets the SamRecord to contain the information in the BAM formatted fromBuffer.
Definition: SamRecord.cpp:525
SamRecord()
Default Constructor.
Definition: SamRecord.cpp:34
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations...
The SamValidationErrors class is a container class that holds SamValidationError Objects, allowing a validation method to return all of the invalid errors rather than just one.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:779
int32_t getMateReferenceID()
Get the mate reference id of the record (BAM format: mate_rid/next_refID).
Definition: SamRecord.cpp:1426
const char * getMateReferenceName()
Get the mate/next fragment&#39;s reference sequence name (RNEXT).
Definition: SamRecord.cpp:1398
SamStatus::Status writeRecordBuffer(IFILE filePtr)
Write the record as a BAM into the specified already opened file.
Definition: SamRecord.cpp:1225
Translate bases that match the reference to &#39;=&#39;.
Definition: SamRecord.h:59
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1911
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment&#39;s position (PNEXT).
Definition: SamRecord.cpp:1433
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN)...
Definition: SamRecord.cpp:1447
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting &#39;=&#39; to the appropriate base using the reference.
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:242
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified reference name (chromosome).
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:31
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...
bool Remove(int index)
Remove the operation at the specified index.
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1543
bool rmTag(const char *tag, char type)
Remove a tag.
Definition: SamRecord.cpp:980
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
Definition: SamRecord.cpp:336