libStatGen Software  1
ReferenceSequence.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "assert.h"
19 #include "ctype.h"
20 #include "stdio.h"
21 #include "Error.h"
22 
23 
24 #include "Generic.h"
25 #include "ReferenceSequence.h"
26 
27 #include <algorithm>
28 #include <istream>
29 #include <fstream>
30 #include <sstream>
31 #include <stdexcept>
32 
33 //
34 // Given a buffer with a fasta format contents, count
35 // the number of chromsomes in it and return that value.
36 //
37 bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
38 {
39  chromosomeCount = 0;
40  baseCount = 0;
41  bool atLineStart = true;
42 
43  //
44  // loop over the fasta file, essentially matching for the
45  // pattern '^>.*$' and counting them.
46  //
47  for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
48  {
49  switch (fastaData[fastaIndex])
50  {
51  case '\n':
52  case '\r':
53  atLineStart = true;
54  break;
55  case '>':
56  {
57  if (!atLineStart) break;
58  chromosomeCount++;
59  //
60  // eat the rest of the line
61  //
62  while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
63  {
64  fastaIndex++;
65  }
66  break;
67  }
68  default:
69  baseCount++;
70  atLineStart = false;
71  break;
72  }
73 
74  }
75  return false;
76 }
77 
78 #if 0
79 // turn this into a template on read/quality/etc...
80 int GenomeSequence::debugPrintReadValidation(
81  std::string &read,
82  std::string &quality,
83  char direction,
84  genomeIndex_t readLocation,
85  int sumQuality,
86  int mismatchCount,
87  bool recurse
88 )
89 {
90  int validateSumQ = 0;
91  int validateMismatchCount = 0;
92  int rc = 0;
93  std::string genomeData;
94 
95  for (uint32_t i=0; i<read.size(); i++)
96  {
97  if (tolower(read[i]) != tolower((*this)[readLocation + i]))
98  {
99  validateSumQ += quality[i] - '!';
100  // XXX no longer valid:
101  if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
102  genomeData.push_back(tolower((*this)[readLocation + i]));
103  }
104  else
105  {
106  genomeData.push_back(toupper((*this)[readLocation + i]));
107  }
108  }
109  assert(validateSumQ>=0);
110  if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
111  {
112  printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
113  genomeData.c_str(),
114  read.c_str(),
115  validateSumQ,
116  sumQuality
117  );
118  rc++;
119  }
120  else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
121  {
122  printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
123  genomeData.c_str(),
124  read.c_str(),
125  validateMismatchCount,
126  mismatchCount
127  );
128  rc++;
129  }
130  else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
131  {
132  printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
133  genomeData.c_str(),
134  read.c_str(),
135  validateSumQ,
136  sumQuality,
137  validateMismatchCount,
138  mismatchCount
139  );
140  rc++;
141  }
142 
143  if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2)
144  {
145  printf("large mismatch difference, trying reverse strand: ");
146  std::string reverseRead = read;
147  std::string reverseQuality = quality;
148  getReverseRead(reverseRead);
149  reverse(reverseQuality.begin(), reverseQuality.end());
150  rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
151  }
152  return rc;
153 }
154 #endif
155 
156 
157