libStatGen Software  1
IndexBase.h
1 /*
2  * Copyright (C) 2011-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __INDEX_BASE_H__
19 #define __INDEX_BASE_H__
20 
21 #include <stdint.h>
22 #include <vector>
23 #include <map>
24 #include <stdlib.h>
25 
26 #include "InputFile.h"
27 #include "StatGenStatus.h"
28 
29 
/// A chunk described by a pair of 64-bit offsets: where it begins and
/// where it ends.
class Chunk
{
public:
    uint64_t chunk_beg; ///< Offset of the start of the chunk.
    uint64_t chunk_end; ///< Offset of the end of the chunk.

    /// Largest value a chunk offset can hold (all 64 bits set).
    static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL;

    /// Order chunks by their starting offset only; chunk_end does not
    /// take part in the comparison.
    /// \param otherChunk the chunk to compare against.
    /// \return true if this chunk begins before otherChunk begins.
    bool operator< (const Chunk& otherChunk) const
    {
        return otherChunk.chunk_beg > chunk_beg;
    }
};
43 
44 
45 // This class contains chunks that are sorted by the beginning position.
46 // This class hides how the chunks are actually stored (map, list ,etc),
47 // so they can be interchanged.
49 {
50 public:
51  // Returns the first chunk in the list and removes it.
52  Chunk pop();
53  bool insert(const Chunk& chunkToInsert);
54  void clear();
55  bool empty();
56  bool mergeOverlapping();
57 
58 private:
59  std::map<uint64_t, Chunk> chunkList;
60 };
61 
62 class IndexBase
63 {
64 public:
65 
66  IndexBase();
67  virtual ~IndexBase();
68 
69  /// Reset the member data for a new index file.
70  virtual void resetIndex();
71 
72  // Read & parse the specified index file.
73  /// \param filename the bam index file to be read.
74  /// \return the status of the read.
75  virtual StatGenStatus::Status readIndex(const char* filename) = 0;
76 
77  /// Get the number of references in this index.
78  /// \return number of references
79  int32_t getNumRefs() const;
80 
81  // Returns the minimum offset of records that cross the 16K block that
82  // contains the specified position for the given reference id.
83  bool getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
84  uint64_t& minOffset) const;
85 
86 protected:
87  const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins
88 
89  // Maximum allowed position (inclusive 512MB - 1)
90  // NOTE: CSI index may not have this same max position.
91  const static uint32_t MAX_POSITION = 536870911;
92 
93  // Number of bits in 1 linear index - how much to shift a position by
94  // to determine which offset into the linear index to look for it.
95  const static uint32_t LINEAR_INDEX_SHIFT = 14;
96 
97  class Bin
98  {
99  public:
100  Bin(){chunks = NULL; reset();}
101  ~Bin() {reset();}
102  void reset()
103  {
104  if(chunks != NULL)
105  {
106  free(chunks);
107  chunks = NULL;
108  }
109  n_chunk = 0;
110  bin = NOT_USED_BIN;
111  }
112  uint32_t bin; // The bin id.
113  int32_t n_chunk; // The number of chunks.
114  Chunk* chunks; // The chunks for this bin.
115  static const uint32_t NOT_USED_BIN = 0xFFFFFFFF;
116  };
117 
118  class Reference
119  {
120  // Add one to the max since there may now be an extra bin containing
121  // the mapped/unmapped counts.
122  public:
123  static const int32_t UNKNOWN_MAP_INFO = -1;
124  Reference(){ioffsets = NULL; reset();}
125  ~Reference(){reset();}
126  void reset()
127  {
128  bins.clear();
129  if(ioffsets != NULL)
130  {
131  free(ioffsets);
132  ioffsets = NULL;
133  }
134  n_bin = 0;
135  n_intv = 0;
136  minChunkOffset = UNSET_MIN_CHUNK_OFFSET;
137  maxChunkOffset = 0;
138  n_mapped = UNKNOWN_MAP_INFO;
139  n_unmapped = UNKNOWN_MAP_INFO;
140  }
141  int32_t n_bin; // The number of bins.
142  int32_t n_intv; // Number of intervals.
143  std::vector<Bin> bins; // The bins for this reference.
144  uint64_t* ioffsets; // Offsets of intervals first alignments
145  uint64_t minChunkOffset;
146  uint64_t maxChunkOffset;
147  int32_t n_mapped; // Number of mapped reads.
148  int32_t n_unmapped; // Number of unmapped reads.
149 
150  static const uint64_t UNSET_MIN_CHUNK_OFFSET = 0xFFFFFFFFFFFFFFFFULL;
151  };
152 
153  // Set bins in the region to 1 and all other bins to 0.
154  // start is incluive, end is exclusive.
155  static void getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1]);
156 
157  // Number of reference sequences.
158  int32_t n_ref;
159 
160  // The references.
161  std::vector<Reference> myRefs;
162 };
163 
164 
165 #endif
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:31