SeqAn3
The Modern C++ library for sequence analysis.
bgzf_stream_util.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <cstring>
17 #include <memory>
18 #include <thread>
19 
20 #if SEQAN3_HAS_ZLIB
21 // Zlib headers
22 #include <zlib.h>
23 #else
24 #error "This file cannot be used when building without GZip-support."
25 #endif // SEQAN3_HAS_ZLIB
26 
30 #include <seqan3/io/exception.hpp>
31 #include <seqan3/std/algorithm>
32 
33 namespace seqan3::contrib
34 {
35 
39 inline static uint64_t bgzf_thread_count = std::thread::hardware_concurrency();
40 
41 // ============================================================================
42 // Forwards
43 // ============================================================================
44 
45 // ============================================================================
46 // Classes
47 // ============================================================================
48 
49 // Special end-of-file marker defined by the BGZF compression format.
50 // See: https://samtools.github.io/hts-specs/SAMv1.pdf
51 static constexpr std::array<int8_t, 28> BGZF_END_OF_FILE_MARKER {{'\x1f', '\x8b', '\x08', '\x04',
52  '\x00', '\x00', '\x00', '\x00',
53  '\x00', '\xff', '\x06', '\x00',
54  '\x42', '\x43', '\x02', '\x00',
55  '\x1b', '\x00', '\x03', '\x00',
56  '\x00', '\x00', '\x00', '\x00',
57  '\x00', '\x00', '\x00', '\x00'}};
58 
59 template <typename TAlgTag>
60 struct CompressionContext {};
61 
62 template <typename TAlgTag>
63 struct DefaultPageSize;
64 
65 template <>
66 struct CompressionContext<detail::gz_compression>
67 {
68  z_stream strm;
69 
70  CompressionContext()
71  {
72  std::memset(&strm, 0, sizeof(z_stream));
73  }
74 };
75 
76 template <>
77 struct CompressionContext<detail::bgzf_compression>:
78  CompressionContext<detail::gz_compression>
79 {
80  static constexpr size_t BLOCK_HEADER_LENGTH = detail::magic_header<detail::bgzf_compression>.size();
81  unsigned char headerPos;
82 };
83 
84 template <>
85 struct DefaultPageSize<detail::bgzf_compression>
86 {
87  static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
88  static const unsigned BLOCK_FOOTER_LENGTH = 8;
89  // 5 bytes block overhead (see 3.2.4. at http://www.gzip.org/zlib/rfc-deflate.html)
90  static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
91 
92  // Reduce the maximal input size, such that the compressed data
93  // always fits in one block even for level Z_NO_COMPRESSION.
94  enum { BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH };
95  static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
96 };
97 
98 // ============================================================================
99 // Functions
100 // ============================================================================
101 
102 // ----------------------------------------------------------------------------
103 // Function compressInit()
104 // ----------------------------------------------------------------------------
105 
106 inline void
107 compressInit(CompressionContext<detail::gz_compression> & ctx)
108 {
109  const int GZIP_WINDOW_BITS = -15; // no zlib header
110  const int Z_DEFAULT_MEM_LEVEL = 8;
111 
112  ctx.strm.zalloc = NULL;
113  ctx.strm.zfree = NULL;
114 
115  // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
116  // to be 2x faster and produces only 7% bigger output
117 // int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
118 // GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
119  int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
120  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
121  if (status != Z_OK)
122  throw io_error("Calling deflateInit2() failed for gz file.");
123 }
124 
125 // ----------------------------------------------------------------------------
126 // Function compressInit()
127 // ----------------------------------------------------------------------------
128 
129 inline void
130 compressInit(CompressionContext<detail::bgzf_compression> & ctx)
131 {
132  compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
133  ctx.headerPos = 0;
134 }
135 
136 // ----------------------------------------------------------------------------
137 // Helper Function _bgzfUnpackXX()
138 // ----------------------------------------------------------------------------
139 
140 inline uint16_t
141 _bgzfUnpack16(char const * buffer)
142 {
143  uint16_t tmp;
144  std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
145  return detail::to_little_endian(tmp);
146 }
147 
148 inline uint32_t
149 _bgzfUnpack32(char const * buffer)
150 {
151  uint32_t tmp;
152  std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
153  return detail::to_little_endian(tmp);
154 }
155 
156 // ----------------------------------------------------------------------------
157 // Helper Function _bgzfPackXX()
158 // ----------------------------------------------------------------------------
159 
160 inline void
161 _bgzfPack16(char * buffer, uint16_t value)
162 {
163  value = detail::to_little_endian(value);
164  std::uninitialized_copy(reinterpret_cast<char *>(&value),
165  reinterpret_cast<char *>(&value) + sizeof(uint16_t),
166  buffer);
167 }
168 
169 inline void
170 _bgzfPack32(char * buffer, uint32_t value)
171 {
172  value = detail::to_little_endian(value);
173  std::uninitialized_copy(reinterpret_cast<char *>(&value),
174  reinterpret_cast<char *>(&value) + sizeof(uint32_t),
175  buffer);
176 }
177 
178 // ----------------------------------------------------------------------------
179 // Function _compressBlock()
180 // ----------------------------------------------------------------------------
181 
182 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
183 inline TDestCapacity
184 _compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
185  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
186 {
187  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
188  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
189 
190  assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
191  assert(sizeof(TDestValue) == 1u);
192  assert(sizeof(unsigned) == 4u);
193 
194  // 1. COPY HEADER
195  std::ranges::copy(detail::magic_header<detail::bgzf_compression>, dstBegin);
196 
197  // 2. COMPRESS
198  compressInit(ctx);
199  ctx.strm.next_in = (Bytef *)(srcBegin);
200  ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
201  ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
202  ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
203 
204  int status = deflate(&ctx.strm, Z_FINISH);
205  if (status != Z_STREAM_END)
206  {
207  deflateEnd(&ctx.strm);
208  throw io_error("Deflation failed. Compressed BGZF data is too big.");
209  }
210 
211  status = deflateEnd(&ctx.strm);
212  if (status != Z_OK)
213  throw io_error("BGZF deflateEnd() failed.");
214 
215 
216  // 3. APPEND FOOTER
217 
218  // Set compressed length into buffer, compute CRC and write CRC into buffer.
219 
220  size_t len = dstCapacity - ctx.strm.avail_out;
221  _bgzfPack16(dstBegin + 16, len - 1);
222 
223  dstBegin += len - BLOCK_FOOTER_LENGTH;
224  _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
225  _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
226 
227  return dstCapacity - ctx.strm.avail_out;
228 }
229 
230 // ----------------------------------------------------------------------------
231 // Function _bgzfCheckHeader()
232 // ----------------------------------------------------------------------------
233 
234 inline bool
235 _bgzfCheckHeader(char const * header)
236 {
237  const char FLG_FEXTRA = detail::magic_header<detail::bgzf_compression>[3];
238  const char BGZF_ID1 = detail::magic_header<detail::bgzf_compression>[12];
239  const char BGZF_ID2 = detail::magic_header<detail::bgzf_compression>[13];
240  const char BGZF_SLEN = detail::magic_header<detail::bgzf_compression>[14];
241  const char BGZF_XLEN = detail::magic_header<detail::bgzf_compression>[10];
242 
243  return (header[0] == static_cast<char>(detail::magic_header<detail::gz_compression>[0]) &&
244  header[1] == static_cast<char>(detail::magic_header<detail::gz_compression>[1]) &&
245  header[2] == static_cast<char>(detail::magic_header<detail::gz_compression>[2]) &&
246  (header[3] & FLG_FEXTRA) != 0 &&
247  _bgzfUnpack16(header + 10) == BGZF_XLEN &&
248  header[12] == BGZF_ID1 &&
249  header[13] == BGZF_ID2 &&
250  _bgzfUnpack16(header + 14) == BGZF_SLEN);
251 }
252 
253 // ----------------------------------------------------------------------------
254 // Function decompressInit() - GZIP
255 // ----------------------------------------------------------------------------
256 
257 inline void
258 decompressInit(CompressionContext<detail::gz_compression> & ctx)
259 {
260  const int GZIP_WINDOW_BITS = -15; // no zlib header
261 
262  ctx.strm.zalloc = NULL;
263  ctx.strm.zfree = NULL;
264  int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
265  if (status != Z_OK)
266  throw io_error("GZip inflateInit2() failed.");
267 }
268 
269 // ----------------------------------------------------------------------------
270 // Function decompressInit() - BGZF
271 // ----------------------------------------------------------------------------
272 
273 inline void
274 decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
275 {
276  decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
277  ctx.headerPos = 0;
278 }
279 
280 // ----------------------------------------------------------------------------
281 // Function _decompressBlock()
282 // ----------------------------------------------------------------------------
283 
284 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
285 inline TDestCapacity
286 _decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
287  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
288 {
289  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
290  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
291 
292  assert(sizeof(TSourceValue) == 1u);
293  assert(sizeof(unsigned) == 4u);
294 
295  // 1. CHECK HEADER
296 
297  if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
298  throw io_error("BGZF block too short.");
299 
300  if (!_bgzfCheckHeader(srcBegin))
301  throw io_error("Invalid BGZF block header.");
302 
303  size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
304  if (compressedLen != srcLength)
305  throw io_error("BGZF compressed size mismatch.");
306 
307 
308  // 2. DECOMPRESS
309 
310  decompressInit(ctx);
311  ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
312  ctx.strm.next_out = (Bytef *)(dstBegin);
313  ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
314  ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
315 
316  int status = inflate(&ctx.strm, Z_FINISH);
317  if (status != Z_STREAM_END)
318  {
319  inflateEnd(&ctx.strm);
320  throw io_error("Inflation failed. Decompressed BGZF data is too big.");
321  }
322 
323  status = inflateEnd(&ctx.strm);
324  if (status != Z_OK)
325  throw io_error("BGZF inflateEnd() failed.");
326 
327 
328  // 3. CHECK FOOTER
329 
330  // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
331 
332  unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
333 
334  srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
335  if (_bgzfUnpack32(srcBegin) != crc)
336  throw io_error("BGZF wrong checksum.");
337 
338  if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
339  throw io_error("BGZF size mismatch.");
340 
341  return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
342 }
343 
344 } // namespace seqan3::contrib
Provides exceptions used in the I/O module.
T memset(T... args)
T hardware_concurrency(T... args)
T uninitialized_copy(T... args)
Definition: buffer_queue.hpp:32
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
Provides seqan3::detail::magic_header.
Adaptations of algorithms from the Ranges TS.
Provides utility functions for bit twiddling.
Provides various transformation traits used by the range module.