libStatGen Software  1
MiniDeflate.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "MiniDeflate.h"
19 
20 // Convenient constants and macros
21 //
22 #define EMPTY_KEY 123
23 #define uchar unsigned char
24 
25 #ifndef min
26 #define min(a,b) (((a)<(b))?(a):(b))
27 #endif
28 
29 MiniDeflate::MiniDeflate()
30 {
31  buffer = new uchar [BUFFER_SIZE + 5];
32  hash_keys = new uchar [HASH_SIZE];
33  hash_values = new uchar * [HASH_SIZE * HASH_DEPTH];
34 }
35 
36 MiniDeflate::~MiniDeflate()
37 {
38  delete [] buffer;
39  delete [] hash_keys;
40  delete [] hash_values;
41 }
42 
43 void MiniDeflate::EvaluateMatch(unsigned char * in, int len, int hash,
44  unsigned char * & best_pos, int & best_match)
45 {
46  int max = min(len, 0xFFFF + 66);
47 
48  for (int i = HASH_DEPTH; i > 0; i--)
49  // Check each possible match (up to HASH_DEPTH)
50  {
51  uchar * pos = hash_values[hash * HASH_DEPTH + ((hash_keys[hash] + i) % HASH_DEPTH)];
52 
53  if (pos == NULL || in - pos >= 0x4001) break;
54 
55  int match = 0;
56 
57  while (match < max && pos[match] == in[match])
58  match++;
59 
60  if (match > best_match)
61  {
62  best_match = match;
63  best_pos = pos;
64  }
65  }
66 
67  // If string seems pretty unique, add to hash table
68  if (best_match < OKAY_MATCH)
69  {
70  int delta = hash_keys[hash] = (uchar)((hash_keys[hash] + 1) & 7);
71  hash_values[hash * 8 + delta] = in;
72  }
73 }
74 
75 void MiniDeflate::QuoteLiterals(unsigned char * & in, int literal,
76  unsigned char * & out, int & buffer_len,
77  FILE * output)
78 {
79  if (buffer_len < 0)
80  {
81  fwrite(buffer, out - buffer, 1, output);
82  buffer_len = BUFFER_SIZE;
83  out = buffer;
84  }
85 
86  while (buffer_len < literal)
87  {
88  literal -= buffer_len;
89  while (buffer_len--)
90  {
91  *out = *in;
92  in++;
93  out++;
94  }
95  fwrite(buffer, BUFFER_SIZE, 1, output);
96  buffer_len = BUFFER_SIZE;
97  out = buffer;
98  }
99 
100  while (literal--)
101  {
102  *out = *in;
103  in++;
104  out++;
105  buffer_len--;
106  }
107 }
108 
109 void MiniDeflate::OutputLiterals(unsigned char * & in, int literal,
110  unsigned char * & out, int & buffer_len,
111  FILE * output)
112 {
113  while (literal > 0)
114  if (literal < 16)
115  {
116  *out = (char) literal;
117  out++;
118  buffer_len--;
119  QuoteLiterals(in, literal, out, buffer_len, output);
120  break;
121  }
122  else if (literal < 31)
123  {
124  *out = 15;
125  out++;
126  buffer_len--;
127  QuoteLiterals(in, 15, out, buffer_len, output);
128  *out = (uchar)(literal - 15);
129  out++;
130  buffer_len--;
131  QuoteLiterals(in, literal - 15, out, buffer_len, output);
132  break;
133  }
134  else
135  {
136  int length = min(literal, 0xFFFF + 31);
137  literal -= length;
138  length -= 31;
139 
140  *out = 0;
141  out++;
142  *out = (uchar)(length >> 8);
143  out++;
144  *out = (uchar)(length & 0xFF);
145  out++;
146  buffer_len -= 3;
147 
148  QuoteLiterals(in, length + 31, out, buffer_len, output);
149  }
150 }
151 
152 
153 void MiniDeflate::Deflate(FILE * output, void * void_input, size_t len)
154 {
155  uchar * in = (uchar *) void_input;
156  uchar * out = (uchar *) buffer;
157  int buffer_len = BUFFER_SIZE;
158 
159  for (int i = 0; i < HASH_SIZE; i++) hash_keys[i] = EMPTY_KEY;
160 
161  uchar * in2 = in;
162 
163  while (len > 2)
164  {
165  // Hash the current input value
166  int hash = ((in[0] << 16) | (in[1] << 8) | in[2]) % HASH_SIZE;
167 
168  if (hash_keys[hash] != EMPTY_KEY)
169  // Possible matches in hash table
170  {
171  int best_match = 0;
172  uchar * best_pos = NULL;
173 
174  EvaluateMatch(in, len, hash, best_pos, best_match);
175 
176  // If there are no decent matches
177  if (best_match < 3)
178  {
179  in++;
180  len--;
181  continue;
182  }
183 
184  // Try look ahead if match isn't great
185  while (best_match < OKAY_MATCH && len > 3)
186  {
187  // Peek to see if we could get a better match
188  int next_hash = ((in[1] << 16) | (in[2] << 8) | in[3]) % HASH_SIZE;
189 
190  if (hash_keys[next_hash] == EMPTY_KEY) break;
191 
192  int next_match = 0;
193  uchar * next_pos = NULL;
194 
195  EvaluateMatch(in + 1, len - 1, next_hash, next_pos, next_match);
196 
197  // Didn't find a better match
198  if (next_match <= best_match + 1) break;
199 
200  // Found a better match, so try again
201  in++;
202  len--;
203  best_match = next_match;
204  best_pos = next_pos;
205  }
206 
207  int best_offset = in - best_pos - 1;
208 
209  // This is where we output stuff
210  // Check if we have some literals to output first
211  OutputLiterals(in2, in - in2, out, buffer_len, output);
212 
213  in2 = in += best_match;
214  len -= best_match;
215 
216  if (best_match < 17 && best_offset < 0x1000)
217  {
218  *out = (uchar)(((best_match - 1) << 4) | (best_offset >> 8));
219  out++;
220  *out = (uchar)(best_offset & 0xFF);
221  out++;
222  buffer_len -= 2;
223  }
224  else if (best_match < 66)
225  {
226  *out = (uchar)(16 | (best_offset >> 10));
227  out++;
228  *out = (uchar)((best_offset >> 2) & 0xFF);
229  out++;
230  *out = (uchar)((best_offset << 6) | (best_match - 2));
231  out++;
232  buffer_len -= 3;
233  }
234  else
235  {
236  *out = (uchar)(16 | (best_offset >> 10));
237  out++;
238  *out = (uchar)((best_offset >> 2) & 0xFF);
239  out++;
240  *out = (uchar)(best_offset << 6);
241  out++;
242  best_match -= 66;
243  *out = (uchar)(best_match >> 8);
244  out++;
245  *out = (uchar)(best_match & 0xFF);
246  out++;
247  buffer_len -= 5;
248  }
249 
250  if (buffer_len <= 0)
251  {
252  fwrite(buffer, out - buffer, 1, output);
253  buffer_len = BUFFER_SIZE;
254  out = buffer;
255  }
256  }
257  // Never seen this sequence before
258  else
259  {
260  hash_keys[hash] = 0;
261  for (int i = 1; i < HASH_DEPTH; i++) hash_values[hash * 8 + i] = NULL;
262  hash_values[hash * 8] = in;
263  in++;
264  len--;
265  }
266  }
267 
268  // Check if we have some trailing literals to output
269  in += len;
270  OutputLiterals(in2, in - in2, out, buffer_len, output);
271 
272  // Flush output
273  if (out != buffer) fwrite(buffer, out - buffer, 1, output);
274 }
275 
276 void MiniDeflate::CiteLiteral(unsigned char * & out, int literal,
277  unsigned char * & in, int & buffer_len,
278  FILE * input)
279 {
280  while (buffer_len < literal)
281  {
282  literal -= buffer_len;
283  while (buffer_len--)
284  {
285  *out = *in;
286  in++;
287  out++;
288  }
289  buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input);
290  in = buffer + 5;
291  }
292 
293  while (literal--)
294  {
295  *out = *in;
296  in++;
297  out++;
298  buffer_len--;
299  }
300 }
301 
302 
303 void MiniDeflate::Inflate(FILE * input, void * void_output, size_t len)
304 {
305  uchar * out = (uchar *) void_output;
306  uchar * in = (uchar *) buffer + 5;
307  int buffer_len = BUFFER_SIZE;
308 
309  buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input);
310 
311  while (len)
312  {
313  int match_len = *in >> 4;
314 
315  // Matching a literal
316  if (match_len == 0)
317  {
318  match_len = *in & 0x0F;
319  in++, buffer_len--;
320 
321  // If match_len == 0 then string is longer than 30 characters
322  // Strings of 16 - 30 characters are encoded as two short strings
323  if (match_len == 0)
324  {
325  match_len = (in[0] << 8) + in[1] + 31;
326  in += 2;
327  buffer_len -= 2;
328  }
329 
330  CiteLiteral(out, match_len, in, buffer_len, input);
331  len -= match_len;
332  }
333  // Long match, 14 bit offset
334  else if (match_len == 1)
335  {
336  int offset = (((in[0] & 0x0F) << 10) | (in[1] << 2) | (in[2] >> 6)) + 1;
337  match_len = (in[2] & 0x3F) + 2;
338  in += 3;
339  buffer_len -= 3;
340 
341  if (match_len == 2)
342  {
343  match_len = ((in[0] << 8) | in[1]) + 66;
344  in += 2;
345  buffer_len -= 2;
346  }
347 
348  uchar * match_pos = out - offset;
349  len -= match_len;
350  while (match_len--)
351  {
352  *out = *match_pos;
353  out++, match_pos++;
354  }
355  }
356  // Typical short match
357  else
358  {
359  int offset = (((in[0] & 0x0F) << 8) | in[1]) + 1;
360  in += 2;
361  buffer_len -= 2;
362 
363  uchar * match_pos = out - offset;
364  len -= ++match_len;
365  while (match_len--)
366  {
367  *out = *match_pos;
368  out++, match_pos++;
369  }
370  }
371 
372  if (buffer_len < 5)
373  {
374  uchar * in2 = (uchar *) buffer + 5 - buffer_len;
375  while (in2 != buffer + 5)
376  {
377  *in2 = *in;
378  in2++;
379  in++;
380  }
381 
382  in = buffer + 5 - buffer_len;
383  buffer_len += fread(buffer + 5, 1, BUFFER_SIZE, input);
384  }
385  }
386 
387  if (buffer_len) fseek(input, -buffer_len, SEEK_CUR);
388 }
389 
390 
391