lz4_compression/
decompress.rs

1//! The decompression algorithm.
2
3use byteorder::{LittleEndian, ByteOrder};
4
5#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
6pub enum Error {
7    /// Expected more bytes, but found none
8    UnexpectedEnd,
9
10    /// The offset for a deduplication is out of bounds
11    InvalidDeduplicationOffset,
12}
13
14/// An LZ4 decoder.
15///
16/// This will decode in accordance to the LZ4 format. It represents a particular state of the
17/// decompressor.
18pub struct Decoder<'a> {
19
20    /// The compressed input.
21    input: &'a [u8],
22
23    /// The decompressed output.
24    output: &'a mut Vec<u8>,
25
26    /// The current block's "token".
27    ///
28    /// This token contains to 4-bit "fields", a higher and a lower, representing the literals'
29    /// length and the back reference's length, respectively. LSIC is used if either are their
30    /// maximal values.
31    token: u8,
32}
33
34impl<'a> Decoder<'a> {
35    /// Internal (partial) function for `take`.
36    #[inline]
37    fn take_imp(input: &mut &'a [u8], n: usize) -> Result<&'a [u8], Error> {
38        // Check if we have enough bytes left.
39        if input.len() < n {
40            // No extra bytes. This is clearly not expected, so we return an error.
41            Err(Error::UnexpectedEnd)
42        }
43        else {
44            // Take the first n bytes.
45            let res = Ok(&input[..n]);
46
47            // Shift the stream to left, so that it is no longer the first byte.
48            *input = &input[n..];
49
50            // Return the former first byte.
51            res
52        }
53    }
54
55    /// Pop n bytes from the start of the input stream.
56    fn take(&mut self, n: usize) -> Result<&[u8], Error> {
57        Self::take_imp(&mut self.input, n)
58    }
59
60    /// Write a buffer to the output stream.
61    ///
62    /// The reason this doesn't take `&mut self` is that we need partial borrowing due to the rules
63    /// of the borrow checker. For this reason, we instead take some number of segregated
64    /// references so we can read and write them independently.
65    fn output(output: &mut Vec<u8>, buf: &[u8]) {
66        // We use simple memcpy to extend the vector.
67        output.extend_from_slice(&buf[..buf.len()]);
68    }
69
70    /// Write an already decompressed match to the output stream.
71    ///
72    /// This is used for the essential part of the algorithm: deduplication. We start at some
73    /// position `start` and then keep pushing the following element until we've added
74    /// `match_length` elements.
75    fn duplicate(&mut self, start: usize, match_length: usize) {
76        // We cannot simply use memcpy or `extend_from_slice`, because these do not allow
77        // self-referential copies: http://ticki.github.io/img/lz4_runs_encoding_diagram.svg
78        for i in start..start + match_length {
79            let b = self.output[i];
80            self.output.push(b);
81        }
82    }
83
84    /// Read an integer LSIC (linear small integer code) encoded.
85    ///
86    /// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In
87    /// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add
88    /// this byte to our sum and terminate the loop.
89    ///
90    /// # Example
91    ///
92    /// ```notest
93    ///     255, 255, 255, 4, 2, 3, 4, 6, 7
94    /// ```
95    ///
96    /// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 is ignored, because
97    /// 4 is the first non-0xFF byte.
98    #[inline]
99    fn read_integer(&mut self) -> Result<usize, Error> {
100        // We start at zero and count upwards.
101        let mut n = 0;
102
103        // If this byte takes value 255 (the maximum value it can take), another byte is read
104        // and added to the sum. This repeats until a byte lower than 255 is read.
105        while {
106            // We add the next byte until we get a byte which we add to the counting variable.
107            let extra = self.take(1)?[0];
108            n += extra as usize;
109
110            // We continue if we got 255.
111            extra == 0xFF
112        } {}
113
114        Ok(n)
115    }
116
117    /// Read a little-endian 16-bit integer from the input stream.
118    #[inline]
119    fn read_u16(&mut self) -> Result<u16, Error> {
120        // We use byteorder to read an u16 in little endian.
121        Ok(LittleEndian::read_u16(self.take(2)?))
122    }
123
124    /// Read the literals section of a block.
125    ///
126    /// The literals section encodes some bytes which are to be copied to the output without any
127    /// modification.
128    ///
129    /// It consists of two parts:
130    ///
131    /// 1. An LSIC integer extension to the literals length as defined by the first part of the
132    ///    token, if it takes the highest value (15).
133    /// 2. The literals themself.
134    fn read_literal_section(&mut self) -> Result<(), Error> {
135        // The higher token is the literals part of the token. It takes a value from 0 to 15.
136        let mut literal = (self.token >> 4) as usize;
137
138        // If the initial value is 15, it is indicated that another byte will be read and added to
139        // it.
140        if literal == 15 {
141            // The literal length took the maximal value, indicating that there is more than 15
142            // literal bytes. We read the extra integer.
143            literal += self.read_integer()?;
144        }
145
146        // Now we know the literal length. The number will be used to indicate how long the
147        // following literal copied to the output buffer is.
148
149        // Read the literals segment and output them without processing.
150        Self::output(&mut self.output, Self::take_imp(&mut self.input, literal)?);
151
152        Ok(())
153    }
154
155    /// Read the duplicates section of the block.
156    ///
157    /// The duplicates section serves to reference an already decoded segment. This consists of two
158    /// parts:
159    ///
160    /// 1. A 16-bit little-endian integer defining the "offset", i.e. how long back we need to go
161    ///    in the decoded buffer and copy.
162    /// 2. An LSIC integer extension to the duplicate length as defined by the first part of the
163    ///    token, if it takes the highest value (15).
164    fn read_duplicate_section(&mut self) -> Result<(), Error> {
165        // Now, we will obtain the offset which we will use to copy from the output. It is an
166        // 16-bit integer.
167        let offset = self.read_u16()?;
168
169        // Obtain the initial match length. The match length is the length of the duplicate segment
170        // which will later be copied from data previously decompressed into the output buffer. The
171        // initial length is derived from the second part of the token (the lower nibble), we read
172        // earlier. Since having a match length of less than 4 would mean negative compression
173        // ratio, we start at 4.
174        let mut match_length = (4 + (self.token & 0xF)) as usize;
175
176        // The intial match length can maximally be 19. As with the literal length, this indicates
177        // that there are more bytes to read.
178        if match_length == 4 + 15 {
179            // The match length took the maximal value, indicating that there is more bytes. We
180            // read the extra integer.
181            match_length += self.read_integer()?;
182        }
183
184        // We now copy from the already decompressed buffer. This allows us for storing duplicates
185        // by simply referencing the other location.
186
187        // Calculate the start of this duplicate segment. We use wrapping subtraction to avoid
188        // overflow checks, which we will catch later.
189        let start = self.output.len().wrapping_sub(offset as usize);
190
191        // We'll do a bound check to avoid panicking.
192        if start < self.output.len() {
193            // Write the duplicate segment to the output buffer.
194            self.duplicate(start, match_length);
195
196            Ok(())
197        }
198        else {
199            Err(Error::InvalidDeduplicationOffset)
200        }
201    }
202
203    /// Complete the decompression by reading all the blocks.
204    ///
205    /// # Decompressing a block
206    ///
207    /// Blocks consists of:
208    ///  - A 1 byte token
209    ///      * A 4 bit integer $t_1$.
210    ///      * A 4 bit integer $t_2$.
211    ///  - A $n$ byte sequence of 0xFF bytes (if $t_1 \neq 15$, then $n = 0$).
212    ///  - $x$ non-0xFF 8-bit integers, L (if $t_1 = 15$, $x = 1$, else $x = 0$).
213    ///  - $t_1 + 15n + L$ bytes of uncompressed data (literals).
214    ///  - 16-bits offset (little endian), $a$.
215    ///  - A $m$ byte sequence of 0xFF bytes (if $t_2 \neq 15$, then $m = 0$).
216    ///  - $y$ non-0xFF 8-bit integers, $c$ (if $t_2 = 15$, $y = 1$, else $y = 0$).
217    ///
218    /// First, the literals are copied directly and unprocessed to the output buffer, then (after
219    /// the involved parameters are read) $t_2 + 15m + c$ bytes are copied from the output buffer
220    /// at position $a + 4$ and appended to the output buffer. Note that this copy can be
221    /// overlapping.
222    #[inline]
223    fn complete(&mut self) -> Result<(), Error> {
224        // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer
225        // is empty.
226        while !self.input.is_empty() {
227            // Read the token. The token is the first byte in a block. It is divided into two 4-bit
228            // subtokens, the higher and the lower.
229            self.token = self.take(1)?[0];
230
231            // Now, we read the literals section.
232            self.read_literal_section()?;
233
234            // If the input stream is emptied, we break out of the loop. This is only the case
235            // in the end of the stream, since the block is intact otherwise.
236            if self.input.is_empty() { break; }
237
238            // Now, we read the duplicates section.
239            self.read_duplicate_section()?;
240        }
241
242        Ok(())
243    }
244}
245
246/// Decompress all bytes of `input` into `output`.
247pub fn decompress_into(input: &[u8], output: &mut Vec<u8>) -> Result<(), Error> {
248    // Decode into our vector.
249    Decoder {
250        input,
251        output,
252        token: 0,
253    }.complete()?;
254
255    Ok(())
256}
257
258/// Decompress all bytes of `input`.
259pub fn decompress(input: &[u8]) -> Result<Vec<u8>, Error> {
260    // Allocate a vector to contain the decompressed stream.
261    let mut vec = Vec::with_capacity(4096);
262
263    decompress_into(input, &mut vec)?;
264
265    Ok(vec)
266}
267
268#[cfg(test)]
269mod test {
270    use super::*;
271
272    #[test]
273    fn aaaaaaaaaaa_lots_of_aaaaaaaaa() {
274        assert_eq!(decompress(&[0x11, b'a', 1, 0]).unwrap(), b"aaaaaa");
275    }
276
277    #[test]
278    fn multiple_repeated_blocks() {
279        assert_eq!(decompress(&[0x11, b'a', 1, 0, 0x22, b'b', b'c', 2, 0]).unwrap(), b"aaaaaabcbcbcbc");
280    }
281
282    #[test]
283    fn all_literal() {
284        assert_eq!(decompress(&[0x30, b'a', b'4', b'9']).unwrap(), b"a49");
285    }
286
287    #[test]
288    fn offset_oob() {
289        decompress(&[0x10, b'a', 2, 0]).unwrap_err();
290        decompress(&[0x40, b'a', 1, 0]).unwrap_err();
291    }
292}
lz4_compression/decompress.rs

lz4_compression/
decompress.rs