001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.CompressorInputStream; 025import org.apache.commons.compress.utils.IOUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p>This implementation uses an internal buffer in order to handle 031 * the back-references that are at the heart of the LZ77 algorithm. 032 * The size of the buffer must be at least as big as the biggest 033 * offset used in the compressed stream. The current version of the 034 * Snappy algorithm as defined by Google works on 32k blocks and 035 * doesn't contain offsets bigger than 32k which is the default block 036 * size used by this class.</p> 037 * 038 * @see <a href="http://code.google.com/p/snappy/source/browse/trunk/format_description.txt">Snappy compressed format description</a> 039 * @since 1.7 040 */ 041public class SnappyCompressorInputStream extends CompressorInputStream { 042 043 /** Mask used to determine the type of "tag" is being processed */ 044 private static final int TAG_MASK = 0x03; 045 046 /** Default block size */ 047 public static final int DEFAULT_BLOCK_SIZE = 32768; 048 049 /** Buffer to write decompressed bytes to for back-references */ 050 private final byte[] decompressBuf; 051 052 /** One behind the index of the last byte in the buffer that was written */ 053 private int writeIndex; 054 055 /** Index of the next byte to be read. */ 056 private int readIndex; 057 058 /** The actual block size specified */ 059 private final int blockSize; 060 061 /** The underlying stream to read compressed data from */ 062 private final InputStream in; 063 064 /** The size of the uncompressed data */ 065 private final int size; 066 067 /** Number of uncompressed bytes still to be read. */ 068 private int uncompressedBytesRemaining; 069 070 // used in no-arg read method 071 private final byte[] oneByte = new byte[1]; 072 073 private boolean endReached = false; 074 075 /** 076 * Constructor using the default buffer size of 32k. 077 * 078 * @param is 079 * An InputStream to read compressed data from 080 * 081 * @throws IOException 082 */ 083 public SnappyCompressorInputStream(final InputStream is) throws IOException { 084 this(is, DEFAULT_BLOCK_SIZE); 085 } 086 087 /** 088 * Constructor using a configurable buffer size. 089 * 090 * @param is 091 * An InputStream to read compressed data from 092 * @param blockSize 093 * The block size used in compression 094 * 095 * @throws IOException 096 */ 097 public SnappyCompressorInputStream(final InputStream is, final int blockSize) 098 throws IOException { 099 this.in = is; 100 this.blockSize = blockSize; 101 this.decompressBuf = new byte[blockSize * 3]; 102 this.writeIndex = readIndex = 0; 103 uncompressedBytesRemaining = size = (int) readSize(); 104 } 105 106 /** {@inheritDoc} */ 107 @Override 108 public int read() throws IOException { 109 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 110 } 111 112 /** {@inheritDoc} */ 113 @Override 114 public void close() throws IOException { 115 in.close(); 116 } 117 118 /** {@inheritDoc} */ 119 @Override 120 public int available() { 121 return writeIndex - readIndex; 122 } 123 124 /** 125 * {@inheritDoc} 126 */ 127 @Override 128 public int read(byte[] b, int off, int len) throws IOException { 129 if (endReached) { 130 return -1; 131 } 132 final int avail = available(); 133 if (len > avail) { 134 fill(len - avail); 135 } 136 137 int readable = Math.min(len, available()); 138 System.arraycopy(decompressBuf, readIndex, b, off, readable); 139 readIndex += readable; 140 if (readIndex > blockSize) { 141 slideBuffer(); 142 } 143 return readable; 144 } 145 146 /** 147 * Try to fill the buffer with enough bytes to satisfy the current 148 * read request. 149 * 150 * @param len the number of uncompressed bytes to read 151 */ 152 private void fill(int len) throws IOException { 153 if (uncompressedBytesRemaining == 0) { 154 endReached = true; 155 } 156 int readNow = Math.min(len, uncompressedBytesRemaining); 157 158 while (readNow > 0) { 159 final int b = readOneByte(); 160 int length = 0; 161 long offset = 0; 162 163 switch (b & TAG_MASK) { 164 165 case 0x00: 166 167 length = readLiteralLength(b); 168 169 if (expandLiteral(length)) { 170 return; 171 } 172 break; 173 174 case 0x01: 175 176 /* 177 * These elements can encode lengths between [4..11] bytes and 178 * offsets between [0..2047] bytes. (len-4) occupies three bits 179 * and is stored in bits [2..4] of the tag byte. The offset 180 * occupies 11 bits, of which the upper three are stored in the 181 * upper three bits ([5..7]) of the tag byte, and the lower 182 * eight are stored in a byte following the tag byte. 183 */ 184 185 length = 4 + ((b >> 2) & 0x07); 186 offset = (b & 0xE0) << 3; 187 offset |= readOneByte(); 188 189 if (expandCopy(offset, length)) { 190 return; 191 } 192 break; 193 194 case 0x02: 195 196 /* 197 * These elements can encode lengths between [1..64] and offsets 198 * from [0..65535]. (len-1) occupies six bits and is stored in 199 * the upper six bits ([2..7]) of the tag byte. The offset is 200 * stored as a little-endian 16-bit integer in the two bytes 201 * following the tag byte. 202 */ 203 204 length = (b >> 2) + 1; 205 206 offset = readOneByte(); 207 offset |= readOneByte() << 8; 208 209 if (expandCopy(offset, length)) { 210 return; 211 } 212 break; 213 214 case 0x03: 215 216 /* 217 * These are like the copies with 2-byte offsets (see previous 218 * subsection), except that the offset is stored as a 32-bit 219 * integer instead of a 16-bit integer (and thus will occupy 220 * four bytes). 221 */ 222 223 length = (b >> 2) + 1; 224 225 offset = readOneByte(); 226 offset |= readOneByte() << 8; 227 offset |= readOneByte() << 16; 228 offset |= ((long) readOneByte()) << 24; 229 230 if (expandCopy(offset, length)) { 231 return; 232 } 233 break; 234 } 235 236 readNow -= length; 237 uncompressedBytesRemaining -= length; 238 } 239 } 240 241 /** 242 * Slide buffer. 243 * 244 * <p>Move all bytes of the buffer after the first block down to 245 * the beginning of the buffer.</p> 246 */ 247 private void slideBuffer() { 248 System.arraycopy(decompressBuf, blockSize, decompressBuf, 0, 249 blockSize * 2); 250 writeIndex -= blockSize; 251 readIndex -= blockSize; 252 } 253 254 255 /* 256 * For literals up to and including 60 bytes in length, the 257 * upper six bits of the tag byte contain (len-1). The literal 258 * follows immediately thereafter in the bytestream. - For 259 * longer literals, the (len-1) value is stored after the tag 260 * byte, little-endian. The upper six bits of the tag byte 261 * describe how many bytes are used for the length; 60, 61, 62 262 * or 63 for 1-4 bytes, respectively. The literal itself follows 263 * after the length. 264 */ 265 private int readLiteralLength(int b) throws IOException { 266 int length; 267 switch (b >> 2) { 268 case 60: 269 length = readOneByte(); 270 break; 271 case 61: 272 length = readOneByte(); 273 length |= readOneByte() << 8; 274 break; 275 case 62: 276 length = readOneByte(); 277 length |= readOneByte() << 8; 278 length |= readOneByte() << 16; 279 break; 280 case 63: 281 length = readOneByte(); 282 length |= readOneByte() << 8; 283 length |= readOneByte() << 16; 284 length |= (((long) readOneByte()) << 24); 285 break; 286 default: 287 length = b >> 2; 288 break; 289 } 290 291 return length + 1; 292 } 293 294 /** 295 * Literals are uncompressed data stored directly in the byte stream. 296 * 297 * @param length 298 * The number of bytes to read from the underlying stream 299 * 300 * @throws IOException 301 * If the first byte cannot be read for any reason other than 302 * end of file, or if the input stream has been closed, or if 303 * some other I/O error occurs. 304 * @return True if the decompressed data should be flushed 305 */ 306 private boolean expandLiteral(final int length) throws IOException { 307 int bytesRead = IOUtils.readFully(in, decompressBuf, writeIndex, length); 308 count(bytesRead); 309 if (length != bytesRead) { 310 throw new IOException("Premature end of stream"); 311 } 312 313 writeIndex += length; 314 return writeIndex >= 2 * this.blockSize; 315 } 316 317 /** 318 * Copies are references back into previous decompressed data, telling the 319 * decompressor to reuse data it has previously decoded. They encode two 320 * values: The offset, saying how many bytes back from the current position 321 * to read, and the length, how many bytes to copy. Offsets of zero can be 322 * encoded, but are not legal; similarly, it is possible to encode 323 * backreferences that would go past the end of the block (offset > current 324 * decompressed position), which is also nonsensical and thus not allowed. 325 * 326 * @param off 327 * The offset from the backward from the end of expanded stream 328 * @param length 329 * The number of bytes to copy 330 * 331 * @throws IOException 332 * An the offset expands past the front of the decompression 333 * buffer 334 * @return True if the decompressed data should be flushed 335 */ 336 private boolean expandCopy(final long off, int length) throws IOException { 337 if (off > blockSize) { 338 throw new IOException("Offset is larger than block size"); 339 } 340 int offset = (int) off; 341 342 if (offset == 1) { 343 byte lastChar = decompressBuf[writeIndex - 1]; 344 for (int i = 0; i < length; i++) { 345 decompressBuf[writeIndex++] = lastChar; 346 } 347 } else if (length < offset) { 348 System.arraycopy(decompressBuf, writeIndex - offset, 349 decompressBuf, writeIndex, length); 350 writeIndex += length; 351 } else { 352 int fullRotations = length / offset; 353 int pad = length - (offset * fullRotations); 354 355 while (fullRotations-- != 0) { 356 System.arraycopy(decompressBuf, writeIndex - offset, 357 decompressBuf, writeIndex, offset); 358 writeIndex += offset; 359 } 360 361 if (pad > 0) { 362 System.arraycopy(decompressBuf, writeIndex - offset, 363 decompressBuf, writeIndex, pad); 364 365 writeIndex += pad; 366 } 367 } 368 return writeIndex >= 2 * this.blockSize; 369 } 370 371 /** 372 * This helper method reads the next byte of data from the input stream. The 373 * value byte is returned as an <code>int</code> in the range <code>0</code> 374 * to <code>255</code>. If no byte is available because the end of the 375 * stream has been reached, an Exception is thrown. 376 * 377 * @return The next byte of data 378 * @throws IOException 379 * EOF is reached or error reading the stream 380 */ 381 private int readOneByte() throws IOException { 382 int b = in.read(); 383 if (b == -1) { 384 throw new IOException("Premature end of stream"); 385 } 386 count(1); 387 return b & 0xFF; 388 } 389 390 /** 391 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 392 * 1), stored as a little-endian varint. Varints consist of a series of 393 * bytes, where the lower 7 bits are data and the upper bit is set iff there 394 * are more bytes to be read. In other words, an uncompressed length of 64 395 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) 396 * would be stored as 0xFE 0xFF 0x7F. 397 * 398 * @return The size of the uncompressed data 399 * 400 * @throws IOException 401 * Could not read a byte 402 */ 403 private long readSize() throws IOException { 404 int index = 0; 405 long sz = 0; 406 int b = 0; 407 408 do { 409 b = readOneByte(); 410 sz |= (b & 0x7f) << (index++ * 7); 411 } while (0 != (b & 0x80)); 412 return sz; 413 } 414 415 /** 416 * Get the uncompressed size of the stream 417 * 418 * @return the uncompressed size 419 */ 420 public int getSize() { 421 return size; 422 } 423 424}