Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utility method to decode payload using the HTTP Content-Encoding header #88

Merged
merged 2 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@
<version>4.13.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.brotli</groupId>
<artifactId>dec</artifactId>
<version>0.1.2</version>
<scope>compile</scope>
<optional>true</optional>
</dependency>
</dependencies>

<properties>
Expand Down
22 changes: 22 additions & 0 deletions src/org/netpreserve/jwarc/BrotliUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2024 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc;

import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.brotli.dec.BrotliInputStream;

/**
 * Utility class to read brotli-encoded data, based on org.brotli:dec.
 *
 * <p>The brotli decoder is referenced only from this class. Calling
 * {@link #brotliChannel(ReadableByteChannel)} without the org.brotli:dec library
 * on the class path is expected to fail with a {@link NoClassDefFoundError}.
 */
public final class BrotliUtils {

    /** Not instantiable: this class only hosts static helpers. */
    private BrotliUtils() {
    }

    /**
     * Wraps a channel of brotli-compressed bytes in a channel that yields the
     * decompressed bytes.
     *
     * <p>The source channel is adapted to an {@code InputStream}, passed through a
     * {@code BrotliInputStream} and adapted back to a channel.
     *
     * @param brotli channel delivering brotli-encoded data
     * @return a channel delivering the decoded data
     * @throws IOException if the brotli input stream cannot be created
     */
    public static ReadableByteChannel brotliChannel(ReadableByteChannel brotli) throws IOException {
        return Channels.newChannel(new BrotliInputStream(Channels.newInputStream(brotli)));
    }
}
17 changes: 10 additions & 7 deletions src/org/netpreserve/jwarc/ChunkedBody.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;

/**
* A message body with chunked HTTP "Transfer-Encoding".
*/
class ChunkedBody extends MessageBody {
private final ReadableByteChannel channel;
private final ByteBuffer buffer;
Expand Down Expand Up @@ -95,7 +98,7 @@ public int read(ByteBuffer dst) throws IOException {
}


// line 138 "ChunkedBody.rl"
// line 141 "ChunkedBody.rl"


private int cs = chunked_start;
Expand Down Expand Up @@ -186,15 +189,15 @@ else if ( ( buffer.get(p)) > _chunked_trans_keys[_mid+1] )
switch ( _chunked_actions[_acts++] )
{
case 0:
// line 99 "ChunkedBody.rl"
// line 102 "ChunkedBody.rl"
{ tmp = tmp * 16 + Character.digit(buffer.get(p), 16); }
break;
case 1:
// line 100 "ChunkedBody.rl"
// line 103 "ChunkedBody.rl"
{ if (tmp != 0) { chunkLength = tmp; tmp = 0; { p += 1; _goto_targ = 5; if (true) continue _goto;} } }
break;
case 2:
// line 101 "ChunkedBody.rl"
// line 104 "ChunkedBody.rl"
{ chunkLength = 0; }
break;
// line 201 "ChunkedBody.java"
Expand All @@ -217,7 +220,7 @@ else if ( ( buffer.get(p)) > _chunked_trans_keys[_mid+1] )
break; }
}

// line 147 "ChunkedBody.rl"
// line 150 "ChunkedBody.rl"
if (cs == chunked_error) {
if (strict) {
throw new ParsingException("chunked encoding at position " + p + ": "
Expand All @@ -231,7 +234,7 @@ else if ( ( buffer.get(p)) > _chunked_trans_keys[_mid+1] )
}


// line 235 "ChunkedBody.java"
// line 233 "ChunkedBody.java"
private static byte[] init__chunked_actions_0()
{
return new byte [] {
Expand Down Expand Up @@ -380,5 +383,5 @@ private static byte[] init__chunked_trans_actions_0()
static final int chunked_en_chunks = 1;


// line 160 "ChunkedBody.rl"
// line 163 "ChunkedBody.rl"
}
3 changes: 3 additions & 0 deletions src/org/netpreserve/jwarc/ChunkedBody.rl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;

/**
* A message body with chunked HTTP "Transfer-Encoding".
*/
class ChunkedBody extends MessageBody {
private final ReadableByteChannel channel;
private final ByteBuffer buffer;
Expand Down
76 changes: 76 additions & 0 deletions src/org/netpreserve/jwarc/DecodedBody.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2024 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;


/**
 * A message body which decodes content on-the-fly using the specified encoding.
 *
 * <p>The encoded source channel is wrapped once, at construction time, in a decoding
 * channel chosen by the {@link Encoding}. All reads, {@link #isOpen()} and
 * {@link #close()} are delegated to that decoding channel.
 */
public class DecodedBody extends MessageBody {

    /** The content encodings this body is able to decode. */
    public enum Encoding {
        DEFLATE,
        GZIP,
        BROTLI
    }

    /** Decoding channel wrapped around the encoded source channel. */
    private final ReadableByteChannel channel;
    private final Encoding encoding;
    /** Number of decoded bytes handed out by {@link #read(ByteBuffer)} so far. */
    private long position = 0;

    /**
     * Wraps the given channel in a decoder for the given encoding.
     *
     * @param channel  channel delivering the encoded bytes
     * @param encoding encoding to undo
     * @throws IOException          if the decoding channel cannot be created, including
     *                              when the brotli decoder classes are missing
     * @throws NullPointerException if {@code channel} or {@code encoding} is null
     */
    private DecodedBody(ReadableByteChannel channel, Encoding encoding) throws IOException {
        // Fail fast with a named argument instead of a message-less NPE later on.
        java.util.Objects.requireNonNull(channel, "channel");
        this.encoding = java.util.Objects.requireNonNull(encoding, "encoding");
        switch (this.encoding) {
            case DEFLATE:
                this.channel = IOUtils.inflateChannel(channel);
                break;
            case GZIP:
                this.channel = IOUtils.gunzipChannel(channel);
                break;
            case BROTLI:
                try {
                    this.channel = BrotliUtils.brotliChannel(channel);
                } catch (NoClassDefFoundError e) {
                    // The brotli decoder library is not on the class path:
                    // report it as an I/O failure and keep the cause.
                    throw new IOException("Brotli decoder not found, please install org.brotli:dec", e);
                }
                break;
            default:
                throw new IOException("Unsupported encoding");
        }
    }

    /**
     * Creates a body that decodes the given channel on-the-fly.
     *
     * @param channel  channel delivering the encoded bytes
     * @param encoding encoding to undo
     * @return a new decoding message body
     * @throws IOException          if the decoding channel cannot be created
     * @throws NullPointerException if {@code channel} or {@code encoding} is null
     */
    public static DecodedBody create(ReadableByteChannel channel, Encoding encoding) throws IOException {
        return new DecodedBody(channel, encoding);
    }

    /**
     * Returns the number of decoded bytes read from this body so far.
     */
    @Override
    public long position() throws IOException {
        return position;
    }

    /**
     * Reads decoded bytes into {@code dst} and advances the position by the number
     * of bytes read.
     *
     * @return the value returned by the underlying decoding channel
     */
    @Override
    public int read(ByteBuffer dst) throws IOException {
        int n = channel.read(dst);
        // Only positive counts move the position; other return values are passed through as-is.
        if (n > 0) {
            position += n;
        }
        return n;
    }

    @Override
    public boolean isOpen() {
        return channel.isOpen();
    }

    /** Closes the decoding channel. */
    @Override
    public void close() throws IOException {
        channel.close();
    }
}
33 changes: 33 additions & 0 deletions src/org/netpreserve/jwarc/HttpMessage.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
package org.netpreserve.jwarc;

import java.nio.charset.Charset;
import java.util.List;

import static java.nio.charset.StandardCharsets.ISO_8859_1;

import java.io.IOException;

public abstract class HttpMessage extends Message {
HttpMessage(MessageVersion version, MessageHeaders headers, MessageBody body) {
super(version, headers, body);
Expand All @@ -19,6 +22,36 @@ Charset headerCharset() {
return ISO_8859_1;
}

/**
 * Returns the body of this message with its HTTP Content-Encoding undone.
 *
 * <p>The encoding name is matched case-insensitively:
 * <ul>
 * <li>no Content-Encoding header, "identity" or "none": the body is returned unchanged;</li>
 * <li>"gzip" or "x-gzip", "br", "deflate": the body is wrapped in a {@link DecodedBody}
 * using the matching {@link DecodedBody.Encoding}.</li>
 * </ul>
 *
 * @return the message body, decoded according to the Content-Encoding header
 * @throws IOException if more than one Content-Encoding value is present, if the
 *                     encoding is not supported, or if the decoding body cannot
 *                     be created
 */
public MessageBody bodyDecoded() throws IOException {
    MessageBody raw = body();
    List<String> encodings = headers().all("Content-Encoding");

    if (encodings.isEmpty()) {
        return raw;
    }
    if (encodings.size() > 1) {
        throw new IOException("Multiple Content-Encodings not supported: " + encodings);
    }

    String name = encodings.get(0);
    if (name.equalsIgnoreCase("identity") || name.equalsIgnoreCase("none")) {
        // Nothing to undo.
        return raw;
    }

    DecodedBody.Encoding codec;
    if (name.equalsIgnoreCase("gzip") || name.equalsIgnoreCase("x-gzip")) {
        codec = DecodedBody.Encoding.GZIP;
    } else if (name.equalsIgnoreCase("br")) {
        codec = DecodedBody.Encoding.BROTLI;
    } else if (name.equalsIgnoreCase("deflate")) {
        codec = DecodedBody.Encoding.DEFLATE;
    } else {
        throw new IOException("Content-Encoding not supported: " + name);
    }
    return DecodedBody.create(raw, codec);
}

public abstract static class AbstractBuilder<R extends HttpMessage, B extends AbstractBuilder<R, B>> extends Message.AbstractBuilder<R, B> {
public AbstractBuilder() {
super(MessageVersion.HTTP_1_1);
Expand Down
54 changes: 19 additions & 35 deletions src/org/netpreserve/jwarc/tools/ExtractTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;

Expand Down Expand Up @@ -46,43 +45,28 @@ private static void writeHttpHeaders(WritableByteChannel out, WarcRecord record)
}

private static void writePayload(WritableByteChannel out, WarcRecord record) throws IOException {
MessageBody payload;
List<String> contentEncodings = Collections.emptyList();
if (record instanceof WarcResponse) {
if (record.contentType().base().equals(MediaType.HTTP)) {
HttpResponse response = ((WarcResponse) record).http();
payload = response.body();
contentEncodings = response.headers().all("Content-Encoding");
} else {
payload = ((WarcResponse) record).payload().map(WarcPayload::body).orElse(MessageBody.empty());
}
} else if (record instanceof WarcRequest) {
if (record.contentType().base().equals(MediaType.HTTP)) {
HttpRequest request = ((WarcRequest) record).http();
payload = request.body();
contentEncodings = request.headers().all("Content-Encoding");
try {
MessageBody payload;
if (record instanceof WarcResponse) {
if (record.contentType().base().equals(MediaType.HTTP)) {
HttpResponse response = ((WarcResponse) record).http();
payload = response.bodyDecoded();
} else {
payload = ((WarcResponse) record).payload().map(WarcPayload::body).orElse(MessageBody.empty());
}
} else if (record instanceof WarcRequest) {
if (record.contentType().base().equals(MediaType.HTTP)) {
HttpRequest request = ((WarcRequest) record).http();
payload = request.bodyDecoded();
} else {
payload = ((WarcRequest) record).payload().map(WarcPayload::body).orElse(MessageBody.empty());
}
} else {
payload = ((WarcRequest) record).payload().map(WarcPayload::body).orElse(MessageBody.empty());
payload = record.body();
}
} else {
payload = record.body();
}
if (contentEncodings.isEmpty()) {
writeBody(out, payload);
} else {
if (contentEncodings.size() > 1) {
System.err.println("Multiple Content-Encodings not supported: " + contentEncodings);
} else if (contentEncodings.get(0).equalsIgnoreCase("identity")
|| contentEncodings.get(0).equalsIgnoreCase("none")) {
writeBody(out, payload);
} else if (contentEncodings.get(0).equalsIgnoreCase("gzip")
|| contentEncodings.get(0).equalsIgnoreCase("x-gzip")) {
writeBody(out, IOUtils.gunzipChannel(payload));
} else if (contentEncodings.get(0).equalsIgnoreCase("deflate")) {
writeBody(out, IOUtils.inflateChannel(payload));
} else {
System.err.println("Content-Encoding not supported: " + contentEncodings.get(0));
}
} catch (IOException e) {
System.err.println("Failed to read payload: " + e.getMessage());
}
}

Expand Down
Loading