Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added an optional parameter to specify the character encoding of a source #78

Merged
merged 1 commit into from
Dec 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions src/main/java/io/frictionlessdata/tableschema/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
Expand Down Expand Up @@ -123,25 +125,40 @@ public static Table fromSource(InputStream data, InputStream schema, CSVFormat f
* @param format The expected CSVFormat if dataSource is a CSV-containing InputStream; ignored for JSON data.
* Can be `null`
*/
public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format) {
Table table = fromSource(dataSource, basePath);
public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format, Charset charset) {
Table table = fromSource(dataSource, basePath, charset);
table.schema = schema;
if (null != format) {
table.setCsvFormat(format);
}
return table;
}

/**
* Create Table from a {@link java.io.File} containing the CSV/JSON data, with a
* Schema and an optional CSVFormat, but without naming a character encoding.
* Delegates to the overload that additionally takes a {@link java.nio.charset.Charset},
* passing `null` for it.
* @param dataSource File for reading the data from
* @param basePath Parent directory
* @param schema The Schema to assign to the created Table
* @param format The expected CSVFormat. Can be `null`
*/
public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format) {
return fromSource(dataSource, basePath, schema, format, null);
}

/**
* Create Table from a {@link java.io.File} containing the CSV/JSON
* data and without either a Schema or a CSVFormat.
* @param dataSource relative File for reading the data from. Must be inside `basePath`
* @param basePath Parent directory
* @param charset Character encoding of the file. Can be `null`, in which case the
* encoding reported by the BOM-stripping input stream is used instead
* @return a new Table whose data source was created from the file contents
*/
public static Table fromSource(File dataSource, File basePath, Charset charset) {
Table table = new Table();
table.dataSource = TableDataSource.fromSource(dataSource, basePath, charset);
return table;
}

/**
* Create Table from a {@link java.io.File} containing the CSV/JSON
* data and without either a Schema or a CSVFormat.
* @param dataSource relative File for reading the data from. Must be inside `basePath`
* @param basePath Parent directory
*/
public static Table fromSource(File dataSource, File basePath) {
Table table = new Table();
table.dataSource = TableDataSource.fromSource(dataSource, basePath);
return table;
return fromSource(dataSource, basePath, null);
}

/**
Expand Down Expand Up @@ -729,4 +746,4 @@ private void writeCSVData(Map<Integer, Integer> mapping, CSVPrinter csvPrinter)
}
});
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
Expand Down Expand Up @@ -32,7 +33,7 @@ public List<String[]> getDataAsStringArray() {
}

String getFileContents(String path) throws IOException {
return TableDataSource.getFileContents(path, workDir);
return TableDataSource.getFileContents(path, workDir, Charset.defaultCharset());
}

}
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package io.frictionlessdata.tableschema.tabledatasource;

import com.fasterxml.jackson.databind.node.ArrayNode;
import com.google.common.primitives.Chars;
import io.frictionlessdata.tableschema.Table;
import io.frictionlessdata.tableschema.exception.TableIOException;
import io.frictionlessdata.tableschema.inputstream.ByteOrderMarkStrippingInputStream;
import io.frictionlessdata.tableschema.util.JsonUtil;
import org.apache.commons.csv.CSVFormat;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
Expand Down Expand Up @@ -81,9 +83,9 @@ static TableDataSource fromSource(String input) {
* {@link CsvTableDataSource} based on input format
* @return DataSource created from input File
*/
static TableDataSource fromSource(File input, File workDir) {
static TableDataSource fromSource(File input, File workDir, Charset charset) {
try {
String content = getFileContents(input.getPath(), workDir);
String content = getFileContents(input.getPath(), workDir, charset);
return fromSource(content);
} catch (IOException ex) {
throw new TableIOException(ex);
Expand All @@ -109,7 +111,7 @@ static TableDataSource fromSource(InputStream input) {
return fromSource(content);
}

static String getFileContents(String path, File workDir) throws IOException {
static String getFileContents(String path, File workDir, Charset charset) throws IOException {
String lines;
if (workDir.getName().endsWith(".zip")) {
//have to exchange the backslashes on Windows, as
Expand All @@ -119,7 +121,7 @@ static String getFileContents(String path, File workDir) throws IOException {
ZipFile zipFile = new ZipFile(workDir.getAbsolutePath());
ZipEntry entry = zipFile.getEntry(path);
InputStream stream = zipFile.getInputStream(entry);
lines = readSkippingBOM(stream);
lines = readSkippingBOM(stream, charset);
} else {
// The path value can either be a relative path or a full path.
// If it's a relative path then build the full path by using the working directory.
Expand All @@ -129,7 +131,7 @@ static String getFileContents(String path, File workDir) throws IOException {
// - https://github.com/frictionlessdata/tableschema-java/issues/29
// - https://frictionlessdata.io/specs/data-resource/#url-or-path
Path resolvedPath = TableDataSource.toSecure(new File(path).toPath(), workDir.toPath());
lines = readSkippingBOM(new FileInputStream(resolvedPath.toFile()));
lines = readSkippingBOM(new FileInputStream(resolvedPath.toFile()), charset);
}
return lines;
}
Expand All @@ -141,10 +143,10 @@ static String getFileContents(String path, File workDir) throws IOException {
* @param is InputStream to read from
* @return Contents of the InputStream as a String
*/
static String readSkippingBOM(InputStream is) {
static String readSkippingBOM(InputStream is, Charset charset) {
String content;
try (ByteOrderMarkStrippingInputStream bims = new ByteOrderMarkStrippingInputStream(is);
InputStreamReader isr = new InputStreamReader(bims.skipBOM(), bims.getCharset());
InputStreamReader isr = new InputStreamReader(bims.skipBOM(), charset == null ? bims.getCharset() : charset);
BufferedReader rdr = new BufferedReader(isr)) {
content = rdr.lines().collect(Collectors.joining("\n"));
} catch (IOException ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
import org.junit.jupiter.api.Test;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;

import static io.frictionlessdata.tableschema.TestHelper.getTestDataDirectory;
import static io.frictionlessdata.tableschema.TestHelper.getTestsuiteDataDirectory;

public class TableEncodingTests {
Expand All @@ -17,12 +19,11 @@ public class TableEncodingTests {
// currently disabled
@Test
@DisplayName("Create a Table from a ISO-8859-1 encoded file")
@Disabled
void createTableFromIso8859() throws Exception{
File testDataDir = getTestsuiteDataDirectory();
File testDataDir = getTestDataDirectory();

Table table
= Table.fromSource(new File("csv/encodings/iso8859.csv"), testDataDir, null, null);
= Table.fromSource(new File("csv/encodings/iso8859.csv"), testDataDir, null, null, StandardCharsets.ISO_8859_1);

Iterator<Object[]> iter = table.iterator();
Object[] row = iter.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.junit.jupiter.api.Test;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -74,7 +75,7 @@ void testJsonArrayDataSourceHeaders() throws Exception{
@DisplayName("Validate creating a JsonArrayTableDataSource from JSON file")
void testSafePathCreationJson() throws Exception {
TableDataSource ds = TableDataSource.fromSource(new File("simple_geojson.json"),
TestHelper.getTestDataDirectory());
TestHelper.getTestDataDirectory(), Charset.defaultCharset());
Assertions.assertNotNull(ds);
}
/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand Down Expand Up @@ -85,7 +86,7 @@ public void testSafePath() throws Exception {
@Test
@DisplayName("Create a TableDataSource from a safe path and ensure no exception is thrown")
public void testSafePathCreationCsv() throws Exception {
TableDataSource ds = TableDataSource.fromSource(new File ("data/population.csv"), TestHelper.getTestDataDirectory());
TableDataSource ds = TableDataSource.fromSource(new File ("data/population.csv"), TestHelper.getTestDataDirectory(), Charset.defaultCharset());
Assertions.assertNotNull(ds);
}

Expand Down Expand Up @@ -162,7 +163,7 @@ public void testZipInputFileCreationCsv2() throws Exception {
TableDataSource ds;
File basePath = new File(TestHelper.getTestDataDirectory(),"data/population.zip");
File inFile = new File("population.csv");
ds = TableDataSource.fromSource(inFile,basePath);
ds = TableDataSource.fromSource(inFile,basePath, null);
List<String[]> data = ds.getDataAsStringArray();
Assertions.assertNotNull(data);
byte[] bytes = Files.readAllBytes(new File(TestHelper.getTestDataDirectory(), "data/population.csv").toPath());
Expand Down
2 changes: 2 additions & 0 deletions src/test/resources/fixtures/csv/encodings/iso8859.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name
Réunion