Skip to content

Commit

Permalink
[Feature][Connector-V2] Support use EasyExcel as read excel engine (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
dwave authored Dec 30, 2024
1 parent 37612d9 commit b8e1177
Show file tree
Hide file tree
Showing 20 changed files with 897 additions and 158 deletions.
2 changes: 1 addition & 1 deletion docs/en/connector-v2/sink/LocalFile.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ By default, we use 2PC commit to ensure `exactly-once`

## Options

| Name | Type | Required | Default | Description |
| Name | Type | Required | Default | Description |
|---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------|
| path | string | yes | - | |
| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. |
Expand Down
11 changes: 11 additions & 0 deletions docs/en/connector-v2/source/LocalFile.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| skip_header_row_number | long | no | 0 |
| schema | config | no | - |
| sheet_name | string | no | - |
| excel_engine | string | no | POI |
| xml_row_tag | string | no | - |
| xml_use_attr_format | boolean | no | - |
| file_filter_pattern | string | no | |
Expand Down Expand Up @@ -239,6 +240,16 @@ Only need to be configured when file_format is excel.

The sheet of the workbook to read.

### excel_engine [string]

Only need to be configured when file_format is excel.

The following Excel reading engines are supported:
`POI` `EasyExcel`

The default Excel reading engine is POI. However, POI can easily run out of memory when reading Excel files with more than 65,000 rows; in that case, switch to EasyExcel as the reading engine.


### xml_row_tag [string]

Only need to be configured when file_format is xml.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ public class DateTimeUtils {
FORMATTER_MAP.put(
Formatter.YYYY_MM_DD_HH_MM_SS_SLASH,
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SLASH.value));
FORMATTER_MAP.put(
Formatter.YYYY_M_D_HH_MM_SS_SLASH,
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_SLASH.value));
FORMATTER_MAP.put(
Formatter.YYYY_M_D_HH_MM_SS_ISO8601,
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_ISO8601.value));
FORMATTER_MAP.put(
Formatter.YYYY_M_D_HH_MM_SLASH,
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SLASH.value));
FORMATTER_MAP.put(
Formatter.YYYY_M_D_HH_MM_ISO8601,
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_ISO8601.value));
FORMATTER_MAP.put(
Formatter.YYYY_MM_DD_HH_MM_SS_NO_SPLIT,
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_NO_SPLIT.value));
Expand All @@ -73,9 +85,26 @@ public class DateTimeUtils {
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SSSSSSSSS_ISO8601.value));
}

// if the datatime string length is 17, find the DateTimeFormatter from this map
public static final Map<Pattern, DateTimeFormatter> YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP =
new LinkedHashMap<>();

// if the datatime string length is 15, find the DateTimeFormatter from this map
public static final Map<Pattern, DateTimeFormatter> YYYY_M_D_HH_MM_15_FORMATTER_MAP =
new LinkedHashMap<>();

// all Pattern in this set
public static Set<Map.Entry<Pattern, DateTimeFormatter>>
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();

// all Pattern in this set
public static Set<Map.Entry<Pattern, DateTimeFormatter>>
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();

// if the datatime string length is 19, find the DateTimeFormatter from this map
public static final Map<Pattern, DateTimeFormatter> YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP =
new LinkedHashMap<>();

public static Set<Map.Entry<Pattern, DateTimeFormatter>>
YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();

Expand Down Expand Up @@ -115,6 +144,22 @@ public class DateTimeUtils {
Pattern.compile("\\d{4}/\\d{2}/\\d{2}\\s\\d{2}:\\d{2}:\\d{2}"),
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SLASH.value));

YYYY_M_D_HH_MM_15_FORMATTER_MAP.put(
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{2}:\\d{2}"),
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SLASH.value));

YYYY_M_D_HH_MM_15_FORMATTER_MAP.put(
Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{2}:\\d{2}"),
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_ISO8601.value));

YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.put(
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}"),
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_SLASH.value));

YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.put(
Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}"),
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_ISO8601.value));

YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP.put(
Pattern.compile("\\d{4}/\\d{2}/\\d{2}\\s\\d{2}:\\d{2}.*"),
new DateTimeFormatterBuilder()
Expand Down Expand Up @@ -159,6 +204,12 @@ public class DateTimeUtils {
YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP.entrySet());
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP_ENTRY_SET.addAll(
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP.entrySet());

YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET.addAll(
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.entrySet());

YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET.addAll(
YYYY_M_D_HH_MM_15_FORMATTER_MAP.entrySet());
}

/**
Expand All @@ -176,14 +227,40 @@ public static DateTimeFormatter matchDateTimeFormatter(String dateTime) {
return entry.getValue();
}
}
for (Map.Entry<Pattern, DateTimeFormatter> entry :
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET) {
if (entry.getKey().matcher(dateTime).matches()) {
return entry.getValue();
}
}
} else if (dateTime.length() > 19) {
for (Map.Entry<Pattern, DateTimeFormatter> entry :
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP_ENTRY_SET) {
if (entry.getKey().matcher(dateTime).matches()) {
return entry.getValue();
}
}
} else if (dateTime.length() == 17 || dateTime.length() == 18) {
for (Map.Entry<Pattern, DateTimeFormatter> entry :
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET) {
if (entry.getKey().matcher(dateTime).matches()) {
return entry.getValue();
}
}
} else if (dateTime.length() == 15 || dateTime.length() == 16) {
for (Map.Entry<Pattern, DateTimeFormatter> entry :
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET) {
if (entry.getKey().matcher(dateTime).matches()) {
return entry.getValue();
}
}
} else if (dateTime.length() == 14) {
for (Map.Entry<Pattern, DateTimeFormatter> entry :
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET) {
if (entry.getKey().matcher(dateTime).matches()) {
return entry.getValue();
}
}
return YYYY_MM_DD_HH_MM_SS_14_FORMATTER;
}
return null;
Expand Down Expand Up @@ -247,6 +324,10 @@ public enum Formatter {
YYYY_MM_DD_HH_MM_SS_SSSSSS("yyyy-MM-dd HH:mm:ss.SSSSSS"),
YYYY_MM_DD_HH_MM_SS_SPOT("yyyy.MM.dd HH:mm:ss"),
YYYY_MM_DD_HH_MM_SS_SLASH("yyyy/MM/dd HH:mm:ss"),
YYYY_M_D_HH_MM_SLASH("yyyy/M/d HH:mm"),
YYYY_M_D_HH_MM_ISO8601("yyyy-M-d HH:mm"),
YYYY_M_D_HH_MM_SS_SLASH("yyyy/M/d HH:mm:ss"),
YYYY_M_D_HH_MM_SS_ISO8601("yyyy-M-d HH:mm:ss"),
YYYY_MM_DD_HH_MM_SS_NO_SPLIT("yyyyMMddHHmmss"),
YYYY_MM_DD_HH_MM_SS_ISO8601("yyyy-MM-dd'T'HH:mm:ss"),
YYYY_MM_DD_HH_MM_SS_SSS_ISO8601("yyyy-MM-dd'T'HH:mm:ss.SSS"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class DateUtils {
Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?Z"),
Pattern.compile("\\d{2}:\\d{2}:\\d{2}\\+\\d{2}:\\d{2}"),
Pattern.compile("\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?"),
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}")
};

public static final Map<Pattern, DateTimeFormatter> DATE_FORMATTER_MAP = new HashMap();
Expand Down Expand Up @@ -147,6 +148,12 @@ public class DateUtils {
.toFormatter());
DATE_FORMATTER_MAP.put(PATTERN_ARRAY[6], ISO_OFFSET_TIME);
DATE_FORMATTER_MAP.put(PATTERN_ARRAY[7], ISO_LOCAL_TIME);
DATE_FORMATTER_MAP.put(
PATTERN_ARRAY[8],
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.append(DateTimeFormatter.ofPattern("yyyy/M/d"))
.toFormatter());
}

/**
Expand Down Expand Up @@ -184,6 +191,7 @@ public static String toString(LocalDate date, Formatter formatter) {

public enum Formatter {
YYYY_MM_DD("yyyy-MM-dd"),
YYYY_M_D("yyyy/M/d"),
YYYY_MM_DD_SPOT("yyyy.MM.dd"),
YYYY_MM_DD_SLASH("yyyy/MM/dd");
private final String value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.time.format.DateTimeFormatter;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

public class TimeUtils {
private static final Map<Formatter, DateTimeFormatter> FORMATTER_MAP =
Expand All @@ -37,6 +38,29 @@ public static LocalTime parse(String time, Formatter formatter) {
return LocalTime.parse(time, FORMATTER_MAP.get(formatter));
}

/**
 * Time-of-day layouts recognised by {@link #matchTimeFormatter(String)}.
 *
 * <p>The element order is significant: the static initializer that fills
 * {@code Time_FORMATTER_MAP} refers to these patterns by index.
 */
public static final Pattern[] PATTERN_ARRAY =
        new Pattern[] {
            // [0] hours:minutes:seconds, e.g. 12:12:12
            Pattern.compile("\\d{2}:\\d{2}:\\d{2}"),
            // [1] hours:minutes:seconds.millis, e.g. 12:12:12.123
            // The dot is escaped so that only a literal '.' is accepted as the
            // separator; an unescaped '.' would match any character.
            Pattern.compile("\\d{2}:\\d{2}:\\d{2}\\.\\d{3}"),
        };

/**
 * Finds the {@link Formatter} whose pattern fully matches the given text.
 *
 * <p>Patterns are tried in the order they appear in {@code PATTERN_ARRAY}; the
 * formatter registered in {@code Time_FORMATTER_MAP} for the first full match
 * is returned.
 *
 * @param dateTime the time string to classify
 * @return the formatter registered for the first matching pattern, or
 *     {@code null} when no pattern matches
 */
public static Formatter matchTimeFormatter(String dateTime) {
    for (Pattern candidate : PATTERN_ARRAY) {
        if (candidate.matcher(dateTime).matches()) {
            return Time_FORMATTER_MAP.get(candidate);
        }
    }
    return null;
}

/**
 * Lookup from each pattern in {@code PATTERN_ARRAY} to the {@link Formatter}
 * used for text of that shape. Read by {@link #matchTimeFormatter(String)}.
 */
public static final Map<Pattern, Formatter> Time_FORMATTER_MAP = new HashMap<>();

static {
    // Keys are the very Pattern instances stored in PATTERN_ARRAY, so lookups
    // must be made with those same instances.
    Time_FORMATTER_MAP.put(PATTERN_ARRAY[0], Formatter.parse(Formatter.HH_MM_SS.value));
    Time_FORMATTER_MAP.put(PATTERN_ARRAY[1], Formatter.parse(Formatter.HH_MM_SS_SSS.value));
}

public static String toString(LocalTime time, Formatter formatter) {
return time.format(FORMATTER_MAP.get(formatter));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,15 @@ public void testAutoDateTimeFormatter() {
datetimeStr = "2020/10/10 10:10:10";
Assertions.assertEquals("2020-10-10T10:10:10", DateTimeUtils.parse(datetimeStr).toString());

datetimeStr = "2020/1/1 10:10";
Assertions.assertEquals("2020-01-01T10:10", DateTimeUtils.parse(datetimeStr).toString());

datetimeStr = "2024/12/2 10:10";
Assertions.assertEquals("2024-12-02T10:10", DateTimeUtils.parse(datetimeStr).toString());

datetimeStr = "2024/12/1 10:10";
Assertions.assertEquals("2024-12-01T10:10", DateTimeUtils.parse(datetimeStr).toString());

datetimeStr = "2020年10月10日 10时10分10秒";
Assertions.assertEquals("2020-10-10T10:10:10", DateTimeUtils.parse(datetimeStr).toString());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,17 @@ public void testMatchDateTimeFormatter() {
Assertions.assertEquals(
"2020-10-10",
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
datetimeStr = "2024/1/1";
Assertions.assertEquals(
"2024-01-01",
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
datetimeStr = "2024/10/1";
Assertions.assertEquals(
"2024-10-01",
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
datetimeStr = "2024/1/10";
Assertions.assertEquals(
"2024-01-10",
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.seatunnel.common.utils;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

public class TimeUtilsTest {

    /**
     * Each sample is classified with {@code matchTimeFormatter}, parsed with the
     * formatter it returned, and the parsed value's string form is compared with
     * the sample itself.
     */
    @Test
    public void testMatchTimeFormatter() {
        assertParsesBackTo("12:12:12");
        assertParsesBackTo("12:12:12.123");
    }

    /** Parses {@code sample} with the auto-detected formatter and expects the same text back. */
    private static void assertParsesBackTo(String sample) {
        Assertions.assertEquals(
                sample,
                TimeUtils.parse(sample, TimeUtils.matchTimeFormatter(sample)).toString());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
<hadoop-minikdc.version>3.1.4</hadoop-minikdc.version>
<dom4j.version>2.1.4</dom4j.version>
<jaxen.version>2.0.0</jaxen.version>
<easyexcel.version>4.0.3</easyexcel.version>
<fastexcel-reader.version>0.18.4</fastexcel-reader.version>
</properties>

<dependencyManagement>
Expand Down Expand Up @@ -158,6 +160,13 @@
<artifactId>jaxen</artifactId>
<version>${jaxen.version}</version>
</dependency>

<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>${easyexcel.version}</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ public class BaseSourceConfigOptions {
.noDefaultValue()
.withDescription("To be read sheet name,only valid for excel files");

/**
 * Option {@code excel_engine}: selects which {@link ExcelEngine} is used to read
 * excel files. The value is one of the {@link ExcelEngine} constants and
 * defaults to {@link ExcelEngine#POI}.
 */
public static final Option<ExcelEngine> EXCEL_ENGINE =
Options.key("excel_engine")
.enumType(ExcelEngine.class)
.defaultValue(ExcelEngine.POI)
.withDescription("To switch excel read engine, e.g. POI , EasyExcel");

public static final Option<String> XML_ROW_TAG =
Options.key("xml_row_tag")
.stringType()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.seatunnel.connectors.seatunnel.file.config;

import java.io.Serializable;

/**
 * Engines that can be selected for reading excel files. Every constant carries a
 * human-readable label, available through {@link #getExcelEngineName()}.
 */
public enum ExcelEngine implements Serializable {
    /** Engine labelled {@code "POI"}. */
    POI("POI"),

    /** Engine labelled {@code "EasyExcel"}. */
    EASY_EXCEL("EasyExcel");

    /** Display label of this engine; set once by the constructor. */
    private final String label;

    ExcelEngine(String label) {
        this.label = label;
    }

    /**
     * Returns the display label of this engine.
     *
     * @return the label given to this constant, e.g. {@code "EasyExcel"}
     */
    public String getExcelEngineName() {
        return this.label;
    }
}
Loading

0 comments on commit b8e1177

Please sign in to comment.