Skip to content

Commit

Permalink
[Feature-#1918][s3] Add support for reading all types of documents su…
Browse files Browse the repository at this point in the history
…pported by Apache Tika
  • Loading branch information
libailin authored and lihongwei committed Sep 20, 2024
1 parent 33c14a5 commit 112f183
Show file tree
Hide file tree
Showing 15 changed files with 695 additions and 18 deletions.
56 changes: 56 additions & 0 deletions chunjun-connectors/chunjun-connector-format-base/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>chunjun-connectors</artifactId>
<groupId>com.dtstack.chunjun</groupId>
<version>${revision}</version>
</parent>
<modelVersion>4.0.0</modelVersion>

<artifactId>chunjun-connector-format-base</artifactId>
<name>ChunJun : Connector : Format Base</name>

<properties>
<tika.version>2.8.0</tika.version>
</properties>

<dependencies>
<!-- 部署时上传[tika-core-2.8.0.jar] https://repo1.maven.org/maven2/org/apache/tika/tika-core/2.8.0/tika-core-2.8.0.jar -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
<scope>provided</scope>
</dependency>

<!-- 部署时上传[tika-server-standard-2.8.0.jar] https://archive.apache.org/dist/tika/2.8.0/tika-server-standard-2.8.0.jar -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dtstack.chunjun.connector.format.base.common;

import lombok.Data;

@Data
public class TikaData {

private String[] data;
private boolean end;

public TikaData(String[] data, boolean end) {
this.data = data;
this.end = end;
}

public TikaData(String[] data) {
this.data = data;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dtstack.chunjun.connector.format.base.config;

import lombok.Data;

import java.io.Serializable;

@Data
public class TikaReadConfig implements Serializable {

public static final String ORIGINAL_FILENAME = "_ORIGINAL_FILENAME";

private static final long serialVersionUID = 9142075335239994317L;

/** 是否启用tika提取 */
private boolean useExtract = false;

/** 内容重合度比例值 0-100 */
private int overlapRatio = 0;

/** 是否启动分块 */
private boolean enableChunk = false;

/** 分块大小 */
private int chunkSize = -1;

public boolean isEnableChunk() {
return chunkSize > 0 ? true : false;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dtstack.chunjun.connector.format.base.options;

import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;

public class TikaOptions {

public static final ConfigOption<Boolean> USE_EXTRACT =
ConfigOptions.key("tika-use-extract")
.booleanType()
.defaultValue(false)
.withDescription("use tika extract");

public static final ConfigOption<Integer> OVERLAP_RATIO =
ConfigOptions.key("tika-overlap-ratio")
.intType()
.defaultValue(0)
.withDescription("content overlap ratio");

public static final ConfigOption<Integer> CHUNK_SIZE =
ConfigOptions.key("tika-chunk-size")
.intType()
.defaultValue(-1)
.withDescription("chunk size");
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dtstack.chunjun.connector.format.base.source;

import com.dtstack.chunjun.connector.format.base.common.TikaData;
import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;

import java.io.Closeable;
import java.io.InputStream;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import static java.util.concurrent.TimeUnit.NANOSECONDS;

@Slf4j
public class TikaInputFormat implements Closeable {
private ThreadPoolExecutor executorService;
private final BlockingQueue<TikaData> queue = new LinkedBlockingQueue<>(4096);
private TikaReadConfig tikaReadConfig;
private TikaData row;
private int fieldCount;

public TikaInputFormat(TikaReadConfig tikaReadConfig, int fieldCount) {
this.tikaReadConfig = tikaReadConfig;
this.fieldCount = fieldCount;
}

public void open(InputStream inputStream, String originalFilename) {
this.executorService =
new ThreadPoolExecutor(
1,
1,
0,
NANOSECONDS,
new LinkedBlockingDeque<>(2),
new BasicThreadFactory.Builder()
.namingPattern("tika-schedule-pool-%d")
.daemon(false)
.build());
TikaReaderExecutor executor =
new TikaReaderExecutor(tikaReadConfig, queue, inputStream, originalFilename);
executorService.execute(executor);
}

public boolean hasNext() {
try {
row = queue.poll(3000L, TimeUnit.MILLISECONDS);
// 如果没有数据,则继续等待
if (row == null) {
log.warn("Waiting for queue get tika data");
hasNext();
}
if (row != null && row.isEnd()) {
return false;
}
return true;
} catch (InterruptedException e) {
throw new RuntimeException(
"cannot get data from the queue because the current thread is interrupted.", e);
}
}

/** 根据声明的字段个数,对数据进行补全 */
public String[] nextRecord() {
String[] data = row.getData();
if (fieldCount == data.length) {
return data;
}
if (fieldCount < data.length) {
fieldCount = data.length;
}
return formatValue(data);
}

private String[] formatValue(String[] data) {
String[] record = initDataContainer(fieldCount, "");
// because fieldCount is always >= data.length
System.arraycopy(data, 0, record, 0, data.length);
return record;
}

private String[] initDataContainer(int capacity, String defValue) {
String[] container = new String[capacity];
for (int i = 0; i < capacity; i++) {
container[i] = defValue;
}
return container;
}

@Override
public void close() {
if (executorService != null) {
executorService.shutdown();
queue.clear();
}
}
}
Loading

0 comments on commit 112f183

Please sign in to comment.