Skip to content

Commit

Permalink
add google drive ocr capability
Browse files Browse the repository at this point in the history
  • Loading branch information
umjammer committed Mar 4, 2020
1 parent 4a078a9 commit c4d8029
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 22 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fuse for java and many file systems.
| hfs+ (dmg) || | | | | | | | | [hfsexplorer](https://github.com/umjammer/hfsexplorer) |
| gathered || | | | | | | | | |
| cyberduck || | | | | | | | | [vavi-nio-file-cyberduck](https://github.com/umjammer/vavi-nio-file-cyberduck), [cyberduck](https://github.com/iterate-ch/cyberduck) |
| discutils (vhd/ntfs) || | | | | | | | | [vavi-nio-file-discutils](https://github.com/umjammer/vavi-nio-file-discutils) |
| discutils (vdi/ntfs) || | | | | | | | | [vavi-nio-file-discutils](https://github.com/umjammer/vavi-nio-file-discutils) |


# TODO
Expand Down
13 changes: 9 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@

<groupId>vavi</groupId>
<artifactId>vavi-apps-fuse</artifactId>
<version>0.0.6-SNAPSHOT</version>
<version>0.0.6</version>
<packaging>jar</packaging>

<name>vavi-apps-fuse</name>
<url>https://github.com/umjammer/vavi-apps-fuse</url>
<description>0.0.5
<description>0.0.6

improve google drive
add google drive ocr capability
gatheredfs alias

0.0.5

relive fuse-jna as generic
make onedrive (cyberduck engine) work
Expand Down Expand Up @@ -43,8 +49,7 @@ TODO

apple photos.app

google automatic authentication (how to click button)
</description>
google automatic authentication (how to click button)</description>
<scm>
<url>https://github.com/umjammer/vavi-apps-fuse</url>
</scm>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ public enum GoogleDriveCopyOption implements CopyOption {
EXPORT_AS_GDOCS("application/vnd.google-apps.document");

/** */
private String mimeType;
private String value;

/** */
private GoogleDriveCopyOption(String mimeType) {
this.mimeType = mimeType;
private GoogleDriveCopyOption(String value) {
this.value = value;
}

/** */
public String getMimeType() {
return mimeType;
public String getValue() {
return value;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ public File getEntry(Path path) throws IOException {
File entry;
if (path.getNameCount() == 0) {
entry = drive.files().get(toPathString(path)).execute();
cache.putFile(path, entry);
return entry;
cache.putFile(path, entry);
return entry;
} else {
List<Path> siblings;
if (!cache.containsFolder(path.getParent())) {
Expand Down Expand Up @@ -163,9 +163,15 @@ public InputStream newInputStream(final Path path, final Set<? extends OpenOptio
throw new IsDirectoryException("path: " + path);
}

// TODO
// TODO detect automatically?
ByteArrayOutputStream baos = new ByteArrayOutputStream();
drive.files().get(entry.getId()).executeMediaAndDownloadTo(baos);
if (options.stream().anyMatch(o -> GoogleDriveOpenOption.class.isInstance(o))) {
GoogleDriveOpenOption option = GoogleDriveOpenOption.class
.cast(options.stream().filter(o -> GoogleDriveOpenOption.class.isInstance(o)).findFirst().get());
drive.files().export(entry.getId(), option.getValue()).executeMediaAndDownloadTo(baos);
} else {
drive.files().get(entry.getId()).executeMediaAndDownloadTo(baos);
}
return new ByteArrayInputStream(baos.toByteArray());
}

Expand All @@ -186,11 +192,11 @@ public OutputStream newOutputStream(final Path path, final Set<? extends OpenOpt
options.forEach(o -> { System.err.println("newOutputStream: " + o); });
}

// TODO mime type
if (options.stream().anyMatch(o -> GoogleDriveCopyOption.class.isInstance(o))) {
String mimeType = options.stream()
.filter(o -> GoogleDriveCopyOption.class.isInstance(o))
.map(o -> GoogleDriveCopyOption.class.cast(o).getMimeType()).findFirst().get();
// TODO detect automatically?
if (options.stream().anyMatch(o -> GoogleDriveOpenOption.class.isInstance(o))) {
@SuppressWarnings("unused")
GoogleDriveOpenOption option = GoogleDriveOpenOption.class
.cast(options.stream().filter(o -> GoogleDriveOpenOption.class.isInstance(o)).findFirst().get());
}

return new Util.OutputStreamForUploading() {
Expand Down Expand Up @@ -287,7 +293,7 @@ protected long getLeftOver() throws IOException {
@Override
public void close() throws IOException {
if (lock != null) {
System.out.println("SeekableByteChannelForWriting::close: scpecial: " + path);
System.out.println("SeekableByteChannelForWriting::close: scpecial: " + path);
return;
}
super.close();
Expand Down Expand Up @@ -336,7 +342,7 @@ public void copy(final Path source, final Path target, final Set<CopyOption> opt
}
}

copyEntry(source, target);
copyEntry(source, target, options);
}

@Override
Expand Down Expand Up @@ -479,13 +485,16 @@ private void removeEntry(Path path) throws IOException {
}

/** */
private void copyEntry(final Path source, final Path target) throws IOException {
private void copyEntry(final Path source, final Path target, Set<CopyOption> options) throws IOException {
final File sourceEntry = cache.getEntry(source);
File targetParentEntry = cache.getEntry(target.getParent());
if (!isFolder(sourceEntry)) {
File entry = new File();
entry.setName(toFilenameString(target));
entry.setParents(Arrays.asList(targetParentEntry.getId()));
if (options.stream().anyMatch(o -> o.equals(GoogleDriveCopyOption.EXPORT_AS_GDOCS))) {
entry.setMimeType(GoogleDriveCopyOption.EXPORT_AS_GDOCS.getValue());
}
File newEntry = drive.files().copy(sourceEntry.getId(), entry)
.setFields("id, parents, name, size, mimeType, createdTime").execute();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ public class GoogleDriveFileSystemOptionsFactory extends FileSystemOptionsFactor

/**
 * Registers the options this factory declares for the Google Drive file system:
 * one link option, one copy option and one read-open option.
 * <p>
 * NOTE(review): the {@code add*Option} methods are not defined in this view;
 * presumably they are inherited from the options-factory superclass and mark
 * the given option as supported -- confirm against that class.
 */
public GoogleDriveFileSystemOptionsFactory() {
// standard NIO link option
addLinkOption(LinkOption.NOFOLLOW_LINKS);
// Google Drive specific copy option (copy the file as a Google Docs document)
addCopyOption(GoogleDriveCopyOption.EXPORT_AS_GDOCS);
// Google Drive specific open option for reading; only the DOCX variant is registered here
addReadOpenOption(GoogleDriveOpenOption.EXPORT_WITH_GDOCS_DOCX);
}
}

Expand Down
37 changes: 37 additions & 0 deletions src/main/java/vavi/nio/file/googledrive/GoogleDriveOpenOption.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2017 by Naohide Sano, All rights reserved.
*
* Programmed by Naohide Sano
*/

package vavi.nio.file.googledrive;

import java.nio.file.OpenOption;


/**
* GoogleDriveOpenOption.
*
* @author <a href="mailto:[email protected]">Naohide Sano</a> (umjammer)
* @version 0.00 2017/03/01 umjammer initial version <br>
*/
public enum GoogleDriveOpenOption implements OpenOption {

EXPORT_WITH_GDOCS_DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
EXPORT_WITH_GDOCS_XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");

/** */
private String value;

/** */
private GoogleDriveOpenOption(String value) {
this.value = value;
}

/** */
public String getValue() {
return value;
}
}

/* */
172 changes: 172 additions & 0 deletions src/test/java/GoogleOCR.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Copyright (c) 2020 by Naohide Sano, All rights reserved.
*
* Programmed by Naohide Sano
*/

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.stream.Stream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import vavi.nio.file.googledrive.GoogleDriveCopyOption;
import vavi.nio.file.googledrive.GoogleDriveOpenOption;
import vavi.util.archive.Entry;
import vavi.util.archive.zip.ZipArchive;


/**
* google drive ocr
*
* @author <a href="mailto:[email protected]">Naohide Sano</a> (umjammer)
* @version 0.00 2020/03/03 umjammer initial version <br>
*/
public final class GoogleOCR {

    /**
     * Runs the OCR work flow against a Google Drive file system.
     * <p>
     * The work flow has three stages ({@link #extract}, {@link #ocr}, {@link #gather});
     * only the last one is currently enabled, the others are switched by hand.
     * The JVM is always terminated explicitly, with exit code {@code -1} on any failure.
     *
     * @param args 0: email, 1: zip file, 2: extract to dir, 3: extracted dir, 4: output ocr dir
     */
    public static void main(final String... args) {
        int exitCode = 0;
        try {
            GoogleOCR app = new GoogleOCR();
            FileSystem fs = app.prepare(args);
            System.err.println("fs: " + fs);
//            app.extract(fs, args);
//            app.ocr(fs, args);
            app.gather(fs, args);
        } catch (Exception e) {
            e.printStackTrace();
            exitCode = -1;
        } finally {
            //Thread.getAllStackTraces().keySet().forEach(System.err::println);
            System.exit(exitCode);
        }
    }

    /**
     * Opens the Google Drive file system for the given account.
     *
     * @param args 0: email
     * @return the file system created for the {@code googledrive:///?id=<email>} URI
     * @throws IOException when the file system cannot be created
     */
    FileSystem prepare(final String... args) throws IOException {
        String email = args[0];

        URI uri = URI.create("googledrive:///?id=" + email);

        // typed empty map instead of the raw Collections.EMPTY_MAP constant
        return FileSystems.newFileSystem(uri, Collections.<String, Object>emptyMap());
    }

    /**
     * Downloads a zip file from the drive to a local temporary file and uploads
     * each of its entries, unpacked, below the target directory on the drive.
     * Entries that already exist on the drive are skipped, so the method can be re-run.
     *
     * @param fs file system holding both the zip file and the target directory
     * @param args 1: zip file, 2: extract to dir
     * @throws IOException when copying, directory creation or deletion fails
     */
    void extract(FileSystem fs, final String... args) throws IOException {
        String filename = args[1];
        String dirname = args[2];

        Path dir = fs.getPath(dirname);
        if (!Files.exists(dir)) {
            Files.createDirectories(dir);
        }

        Path file = fs.getPath(filename);
        // an already existing local copy is reused as is
        Path tmp = Paths.get("/tmp", file.getFileName().toString());
        if (!Files.exists(tmp)) {
            Files.copy(file, tmp);
            System.err.println("copy: " + file + " to " + tmp);
        }
        // NOTE(review): the archive is never closed here; close it if ZipArchive supports that -- confirm
        ZipArchive archive = new ZipArchive(tmp.toFile());
        for (Entry entry : archive.entries()) {
            // NOTE(review): the entry name is resolved unchecked; a crafted archive could
            // point outside of dir ("zip slip") -- validate if archives may be untrusted
            Path path = dir.resolve(entry.getName());
            if (!Files.exists(path.getParent())) {
                Files.createDirectories(path.getParent());
            }
            if (!Files.exists(path)) {
                if (entry.isDirectory()) {
                    Files.createDirectory(path);
                } else {
                    // Files.copy does not close its source stream, so close it here
                    try (InputStream in = archive.getInputStream(entry)) {
                        Files.copy(in, path);
                    }
                }
                System.err.println("extract: " + path);
                pause(300);
            } else {
                System.err.println("skip: " + path);
            }
        }
        Files.delete(tmp);
        System.err.println("rm: " + tmp);
    }

    /**
     * Copies every image of the extracted directory into the OCR directory
     * using {@link GoogleDriveCopyOption#EXPORT_AS_GDOCS}.
     *
     * @param fs file system holding both directories
     * @param args 3: extracted dir, 4: output ocr dir
     * @throws IOException when the extracted directory cannot be listed
     * @throws IllegalStateException wrapping the {@link IOException} of a failed copy
     */
    void ocr(FileSystem fs, final String... args) throws IOException {
        String extractedDirname = args[3];
        // a directory other than the listed one is used in order to avoid
        // ConcurrentModificationException while iterating Files#list()
        String ocrDirname = args[4];
        Path ocrDir = fs.getPath(ocrDirname);
        // the stream of Files.list keeps the directory open until it is closed
        try (Stream<Path> images = Files.list(fs.getPath(extractedDirname))) {
            images.forEach(p -> {
                try {
                    ocrIfImage(p, ocrDir);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                }
            });
        }
    }

    /**
     * Prints the text of every document in the OCR directory to standard output,
     * in sorted path order. Each document is read as DOCX
     * ({@link GoogleDriveOpenOption#EXPORT_WITH_GDOCS_DOCX}).
     *
     * @param fs file system holding the OCR directory
     * @param args 4: output ocr dir
     * @throws IOException when the OCR directory cannot be listed
     * @throws IllegalStateException wrapping the {@link IOException} of a failed read
     */
    void gather(FileSystem fs, final String... args) throws IOException {
        String ocrDirname = args[4];
        // the stream of Files.list keeps the directory open until it is closed
        try (Stream<Path> documents = Files.list(fs.getPath(ocrDirname))) {
            documents.sorted().forEach(p -> {
                try {
                    printDocumentText(p);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                }
            });
        }
    }

    /** Copies {@code image} into {@code ocrDir} as a Google Docs document unless it is no ".jpg" or already done. */
    private static void ocrIfImage(Path image, Path ocrDir) throws IOException {
        String fn = image.getFileName().toString();
        if (fn.indexOf(".jpg") > 0) {
            // gdocs doesn't have extension
            String ocrName = fn.substring(0, fn.lastIndexOf('.')) + "_ocr";
            Path ocr = ocrDir.resolve(ocrName);
            if (!Files.exists(ocr)) {
                Path x = Files.copy(image, ocr, GoogleDriveCopyOption.EXPORT_AS_GDOCS);
                System.err.println("ocr: " + x);
                pause(1000);
            } else {
                System.err.println("skip ocr: " + fn);
            }
        }
    }

    /** Reads {@code document} as a DOCX zip and prints the tag-stripped content of "word/document.xml". */
    private static void printDocumentText(Path document) throws IOException {
        try (ZipInputStream zis = new ZipInputStream(
                Files.newInputStream(document, GoogleDriveOpenOption.EXPORT_WITH_GDOCS_DOCX))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.getName().equals("word/document.xml")) {
                    // the scanner must not be closed: that would close zis in the middle of the archive.
                    // decode explicitly instead of using the platform default charset
                    // NOTE(review): assumes the exported document.xml is UTF-8 -- confirm
                    @SuppressWarnings("resource")
                    Scanner scanner = new Scanner(zis, StandardCharsets.UTF_8.name());
                    while (scanner.hasNextLine()) {
                        String line = scanner.nextLine();
                        // a closing paragraph tag becomes a line break, all other tags are dropped
                        line = line.replaceAll("</w:p>", "\n");
                        line = line.replaceAll("<[^>]{1,}>", "");
                        line = line.replaceAll("[^[\\p{Print}]\\n]{1,}", ""); // TODO check
                        System.out.println(line);
                    }
                }
                zis.closeEntry();
            }
        }
    }

    /** Sleeps between remote calls; an interrupt ends the pause and is kept as the thread's interrupt status. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

0 comments on commit c4d8029

Please sign in to comment.