Skip to content

Commit

Permalink
Changed PDF filenames with MD5 hash
Browse files Browse the repository at this point in the history
  • Loading branch information
yusniel authored and yusniel committed Jan 30, 2016
1 parent 562e552 commit 3bff7c2
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 19 deletions.
4 changes: 2 additions & 2 deletions sdlcrawler/config/urls
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
http://cagricola.uclv.edu.cu/index.php/es/
#http://ojs.uo.edu.cu/
http://coodes.upr.edu.cu/index.php/coodes
http://cfores.upr.edu.cu/index.php/cfores
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import org.apache.http.Header;
import org.apache.http.message.BasicHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down
32 changes: 16 additions & 16 deletions sdlcrawler/src/main/java/cu/uci/sdlcrawler/PdfCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,14 @@

import java.io.File;
import java.io.IOException;
import java.util.UUID;
import java.util.regex.Pattern;
import org.apache.commons.codec.digest.DigestUtils;

/**
* @author Yasser Ganjisaffar [lastname at gmail dot com]
* @author Yusniel Hidalgo Delgado [yhidalgo86 at gmail dot com]
*/

/*
* This class shows how you can crawl PDF documents on the web and store them
* in a folder. This is just for demonstration purposes and doesn't scale for
* a large number of documents. For crawling millions of documents you would
* need to store the downloaded files in a hierarchy of folders.
*/
public class PdfCrawler extends WebCrawler {

private static final Pattern filters = Pattern.compile(".*(\\.(css|js|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v"
Expand Down Expand Up @@ -81,22 +75,28 @@ public void visit(Page page) {
if (page.getStatusCode() == 200) {
System.out.println(page.getContentType() + " -- " + page.getStatusCode() + " -- " + url);
}

if (!(page.getContentType().equals("application/pdf")) || !(page.getParseData() instanceof BinaryParseData)) {
return;
}

String extension = ".pdf";
String hashedName = UUID.randomUUID().toString() + extension;
File folder=new File(storageFolder.getAbsolutePath()+url.substring(6, url.indexOf(".")));
if (!folder.exists()) {
folder.mkdirs();
File folder;
if (url.startsWith("http")) {
folder = new File(storageFolder.getAbsolutePath() + url.substring(6, url.indexOf(".")));
if (!folder.exists()) {
folder.mkdirs();
}
} else {
folder = new File(storageFolder.getAbsolutePath() + url.substring(7, url.indexOf(".")));
if (!folder.exists()) {
folder.mkdirs();
}
}

String filename = folder+"/" + hashedName;
byte[] content = page.getContentData();
String md5 = DigestUtils.md5Hex(content);
String filename = folder + "/" + md5 + extension;
try {
System.out.println(filename);
Files.write(page.getContentData(), new File(filename));
Files.write(content, new File(filename));
} catch (IOException iox) {
System.out.println(iox.getMessage());
}
Expand Down

0 comments on commit 3bff7c2

Please sign in to comment.