Deduplicate files by MD5 hash: store each file's MD5 in filelist and GROUP BY md5 in the API queries

This commit is contained in:
Niklas 2018-10-12 00:39:39 +02:00
parent d179895318
commit 6d22a5f579
3 changed files with 12 additions and 4 deletions

View File

@ -1,5 +1,7 @@
package de.gurkengewuerz.ripmewrapper;
import org.apache.commons.codec.digest.DigestUtils;
import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Files;
@ -44,13 +46,19 @@ public class ImageCrawler extends TimerTask {
}).map(String::valueOf)
.collect(Collectors.toList());
PreparedStatement ps = connection.prepareStatement("INSERT INTO filelist VALUES (NULL, ?, ?)");
PreparedStatement ps = connection.prepareStatement("INSERT INTO filelist VALUES (NULL, ?, ?, ?)");
HashSet<String> foundIds = new HashSet<>();
for (String s : pathList) {
String id = s.substring(s.lastIndexOf(File.separator) + 1).split("-")[0];
if (!findIds.contains(id)) continue;
FileInputStream fis = new FileInputStream(new File(s));
String md5 = DigestUtils.md5Hex(fis);
fis.close();
ps.setString(1, id);
ps.setString(2, s);
ps.setString(3, md5);
ps.executeUpdate();
foundIds.add(id);
}

View File

@ -30,7 +30,7 @@ public class Webserver {
statement.setQueryTimeout(30); // set timeout to 30 sec.
statement.executeUpdate("CREATE TABLE IF NOT EXISTS metalist (id string, subreddit string, created integer, title string, file string)");
statement.executeUpdate("CREATE TABLE IF NOT EXISTS filelist (iid INTEGER PRIMARY KEY AUTOINCREMENT, id string, path string)");
statement.executeUpdate("CREATE TABLE IF NOT EXISTS filelist (iid INTEGER PRIMARY KEY AUTOINCREMENT, id string, path string, md5 string)");
connection.close();

View File

@ -61,7 +61,7 @@ public class APIHandler extends AbstractHandler {
if (offset != -1) {
PreparedStatement psPre = connection.prepareStatement(
"SELECT iid FROM filelist LEFT JOIN metalist ON filelist.id = metalist.id WHERE subreddit IN (" + builderString + ") ORDER BY created DESC, iid DESC"
"SELECT iid FROM filelist LEFT JOIN metalist ON filelist.id = metalist.id WHERE subreddit IN (" + builderString + ") GROUP BY md5 ORDER BY created DESC, iid DESC"
);
for (String o : subreddits) {
@ -82,7 +82,7 @@ public class APIHandler extends AbstractHandler {
// ----------------
PreparedStatement ps = connection.prepareStatement(
"SELECT iid, metalist.id, subreddit, created, path FROM filelist LEFT JOIN metalist ON filelist.id = metalist.id WHERE subreddit IN (" + builderString + ") ORDER BY created DESC, iid DESC LIMIT 10 OFFSET ?"
"SELECT iid, metalist.id, subreddit, created, path FROM filelist LEFT JOIN metalist ON filelist.id = metalist.id WHERE subreddit IN (" + builderString + ") GROUP BY md5 ORDER BY created DESC, iid DESC LIMIT 10 OFFSET ?"
);
index = 1;