diff --git a/src/main/c/Makefile.am b/src/main/c/Makefile.am
index eead86d..ed561ac 100644
--- a/src/main/c/Makefile.am
+++ b/src/main/c/Makefile.am
@@ -1,4 +1,4 @@
-lib_LTLIBRARIES = libjpostal_parser.la libjpostal_expander.la
+lib_LTLIBRARIES = libjpostal_parser.la libjpostal_expander.la libjpostal_dedupe.la
 
 libjpostal_expander_la_SOURCES = jpostal_AddressExpander.c
 libjpostal_expander_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
@@ -7,3 +7,7 @@ libjpostal_expander_la_LIBADD = $(LIBPOSTAL_LIBS)
 libjpostal_parser_la_SOURCES = jpostal_AddressParser.c
 libjpostal_parser_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
 libjpostal_parser_la_LIBADD = $(LIBPOSTAL_LIBS)
+
+libjpostal_dedupe_la_SOURCES = jpostal_Dedupe.c
+libjpostal_dedupe_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
+libjpostal_dedupe_la_LIBADD = $(LIBPOSTAL_LIBS)
diff --git a/src/main/c/jpostal_Dedupe.c b/src/main/c/jpostal_Dedupe.c
new file mode 100644
index 0000000..34175eb
--- /dev/null
+++ b/src/main/c/jpostal_Dedupe.c
@@ -0,0 +1,332 @@
+/*
+ * JNI bindings for libpostal duplicate detection, backing the native methods of
+ * com.mapzen.jpostal.Dedupe and com.mapzen.jpostal.DuplicateOptions.Builder.
+ */
+#include <jni.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libpostal/libpostal.h>
+
+/* Raises a Java exception of class_name. If the class cannot be found, FindClass
+ * has already left its own exception pending, so nothing more is thrown. */
+static void throw_new(JNIEnv *env, const char *class_name, const char *message) {
+    jclass cls = (*env)->FindClass(env, class_name);
+    if (cls != NULL) {
+        (*env)->ThrowNew(env, cls, message);
+    }
+}
+
+/* Frees `count` heap strings and the array holding them; NULL-safe. */
+static void free_strings(char **strings, size_t count) {
+    size_t i;
+    if (strings == NULL) return;
+    for (i = 0; i < count; i++) {
+        free(strings[i]);
+    }
+    free(strings);
+}
+
+/*
+ * Copies a Java String[] into a heap array of heap-allocated C strings.
+ * A NULL Java array yields *out == NULL and *out_count == 0.
+ * Returns 1 on success, or 0 with a Java exception pending. On success the
+ * caller owns *out and must release it with free_strings().
+ */
+static int copy_string_array(JNIEnv *env, jobjectArray jStrings, char ***out, size_t *out_count) {
+    jsize n, i;
+    char **strings;
+
+    *out = NULL;
+    *out_count = 0;
+    if (jStrings == NULL) return 1;
+
+    n = (*env)->GetArrayLength(env, jStrings);
+    /* calloc keeps unfilled slots NULL so a partial copy can be freed safely;
+     * at least one slot is requested so an empty Java array is still non-NULL. */
+    strings = calloc(n > 0 ? (size_t)n : 1, sizeof(char *));
+    if (strings == NULL) {
+        throw_new(env, "java/lang/OutOfMemoryError", "Unable to allocate string array");
+        return 0;
+    }
+
+    for (i = 0; i < n; i++) {
+        jstring jString = (jstring)(*env)->GetObjectArrayElement(env, jStrings, i);
+        const char *chars;
+
+        if (jString == NULL) {
+            throw_new(env, "java/lang/NullPointerException", "String array must not contain null elements");
+            free_strings(strings, (size_t)n);
+            return 0;
+        }
+        chars = (*env)->GetStringUTFChars(env, jString, NULL);
+        if (chars != NULL) {
+            strings[i] = strdup(chars);
+            (*env)->ReleaseStringUTFChars(env, jString, chars);
+        }
+        (*env)->DeleteLocalRef(env, jString);
+        if (strings[i] == NULL) {
+            /* A failed GetStringUTFChars leaves an exception pending; a failed strdup does not. */
+            if (!(*env)->ExceptionCheck(env)) {
+                throw_new(env, "java/lang/OutOfMemoryError", "Unable to copy string");
+            }
+            free_strings(strings, (size_t)n);
+            return 0;
+        }
+    }
+
+    *out = strings;
+    *out_count = (size_t)n;
+    return 1;
+}
+
+/*
+ * Reads the `languages` String[] field of a Java options object.
+ * A NULL options object or a null field yields no languages, which leaves
+ * libpostal's default options untouched.
+ * Returns 1 on success, or 0 with a Java exception pending.
+ */
+static int read_languages(JNIEnv *env, jobject jOptions, char ***out, size_t *out_count) {
+    jclass optionsCls;
+    jfieldID fid;
+
+    *out = NULL;
+    *out_count = 0;
+    if (jOptions == NULL) return 1;
+
+    optionsCls = (*env)->GetObjectClass(env, jOptions);
+    fid = (*env)->GetFieldID(env, optionsCls, "languages", "[Ljava/lang/String;");
+    if (fid == NULL) return 0; /* GetFieldID has already raised NoSuchFieldError */
+
+    return copy_string_array(env, (jobjectArray)(*env)->GetObjectField(env, jOptions, fid), out, out_count);
+}
+
+/* Dedupe.setup(): calls libpostal_setup() and libpostal_setup_language_classifier(). */
+JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_setup
+  (JNIEnv *env, jclass cls) {
+    if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
+        throw_new(env, "java/lang/RuntimeException", "Error loading libpostal dedupe modules\n");
+    }
+}
+
+/* Dedupe.setupDataDir(String): as setup(), but passes the given data directory. */
+JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_setupDataDir
+  (JNIEnv *env, jclass cls, jstring jDataDir) {
+    const char *datadir;
+    int loaded;
+
+    if (jDataDir == NULL) {
+        throw_new(env, "java/lang/IllegalArgumentException", "dataDir must not be null");
+        return;
+    }
+    datadir = (*env)->GetStringUTFChars(env, jDataDir, NULL);
+    if (datadir == NULL) return; /* exception already pending */
+
+    loaded = libpostal_setup_datadir((char *)datadir)
+        && libpostal_setup_language_classifier_datadir((char *)datadir);
+    /* Release on both outcomes; previously the UTF chars were never released. */
+    (*env)->ReleaseStringUTFChars(env, jDataDir, datadir);
+
+    if (!loaded) {
+        throw_new(env, "java/lang/IllegalArgumentException", "Error loading libpostal dedupe modules\n");
+    }
+}
+
+typedef libpostal_duplicate_status_t (*duplicate_function)(char *, char *, libpostal_duplicate_options_t);
+
+/*
+ * Shared implementation of the is*Duplicate natives: converts both values and the
+ * options' languages to C strings, calls func, and returns its status as a jint.
+ * On any failure a Java exception is pending and LIBPOSTAL_NULL_DUPLICATE_STATUS
+ * is returned. Every resource acquired here is released before returning.
+ */
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isDuplicate(JNIEnv *env, jobject thisObj,
+    jstring jValue1, jstring jValue2, jobject jOptions, duplicate_function func) {
+    jint result = (jint)LIBPOSTAL_NULL_DUPLICATE_STATUS;
+    const char *value1 = NULL;
+    const char *value2 = NULL;
+    char **languages = NULL;
+    size_t num_languages = 0;
+    libpostal_duplicate_options_t options = libpostal_get_default_duplicate_options();
+
+    if (jValue1 == NULL || jValue2 == NULL) {
+        throw_new(env, "java/lang/NullPointerException", "Values to compare must not be null");
+        return result;
+    }
+
+    value1 = (*env)->GetStringUTFChars(env, jValue1, NULL);
+    if (value1 == NULL) goto cleanup;
+    value2 = (*env)->GetStringUTFChars(env, jValue2, NULL);
+    if (value2 == NULL) goto cleanup;
+    if (!read_languages(env, jOptions, &languages, &num_languages)) goto cleanup;
+
+    if (languages != NULL) {
+        options.languages = languages;
+        options.num_languages = num_languages;
+    }
+
+    result = (jint)func((char *)value1, (char *)value2, options);
+
+cleanup:
+    free_strings(languages, num_languages);
+    if (value2 != NULL) (*env)->ReleaseStringUTFChars(env, jValue2, value2);
+    if (value1 != NULL) (*env)->ReleaseStringUTFChars(env, jValue1, value1);
+    return result;
+}
+
+/* Thin natives: each binds one libpostal comparison to the shared implementation above. */
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isNameDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jName1, jstring jName2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jName1, jName2, jOptions, libpostal_is_name_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isStreetDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jStreet1, jstring jStreet2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jStreet1, jStreet2, jOptions, libpostal_is_street_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isHouseNumberDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jHouseNumber1, jstring jHouseNumber2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jHouseNumber1, jHouseNumber2, jOptions, libpostal_is_house_number_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isPOBoxDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jPOBox1, jstring jPOBox2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jPOBox1, jPOBox2, jOptions, libpostal_is_po_box_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isUnitDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jUnit1, jstring jUnit2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jUnit1, jUnit2, jOptions, libpostal_is_unit_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isFloorDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jFloor1, jstring jFloor2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jFloor1, jFloor2, jOptions, libpostal_is_floor_duplicate);
+}
+
+JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isPostalCodeDuplicate
+  (JNIEnv *env, jobject thisObj, jstring jPostalCode1, jstring jPostalCode2, jobject jOptions) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicate(
+        env, thisObj, jPostalCode1, jPostalCode2, jOptions, libpostal_is_postal_code_duplicate);
+}
+
+/* Dedupe.teardown(): calls libpostal_teardown_language_classifier() only; libpostal_teardown() is not called here. */
+JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_teardown
+  (JNIEnv *env, jclass cls) {
+    libpostal_teardown_language_classifier();
+}
+
+/* DuplicateOptions.Builder.setDefaultOptions(): resets the builder's languages field to null. */
+JNIEXPORT void JNICALL Java_com_mapzen_jpostal_DuplicateOptions_00024Builder_setDefaultOptions
+  (JNIEnv *env, jobject builder) {
+    jclass cls = (*env)->GetObjectClass(env, builder);
+    jfieldID fid = (*env)->GetFieldID(env, cls, "languages", "[Ljava/lang/String;");
+    if (fid == NULL) {
+        return;
+    }
+
+    (*env)->SetObjectField(env, builder, fid, NULL);
+}
+
+/* Throws java.lang.Error with the given message. Returns the ThrowNew result, or
+ * JNI_ERR when the class lookup failed (its exception is then already pending). */
+jint throwError( JNIEnv *env, char *message )
+{
+    jclass exClass = (*env)->FindClass(env, "java/lang/Error");
+    if (exClass == NULL) {
+        return JNI_ERR;
+    }
+
+    return (*env)->ThrowNew(env, exClass, message);
+}
+
+typedef libpostal_fuzzy_duplicate_status_t (*fuzzy_duplicate_function)(size_t, char **, double *, size_t, char **, double *, libpostal_fuzzy_duplicate_options_t);
+
+/*
+ * Shared implementation of the is*DuplicateFuzzy natives: copies both token
+ * arrays and the options' languages to C strings, calls func with the matching
+ * score arrays, and returns the similarity it reports.
+ * On any failure a Java exception is pending and 0.0 is returned. Every
+ * resource acquired here is released before returning.
+ */
+JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(
+    JNIEnv *env, jobject thisObj,
+    jobjectArray jTokens1, jdoubleArray jScores1,
+    jobjectArray jTokens2, jdoubleArray jScores2,
+    jobject jOptions, fuzzy_duplicate_function func
+) {
+    jdouble similarity = 0.0;
+    char **tokens1 = NULL;
+    char **tokens2 = NULL;
+    char **languages = NULL;
+    size_t num_tokens1 = 0;
+    size_t num_tokens2 = 0;
+    size_t num_languages = 0;
+    jdouble *scores1 = NULL;
+    jdouble *scores2 = NULL;
+    libpostal_fuzzy_duplicate_options_t options = libpostal_get_default_fuzzy_duplicate_options();
+    libpostal_fuzzy_duplicate_status_t status;
+
+    if (jTokens1 == NULL || jScores1 == NULL || jTokens2 == NULL || jScores2 == NULL) {
+        throw_new(env, "java/lang/NullPointerException", "Tokens and scores must not be null");
+        return similarity;
+    }
+    /* func gets no separate score count, only the token count, so a shorter scores array could be overrun. */
+    if ((*env)->GetArrayLength(env, jScores1) < (*env)->GetArrayLength(env, jTokens1) ||
+        (*env)->GetArrayLength(env, jScores2) < (*env)->GetArrayLength(env, jTokens2)) {
+        throw_new(env, "java/lang/IllegalArgumentException", "Each scores array needs one score per token");
+        return similarity;
+    }
+
+    if (!copy_string_array(env, jTokens1, &tokens1, &num_tokens1)) goto cleanup;
+    if (!copy_string_array(env, jTokens2, &tokens2, &num_tokens2)) goto cleanup;
+    if (!read_languages(env, jOptions, &languages, &num_languages)) goto cleanup;
+
+    scores1 = (*env)->GetDoubleArrayElements(env, jScores1, NULL);
+    if (scores1 == NULL) goto cleanup;
+    scores2 = (*env)->GetDoubleArrayElements(env, jScores2, NULL);
+    if (scores2 == NULL) goto cleanup;
+
+    if (languages != NULL) {
+        options.languages = languages;
+        options.num_languages = num_languages;
+    }
+
+    status = func(num_tokens1, tokens1, scores1, num_tokens2, tokens2, scores2, options);
+    similarity = (jdouble)status.similarity;
+
+cleanup:
+    /* JNI_ABORT: the scores are inputs only, so nothing is copied back to the Java arrays. */
+    if (scores2 != NULL) (*env)->ReleaseDoubleArrayElements(env, jScores2, scores2, JNI_ABORT);
+    if (scores1 != NULL) (*env)->ReleaseDoubleArrayElements(env, jScores1, scores1, JNI_ABORT);
+    free_strings(languages, num_languages);
+    free_strings(tokens2, num_tokens2);
+    free_strings(tokens1, num_tokens1);
+    return similarity;
+}
+
+JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isNameDuplicateFuzzy(
+    JNIEnv *env, jobject thisObj,
+    jobjectArray jTokens1, jdoubleArray jScores1,
+    jobjectArray jTokens2, jdoubleArray jScores2,
+    jobject jOptions
+) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(env, thisObj,
+        jTokens1, jScores1, jTokens2, jScores2, jOptions, libpostal_is_name_duplicate_fuzzy);
+}
+
+JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isStreetDuplicateFuzzy(
+    JNIEnv *env, jobject thisObj,
+    jobjectArray jTokens1, jdoubleArray jScores1,
+    jobjectArray jTokens2, jdoubleArray jScores2,
+    jobject jOptions
+) {
+    return Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(env, thisObj,
+        jTokens1, jScores1, jTokens2, jScores2, jOptions, libpostal_is_street_duplicate_fuzzy);
+}
diff --git a/src/main/java/com/mapzen/jpostal/Dedupe.java b/src/main/java/com/mapzen/jpostal/Dedupe.java
new file mode 100644
index 0000000..859d4c7
--- /dev/null
+++ b/src/main/java/com/mapzen/jpostal/Dedupe.java
@@ -0,0 +1,155 @@
+package com.mapzen.jpostal;
+
+/**
+ * Java entry point for libpostal's duplicate detection. Obtain the shared instance
+ * via {@link #getInstance()} or {@link #getInstanceDataDir(String)}.
+ */
+public class Dedupe {
+    static {
+        System.loadLibrary("jpostal_dedupe"); // Load native library at runtime
+    }
+
+    private volatile static Dedupe instance = null;
+
+    /**
+     * Returns the shared instance, creating it on first use.
+     *
+     * @param dataDir libpostal data directory, or {@code null} for the default location;
+     *                ignored once the instance exists
+     */
+    public static Dedupe getInstanceDataDir(String dataDir) {
+        if (instance == null) {
+            synchronized(Dedupe.class) {
+                if (instance == null) {
+                    instance = new Dedupe(dataDir);
+                }
+            }
+        }
+        return instance;
+    }
+
+    /** Returns the shared instance, creating it with the default data location if needed. */
+    public static Dedupe getInstance() {
+        return getInstanceDataDir(null);
+    }
+
+    static native synchronized void setup();
+    static native synchronized void setupDataDir(String dataDir);
+
+    private static native synchronized int isStreetDuplicate(String street1, String street2, DuplicateOptions options);
+    private static native synchronized int isNameDuplicate(String name1, String name2, DuplicateOptions options);
+    private static native synchronized int isHouseNumberDuplicate(String name1, String name2, DuplicateOptions options);
+    private static native synchronized int isPOBoxDuplicate(String poBox1, String poBox2, DuplicateOptions options);
+    private static native synchronized int isUnitDuplicate(String unit1, String unit2, DuplicateOptions options);
+    private static native synchronized int isFloorDuplicate(String floor1, String floor2, DuplicateOptions options);
+    private static native synchronized int isPostalCodeDuplicate(String postalCode1, String postalCode2, DuplicateOptions options);
+    private static native synchronized double isNameDuplicateFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options);
+    private static native synchronized double isStreetDuplicateFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options);
+
+    static native synchronized void teardown();
+
+    /** Compares two street names using default options. */
+    public DuplicateStatus isStreetDupe(String street1, String street2) {
+        return isStreetDupeWithOptions(street1, street2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two street names using the given options. */
+    public DuplicateStatus isStreetDupeWithOptions(String street1, String street2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isStreetDuplicate(street1, street2, options));
+    }
+
+    /** Compares two names using default options. */
+    public DuplicateStatus isNameDupe(String name1, String name2) {
+        return isNameDupeWithOptions(name1, name2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two names using the given options. */
+    public DuplicateStatus isNameDupeWithOptions(String name1, String name2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isNameDuplicate(name1, name2, options));
+    }
+
+    /** Compares two house numbers using default options. */
+    public DuplicateStatus isHouseNumberDupe(String houseNumber1, String houseNumber2) {
+        return isHouseNumberDupeWithOptions(houseNumber1, houseNumber2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two house numbers using the given options. */
+    public DuplicateStatus isHouseNumberDupeWithOptions(String houseNumber1, String houseNumber2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isHouseNumberDuplicate(houseNumber1, houseNumber2, options));
+    }
+
+    /** Compares two PO boxes using default options. */
+    public DuplicateStatus isPOBoxDupe(String poBox1, String poBox2) {
+        return isPOBoxDupeWithOptions(poBox1, poBox2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two PO boxes using the given options. */
+    public DuplicateStatus isPOBoxDupeWithOptions(String poBox1, String poBox2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isPOBoxDuplicate(poBox1, poBox2, options));
+    }
+
+    /** Compares two units using default options. */
+    public DuplicateStatus isUnitDupe(String unit1, String unit2) {
+        return isUnitDupeWithOptions(unit1, unit2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two units using the given options. */
+    public DuplicateStatus isUnitDupeWithOptions(String unit1, String unit2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isUnitDuplicate(unit1, unit2, options));
+    }
+
+    /** Compares two floors using default options. */
+    public DuplicateStatus isFloorDupe(String floor1, String floor2) {
+        return isFloorDupeWithOptions(floor1, floor2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two floors using the given options. */
+    public DuplicateStatus isFloorDupeWithOptions(String floor1, String floor2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isFloorDuplicate(floor1, floor2, options));
+    }
+
+    /** Compares two postal codes using default options. */
+    public DuplicateStatus isPostalCodeDupe(String postalCode1, String postalCode2) {
+        return isPostalCodeDupeWithOptions(postalCode1, postalCode2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Compares two postal codes using the given options. */
+    public DuplicateStatus isPostalCodeDupeWithOptions(String postalCode1, String postalCode2, DuplicateOptions options) {
+        return DuplicateStatus.fromInt(isPostalCodeDuplicate(postalCode1, postalCode2, options));
+    }
+
+    /** Returns the fuzzy similarity of two scored name token lists using default options. */
+    public double isNameDupeFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2) {
+        return isNameDupeFuzzyWithOptions(tokens1, scores1, tokens2, scores2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Returns the fuzzy similarity of two scored name token lists using the given options. */
+    public double isNameDupeFuzzyWithOptions(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options) {
+        return isNameDuplicateFuzzy(tokens1, scores1, tokens2, scores2, options);
+    }
+
+    /** Returns the fuzzy similarity of two scored street token lists using default options. */
+    public double isStreetDupeFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2) {
+        return isStreetDupeFuzzyWithOptions(tokens1, scores1, tokens2, scores2, new DuplicateOptions.Builder().build());
+    }
+
+    /** Returns the fuzzy similarity of two scored street token lists using the given options. */
+    public double isStreetDupeFuzzyWithOptions(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options) {
+        return isStreetDuplicateFuzzy(tokens1, scores1, tokens2, scores2, options);
+    }
+
+    /** Runs native setup with {@code dataDir}, or the default setup when it is {@code null}. */
+    protected Dedupe(String dataDir) {
+        if (dataDir == null) {
+            setup();
+        } else {
+            setupDataDir(dataDir);
+        }
+    }
+
+    /** Calls the native {@code teardown()} when this instance is finalized. */
+    @Override
+    protected void finalize() {
+        teardown();
+    }
+}
diff --git a/src/main/java/com/mapzen/jpostal/DuplicateOptions.java b/src/main/java/com/mapzen/jpostal/DuplicateOptions.java
new file mode 100644
index 0000000..d9bb642
--- /dev/null
+++ b/src/main/java/com/mapzen/jpostal/DuplicateOptions.java
@@ -0,0 +1,47 @@
+package com.mapzen.jpostal;
+
+/** Options for {@link Dedupe} comparisons; currently only the languages to use. */
+public class DuplicateOptions {
+
+    private String[] languages;
+
+    /** Returns the configured languages, or {@code null} when none were set. */
+    public String[] getLanguages() {
+        return languages;
+    }
+
+    public void setLanguages(String[] languages) {
+        this.languages = languages;
+    }
+
+    /** Builder for {@link DuplicateOptions}; the native default leaves languages {@code null}. */
+    public static class Builder {
+        static {
+            System.loadLibrary("jpostal_dedupe"); // Load native library at runtime
+        }
+
+        private String[] languages;
+
+        private native synchronized void setDefaultOptions();
+
+        public Builder() {
+            setDefaultOptions();
+        }
+
+        /** Sets the languages to use; {@code null} keeps libpostal's default options. */
+        public Builder languages(String[] languages) {
+            this.languages = languages;
+            return this;
+        }
+
+        public DuplicateOptions build() {
+            return new DuplicateOptions(this);
+        }
+
+    }
+
+    private DuplicateOptions(Builder builder) {
+        languages = builder.languages;
+    }
+
+}
diff --git a/src/main/java/com/mapzen/jpostal/DuplicateStatus.java b/src/main/java/com/mapzen/jpostal/DuplicateStatus.java
new file mode 100644
index 0000000..03008c1
--- /dev/null
+++ b/src/main/java/com/mapzen/jpostal/DuplicateStatus.java
@@ -0,0 +1,27 @@
+package com.mapzen.jpostal;
+
+/** Result of an exact duplicate check; {@link #intVal} is the integer code returned by the native layer. */
+public enum DuplicateStatus {
+    LIBPOSTAL_NULL_DUPLICATE_STATUS(-1),
+    LIBPOSTAL_NON_DUPLICATE(0),
+    LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW(3),
+    LIBPOSTAL_LIKELY_DUPLICATE(6),
+    LIBPOSTAL_EXACT_DUPLICATE(9);
+
+    public final int intVal;
+
+    DuplicateStatus(int intVal) {
+        this.intVal = intVal;
+    }
+
+    /** Returns the constant whose {@link #intVal} equals {@code i}, or {@code null} if there is none. */
+    public static DuplicateStatus fromInt(int i) {
+        for (DuplicateStatus s: values()) {
+            if (s.intVal == i) {
+                return s;
+            }
+        }
+
+        return null;
+    }
+}