Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dedupe bindings #32

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/main/c/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
lib_LTLIBRARIES = libjpostal_parser.la libjpostal_expander.la
lib_LTLIBRARIES = libjpostal_parser.la libjpostal_expander.la libjpostal_dedupe.la

libjpostal_expander_la_SOURCES = jpostal_AddressExpander.c
libjpostal_expander_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
Expand All @@ -7,3 +7,7 @@ libjpostal_expander_la_LIBADD = $(LIBPOSTAL_LIBS)
libjpostal_parser_la_SOURCES = jpostal_AddressParser.c
libjpostal_parser_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
libjpostal_parser_la_LIBADD = $(LIBPOSTAL_LIBS)

# Dedupe JNI library: built from jpostal_Dedupe.c using the libpostal compile flags and link libraries.
libjpostal_dedupe_la_SOURCES = jpostal_Dedupe.c
libjpostal_dedupe_la_CFLAGS = $(LIBPOSTAL_CFLAGS)
libjpostal_dedupe_la_LIBADD = $(LIBPOSTAL_LIBS)
260 changes: 260 additions & 0 deletions src/main/c/jpostal_Dedupe.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
#include <stdlib.h>
#include <string.h>

#include <jni.h>
#include <libpostal/libpostal.h>

/*
 * Native implementation of Dedupe.setup().
 *
 * Calls libpostal_setup() and then libpostal_setup_language_classifier().
 * If either call reports failure, a java.lang.RuntimeException is raised in
 * the calling Java thread; if that exception class cannot be found the
 * function returns without throwing it.
 */
JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_setup
(JNIEnv *env, jclass cls) {
    /* && short-circuits, so the classifier is only loaded after the base setup succeeded. */
    if (libpostal_setup() && libpostal_setup_language_classifier()) {
        return;
    }

    jclass runtime_exception = (*env)->FindClass(env, "java/lang/RuntimeException");
    if (runtime_exception != NULL) {
        (*env)->ThrowNew(env, runtime_exception, "Error loading libpostal dedupe modules\n");
    }
}

/*
 * Native implementation of Dedupe.setupDataDir(String).
 *
 * Converts jDataDir to a C string and passes it to libpostal_setup_datadir()
 * and libpostal_setup_language_classifier_datadir(). The C string is released
 * again before returning on every path.
 *
 * Raises java.lang.IllegalArgumentException when jDataDir is null or when
 * either libpostal call reports failure. If the JVM cannot provide the C
 * string, the exception it raised is left pending and nothing else is thrown.
 */
JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_setupDataDir
(JNIEnv *env, jclass cls, jstring jDataDir) {
    int loaded = 0;

    if (jDataDir != NULL) {
        const char *datadir = (*env)->GetStringUTFChars(env, jDataDir, NULL);
        if (datadir == NULL) {
            /* GetStringUTFChars failed and has already raised an exception. */
            return;
        }

        /* The libpostal setup functions take a non-const char *, hence the casts. */
        loaded = libpostal_setup_datadir((char *)datadir)
            && libpostal_setup_language_classifier_datadir((char *)datadir);

        (*env)->ReleaseStringUTFChars(env, jDataDir, datadir);
    }

    if (!loaded) {
        jclass exceptionClass = (*env)->FindClass(env, "java/lang/IllegalArgumentException");
        if (exceptionClass == NULL) {
            return;
        }
        (*env)->ThrowNew(env, exceptionClass, "Error loading libpostal dedupe modules\n");
    }
}

typedef libpostal_duplicate_status_t (*duplicate_function)(char *, char *, libpostal_duplicate_options_t);

JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isDuplicate(JNIEnv *env, jobject thisObj,
jstring jValue1, jstring jValue2, jobject jOptions, duplicate_function func) {

const char *value1 = (*env)->GetStringUTFChars(env, jValue1, 0);

const char *value2 = (*env)->GetStringUTFChars(env, jValue2, 0);

libpostal_duplicate_options_t options = libpostal_get_default_duplicate_options();

jfieldID fid;

jclass optionsCls = (*env)->GetObjectClass(env, jOptions);

fid = (*env)->GetFieldID(env, optionsCls, "languages", "[Ljava/lang/String;");
if (fid == 0) {
return NULL;
}

jobject jLanguages = (*env)->GetObjectField(env, jOptions, fid);

size_t num_languages = 0;
char **languages = NULL;
int i;

if (jLanguages != NULL) {
jsize jNumLanguages = (*env)->GetArrayLength(env, jLanguages);

languages = malloc(sizeof(char *) * jNumLanguages);
jboolean is_copy = JNI_FALSE;

num_languages = (size_t)jNumLanguages;

for (i = 0; i < jNumLanguages; i++) {
jstring jLanguage = (*env)->GetObjectArrayElement(env, jLanguages, i);

const char *lang = (*env)->GetStringUTFChars(env, jLanguage, &is_copy);

char *language = strdup(lang);
languages[i] = language;

(*env)->ReleaseStringUTFChars(env, jLanguage, lang);
}
options.languages = languages;
options.num_languages = num_languages;
}

libpostal_duplicate_status_t response = func((char *)value1, (char *)value2, options);

(*env)->ReleaseStringUTFChars(env, jValue1, value1);

(*env)->ReleaseStringUTFChars(env, jValue2, value2);

return (jint) response;
}

/*
 * JNI entry point for Dedupe.isNameDuplicate.
 * Delegates to the shared isDuplicate routine with libpostal_is_name_duplicate
 * as the comparison function and returns its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isNameDuplicate
(JNIEnv *env, jobject thisObj, jstring jName1, jstring jName2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jName1, jName2, jOptions, libpostal_is_name_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isStreetDuplicate.
 * Delegates to the shared isDuplicate routine with libpostal_is_street_duplicate
 * as the comparison function and returns its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isStreetDuplicate
(JNIEnv *env, jobject thisObj, jstring jStreet1, jstring jStreet2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jStreet1, jStreet2, jOptions, libpostal_is_street_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isHouseNumberDuplicate.
 * Delegates to the shared isDuplicate routine with
 * libpostal_is_house_number_duplicate as the comparison function and returns
 * its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isHouseNumberDuplicate
(JNIEnv *env, jobject thisObj, jstring jHouseNumber1, jstring jHouseNumber2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jHouseNumber1, jHouseNumber2, jOptions, libpostal_is_house_number_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isPOBoxDuplicate.
 * Delegates to the shared isDuplicate routine with libpostal_is_po_box_duplicate
 * as the comparison function and returns its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isPOBoxDuplicate
(JNIEnv *env, jobject thisObj, jstring jPOBox1, jstring jPOBox2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jPOBox1, jPOBox2, jOptions, libpostal_is_po_box_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isUnitDuplicate.
 * Delegates to the shared isDuplicate routine with libpostal_is_unit_duplicate
 * as the comparison function and returns its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isUnitDuplicate
(JNIEnv *env, jobject thisObj, jstring jUnit1, jstring jUnit2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jUnit1, jUnit2, jOptions, libpostal_is_unit_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isFloorDuplicate.
 * Delegates to the shared isDuplicate routine with libpostal_is_floor_duplicate
 * as the comparison function and returns its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isFloorDuplicate
(JNIEnv *env, jobject thisObj, jstring jFloor1, jstring jFloor2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jFloor1, jFloor2, jOptions, libpostal_is_floor_duplicate);

    return status;
}

/*
 * JNI entry point for Dedupe.isPostalCodeDuplicate.
 * Delegates to the shared isDuplicate routine with
 * libpostal_is_postal_code_duplicate as the comparison function and returns
 * its jint result unchanged.
 */
JNIEXPORT jint JNICALL Java_com_mapzen_jpostal_Dedupe_isPostalCodeDuplicate
(JNIEnv *env, jobject thisObj, jstring jPostalCode1, jstring jPostalCode2, jobject jOptions) {
    jint status = Java_com_mapzen_jpostal_Dedupe_isDuplicate(
        env, thisObj, jPostalCode1, jPostalCode2, jOptions, libpostal_is_postal_code_duplicate);

    return status;
}

/*
 * Native implementation of Dedupe.teardown(): tears down the libpostal
 * language classifier.
 *
 * NOTE(review): Java_com_mapzen_jpostal_Dedupe_setup also calls
 * libpostal_setup(), but no matching base teardown is performed here -
 * confirm whether that is intentional.
 */
JNIEXPORT void JNICALL Java_com_mapzen_jpostal_Dedupe_teardown
(JNIEnv *env, jclass cls) {
    /* Neither JNI argument is needed for the teardown call. */
    (void)env;
    (void)cls;

    libpostal_teardown_language_classifier();
}

/*
 * Native implementation of DuplicateOptions.Builder.setDefaultOptions().
 *
 * Resets the builder's String[] field "languages" to null. If the field
 * cannot be looked up, GetFieldID has already raised an exception and the
 * function returns without touching the builder.
 */
JNIEXPORT void JNICALL Java_com_mapzen_jpostal_DuplicateOptions_00024Builder_setDefaultOptions
(JNIEnv *env, jobject builder) {
    jclass cls = (*env)->GetObjectClass(env, builder);

    jfieldID fid = (*env)->GetFieldID(env, cls, "languages", "[Ljava/lang/String;");
    if (fid == NULL) {
        return;
    }

    (*env)->SetObjectField(env, builder, fid, NULL);
}

/*
 * Raises a java.lang.Error carrying `message` in the calling Java thread.
 *
 * Returns the result of ThrowNew, or JNI_ERR when the java.lang.Error class
 * cannot be found. In the latter case FindClass has already left its own
 * exception pending, so no further exception is raised here.
 */
jint throwError( JNIEnv *env, char *message )
{
    jclass exClass = (*env)->FindClass( env, "java/lang/Error" );
    if (exClass == NULL) {
        return JNI_ERR;
    }

    return (*env)->ThrowNew( env, exClass, message );
}

typedef libpostal_fuzzy_duplicate_status_t (*fuzzy_duplicate_function)(size_t, char **, double *, size_t, char **, double *, libpostal_fuzzy_duplicate_options_t);

JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(
JNIEnv *env, jobject thisObj,
jobjectArray jTokens1, jdoubleArray jScores1,
jobjectArray jTokens2, jdoubleArray jScores2,
jobject jOptions, fuzzy_duplicate_function func
) {
jboolean is_copy = JNI_FALSE;

// Get arguments
int num_tokens1 = (*env)->GetArrayLength(env, jTokens1);
int num_tokens2 = (*env)->GetArrayLength(env, jTokens2);

double *scores1 = (*env)->GetDoubleArrayElements(env, jScores1, &is_copy);
double *scores2 = (*env)->GetDoubleArrayElements(env, jScores2, &is_copy);

const char *tokens1[num_tokens1];
const char *tokens2[num_tokens2];

for (int i = 0; i < num_tokens1; i++) {
jstring jToken = (jstring) (*env)->GetObjectArrayElement(env, jTokens1, i);
const char *token = (*env)->GetStringUTFChars(env, jToken, &is_copy);
tokens1[i] = strdup(token);
(*env)->ReleaseStringUTFChars(env, jToken, token);
}

for (int i = 0; i < num_tokens2; i++) {
jstring jToken = (jstring) (*env)->GetObjectArrayElement(env, jTokens2, i);
const char *token = (*env)->GetStringUTFChars(env, jToken, &is_copy);
tokens2[i] = strdup(token);
(*env)->ReleaseStringUTFChars(env, jToken, token);
}

// Build Options Argument
libpostal_fuzzy_duplicate_options_t options = libpostal_get_default_fuzzy_duplicate_options();

jfieldID fid;
jclass optionsCls = (*env)->GetObjectClass(env, jOptions);

fid = (*env)->GetFieldID(env, optionsCls, "languages", "[Ljava/lang/String;");
if (fid == 0) {
throwError(env, "options.languages cannot be null");
}

jobject jLanguages = (*env)->GetObjectField(env, jOptions, fid);

size_t num_languages = 0;
char **languages = NULL;
int i;

if (jLanguages != NULL) {
jsize jNumLanguages = (*env)->GetArrayLength(env, jLanguages);

languages = malloc(sizeof(char *) * jNumLanguages);
jboolean is_copy = JNI_FALSE;

num_languages = (size_t)jNumLanguages;

for (i = 0; i < jNumLanguages; i++) {
jstring jLanguage = (*env)->GetObjectArrayElement(env, jLanguages, i);

const char *lang = (*env)->GetStringUTFChars(env, jLanguage, &is_copy);

char *language = strdup(lang);
languages[i] = language;

(*env)->ReleaseStringUTFChars(env, jLanguage, lang);
}
options.languages = languages;
options.num_languages = num_languages;
}

// Call the libpostal function
libpostal_fuzzy_duplicate_status_t status = func(num_tokens1, tokens1, scores1, num_tokens2, tokens2, scores2, options);

// Clean up
(*env)->ReleaseDoubleArrayElements(env, jScores1, scores1, 0);
(*env)->ReleaseDoubleArrayElements(env, jScores2, scores2, 0);

return (jdouble) status.similarity;
}


JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isNameDuplicateFuzzy(
JNIEnv *env, jobject thisObj,
jobjectArray jTokens1, jdoubleArray jScores1,
jobjectArray jTokens2, jdoubleArray jScores2,
jobject jOptions
) {
return Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(env, thisObj,
jTokens1, jScores1, jTokens2, jScores2, jOptions, libpostal_is_name_duplicate_fuzzy);
}

JNIEXPORT jdouble JNICALL Java_com_mapzen_jpostal_Dedupe_isStreetDuplicateFuzzy(
JNIEnv *env, jobject thisObj,
jobjectArray jTokens1, jdoubleArray jScores1,
jobjectArray jTokens2, jdoubleArray jScores2,
jobject jOptions
) {
return Java_com_mapzen_jpostal_Dedupe_isDuplicateFuzzy(env, thisObj,
jTokens1, jScores1, jTokens2, jScores2, jOptions, libpostal_is_street_duplicate_fuzzy);
}
123 changes: 123 additions & 0 deletions src/main/java/com/mapzen/jpostal/Dedupe.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package com.mapzen.jpostal;

/**
 * Java facade over the native {@code jpostal_dedupe} library.
 *
 * <p>A single shared instance is created lazily through {@link #getInstance()}
 * or {@link #getInstanceDataDir(String)}. Each public {@code isXxxDupe} method
 * forwards to the matching native method; the exact-match variants convert the
 * native {@code int} through {@link DuplicateStatus#fromInt}, and the fuzzy
 * variants return the native {@code double} as is.
 */
public class Dedupe {
    static {
        // Load native library at runtime
        System.loadLibrary("jpostal_dedupe");
    }

    private static volatile Dedupe instance = null;

    /**
     * Returns the shared instance, creating it with {@code dataDir} on first use.
     *
     * <p>NOTE(review): once an instance exists, {@code dataDir} is ignored by
     * later calls - confirm that callers expect this.
     */
    public static Dedupe getInstanceDataDir(String dataDir) {
        Dedupe current = instance;
        if (current == null) {
            synchronized (Dedupe.class) {
                current = instance;
                if (current == null) {
                    current = new Dedupe(dataDir);
                    instance = current;
                }
            }
        }
        return current;
    }

    /** Returns the shared instance, creating it without a data directory on first use. */
    public static Dedupe getInstance() {
        return getInstanceDataDir(null);
    }

    // ---- native lifecycle -------------------------------------------------

    static native synchronized void setup();
    static native synchronized void setupDataDir(String dataDir);
    static native synchronized void teardown();

    // ---- native comparisons -----------------------------------------------

    private static native synchronized int isStreetDuplicate(String street1, String street2, DuplicateOptions options);
    private static native synchronized int isNameDuplicate(String name1, String name2, DuplicateOptions options);
    private static native synchronized int isHouseNumberDuplicate(String name1, String name2, DuplicateOptions options);
    private static native synchronized int isPOBoxDuplicate(String poBox1, String poBox2, DuplicateOptions options);
    private static native synchronized int isUnitDuplicate(String unit1, String unit2, DuplicateOptions options);
    private static native synchronized int isFloorDuplicate(String floor1, String floor2, DuplicateOptions options);
    private static native synchronized int isPostalCodeDuplicate(String postalCode1, String postalCode2, DuplicateOptions options);
    private static native synchronized double isNameDuplicateFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options);
    private static native synchronized double isStreetDuplicateFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options);

    /** Builds a fresh options object straight from an untouched builder. */
    private static DuplicateOptions defaultOptions() {
        return new DuplicateOptions.Builder().build();
    }

    // ---- street -----------------------------------------------------------

    /** Compares two streets using freshly built default options. */
    public DuplicateStatus isStreetDupe(String street1, String street2) {
        return isStreetDupeWithOptions(street1, street2, defaultOptions());
    }

    /** Compares two streets using the supplied options. */
    public DuplicateStatus isStreetDupeWithOptions(String street1, String street2, DuplicateOptions options) {
        int nativeStatus = isStreetDuplicate(street1, street2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- name -------------------------------------------------------------

    /** Compares two names using freshly built default options. */
    public DuplicateStatus isNameDupe(String name1, String name2) {
        return isNameDupeWithOptions(name1, name2, defaultOptions());
    }

    /** Compares two names using the supplied options. */
    public DuplicateStatus isNameDupeWithOptions(String name1, String name2, DuplicateOptions options) {
        int nativeStatus = isNameDuplicate(name1, name2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- house number -----------------------------------------------------

    /** Compares two house numbers using freshly built default options. */
    public DuplicateStatus isHouseNumberDupe(String houseNumber1, String houseNumber2) {
        return isHouseNumberDupeWithOptions(houseNumber1, houseNumber2, defaultOptions());
    }

    /** Compares two house numbers using the supplied options. */
    public DuplicateStatus isHouseNumberDupeWithOptions(String houseNumber1, String houseNumber2, DuplicateOptions options) {
        int nativeStatus = isHouseNumberDuplicate(houseNumber1, houseNumber2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- PO box -----------------------------------------------------------

    /** Compares two PO boxes using freshly built default options. */
    public DuplicateStatus isPOBoxDupe(String poBox1, String poBox2) {
        return isPOBoxDupeWithOptions(poBox1, poBox2, defaultOptions());
    }

    /** Compares two PO boxes using the supplied options. */
    public DuplicateStatus isPOBoxDupeWithOptions(String poBox1, String poBox2, DuplicateOptions options) {
        int nativeStatus = isPOBoxDuplicate(poBox1, poBox2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- unit -------------------------------------------------------------

    /** Compares two units using freshly built default options. */
    public DuplicateStatus isUnitDupe(String unit1, String unit2) {
        return isUnitDupeWithOptions(unit1, unit2, defaultOptions());
    }

    /** Compares two units using the supplied options. */
    public DuplicateStatus isUnitDupeWithOptions(String unit1, String unit2, DuplicateOptions options) {
        int nativeStatus = isUnitDuplicate(unit1, unit2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- floor ------------------------------------------------------------

    /** Compares two floors using freshly built default options. */
    public DuplicateStatus isFloorDupe(String floor1, String floor2) {
        return isFloorDupeWithOptions(floor1, floor2, defaultOptions());
    }

    /** Compares two floors using the supplied options. */
    public DuplicateStatus isFloorDupeWithOptions(String floor1, String floor2, DuplicateOptions options) {
        int nativeStatus = isFloorDuplicate(floor1, floor2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- postal code ------------------------------------------------------

    /** Compares two postal codes using freshly built default options. */
    public DuplicateStatus isPostalCodeDupe(String postalCode1, String postalCode2) {
        return isPostalCodeDupeWithOptions(postalCode1, postalCode2, defaultOptions());
    }

    /** Compares two postal codes using the supplied options. */
    public DuplicateStatus isPostalCodeDupeWithOptions(String postalCode1, String postalCode2, DuplicateOptions options) {
        int nativeStatus = isPostalCodeDuplicate(postalCode1, postalCode2, options);
        return DuplicateStatus.fromInt(nativeStatus);
    }

    // ---- fuzzy comparisons ------------------------------------------------

    /** Fuzzy name comparison over token/score arrays using freshly built default options. */
    public double isNameDupeFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2) {
        return isNameDupeFuzzyWithOptions(tokens1, scores1, tokens2, scores2, defaultOptions());
    }

    /** Fuzzy name comparison over token/score arrays; returns the native result unchanged. */
    public double isNameDupeFuzzyWithOptions(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options) {
        double nativeResult = isNameDuplicateFuzzy(tokens1, scores1, tokens2, scores2, options);
        return nativeResult;
    }

    /** Fuzzy street comparison over token/score arrays using freshly built default options. */
    public double isStreetDupeFuzzy(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2) {
        return isStreetDupeFuzzyWithOptions(tokens1, scores1, tokens2, scores2, defaultOptions());
    }

    /** Fuzzy street comparison over token/score arrays; returns the native result unchanged. */
    public double isStreetDupeFuzzyWithOptions(String[] tokens1, double[] scores1, String[] tokens2, double[] scores2, DuplicateOptions options) {
        double nativeResult = isStreetDuplicateFuzzy(tokens1, scores1, tokens2, scores2, options);
        return nativeResult;
    }

    /**
     * Initialises the native side: {@link #setup()} when {@code dataDir} is
     * {@code null}, otherwise {@link #setupDataDir(String)}.
     */
    protected Dedupe(String dataDir) {
        if (dataDir != null) {
            setupDataDir(dataDir);
        } else {
            setup();
        }
    }

    /**
     * Calls the static native {@link #teardown()} when this object is finalized.
     *
     * <p>NOTE(review): teardown is static, so finalizing any instance (for
     * example one created by a subclass) affects the shared native state -
     * confirm this is intended.
     */
    protected void finalize() {
        teardown();
    }
}
Loading