openvenues · charsleysa · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1,5 @@
+# Default code owner
+
+* @Senzing/senzing-gdev
+
+/.github/ @Senzing/senzing-devsecops
diff --git a/.github/pull.yml b/.github/pull.yml
@@ -0,0 +1,6 @@
+version: "1"                        # Config schema version, kept as a quoted string
+rules:                              # Array of rules
+  - base: main                      # Required. Target branch
+    upstream: openvenues:master     # Required. Must be in the same fork network.
+    mergeMethod: squash             # Optional, one of [none, merge, squash, rebase, hardreset], Default: none. NOTE(review): squash gives `main` different commit SHAs than upstream - confirm this is intended for a recurring sync.
+label: ":arrow_heading_down: pull"  # Optional
diff --git a/.github/workflows/add-labels-standardized.yaml b/.github/workflows/add-labels-standardized.yaml
@@ -0,0 +1,16 @@
+name: 'add labels standardized'
+
+on:  # Trigger: issue events only
+  issues:
+    types:
+      - opened
+      - reopened
+
+jobs:
+  add-issue-labels:  # Reusable-workflow job: no local steps; the work is done by the workflow named in `uses`
+    permissions:
+      issues: write  # The only GITHUB_TOKEN scope requested for this job
+    secrets:  # Repository/org secrets forwarded by name to the called workflow
+      ORG_MEMBERSHIP_TOKEN: ${{ secrets.ORG_MEMBERSHIP_TOKEN }}
+      SENZING_MEMBERS: ${{ secrets.SENZING_MEMBERS }}
+    uses: senzing-factory/build-resources/.github/workflows/add-labels-to-issue.yaml@v1  # Called workflow, referenced at ref `v1`
diff --git a/.github/workflows/add-to-project-gdev.yaml b/.github/workflows/add-to-project-gdev.yaml
@@ -0,0 +1,20 @@
+name: 'add to project gdev'
+
+on:  # Trigger: issue events only
+  issues:
+    types:
+      - opened
+      - reopened
+
+jobs:
+  add-to-project:  # NOTE(review): no `permissions:` key on this job, so the built-in token keeps the repo/org default scopes - consider restricting
+    name: Add to classic project
+    runs-on: ubuntu-latest
+    steps:
+      - name: Add to classic project
+        uses: senzing-factory/github-action-add-issue-to-project@main  # NOTE(review): `main` is a moving ref; consider pinning to a release tag or commit SHA
+        with:
+          project: "https://github.com/orgs/Senzing/projects/9"  # Target project board URL
+          column_name: "Backlog"  # Presumably the board column new issues are placed in - confirm it exists on project 9
+        env:
+          GITHUB_TOKEN: ${{ secrets.SENZING_GITHUB_ACCESS_TOKEN }}  # Exposes a custom secret to the action under the GITHUB_TOKEN name
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -2,9 +2,9 @@ name: Test
 
 on:
   push:
-    branches: [master]
+    branches: [main]
   pull_request:
-    branches: [master]
+    branches: [main]
   workflow_dispatch:
 
 jobs:
@@ -14,7 +14,7 @@ jobs:
         os: [ubuntu-latest, macos-latest]
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Install Dependencies Linux
         if: matrix.os == 'ubuntu-latest'
         run: |

diff --git a/.gitignore b/.gitignore
@@ -82,3 +82,5 @@ docs/_build/
 
 # PyBuilder
 target/
+
+.history
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,13 @@
+# Pull request questions
+
+## Which issue does this address
+
+Issue number: #nnn
+
+## Why was this change needed
+
+???
+
+## What does this change improve
+
+???
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ The core library is written in pure C. Language bindings for [Python](https://gi
 Sponsors
 --------
 
-If your company is using libpostal, consider asking your organization to sponsor the project. Interpreting what humans mean when they refer to locations is far from a solved problem, and sponsorships help us pursue new frontiers in geospatial NLP. As a sponsor, your company logo will appear prominently on the Github repo page along with a link to your site. [Sponsorship info](https://opencollective.com/libpostal#sponsor)
+If your company is using libpostal, consider asking your organization to sponsor the project. Interpreting what humans mean when they refer to locations is far from a solved problem, and sponsorships help us pursue new frontiers in geospatial NLP. As a sponsor, your company logo will appear prominently on the GitHub repo page along with a link to your site. [Sponsorship info](https://opencollective.com/libpostal#sponsor)
 
 <a href="https://opencollective.com/libpostal/sponsor/0/website" target="_blank"><img src="https://opencollective.com/libpostal/sponsor/0/avatar.svg"></a>
 <a href="https://opencollective.com/libpostal/sponsor/1/website" target="_blank"><img src="https://opencollective.com/libpostal/sponsor/1/avatar.svg"></a>
@@ -177,24 +177,6 @@ If you require a .lib import library to link this to your application. You can g
 lib.exe /def:libpostal.def /out:libpostal.lib /machine:x64
 ```
 
-Installation with an alternative data model
--------------------------------------------
-
-An alternative data model is available for libpostal. It is created by Senzing Inc. for improved parsing on US, UK and Singapore addresses and improved US rural route address handling.
-To enable this add `MODEL=senzing` to the conigure line during installation:
-```
-./configure --datadir=[...some dir with a few GB of space...] MODEL=senzing
-```
-
-The data for this model is gotten from [OpenAddress](https://openaddresses.io/), [OpenStreetMap](https://www.openstreetmap.org/) and data generated by Senzing based on customer feedback (a few hundred records), a total of about 1.2 billion records of data from over 230 countries, in 100+ languages. The data from OpenStreetMap and OpenAddress is good but not perfect so the data set was modified by filtering out badly formed addresses, correcting misclassified address tokens and removing tokens that didn't belong in the addresses, whenever these conditions were encountered.
-
-Senzing created a data set of 12950 addresses from 89 countries that it uses to test and verify the quality of its models. The data set was generated using random addresses from OSM, minimally 50 per country. Hard-to-parse addresses were gotten from Senzing support team and customers and from the libpostal github page and added to this set. The Senzing model got 4.3% better parsing results than the default model, using this test set.
-
-The size of this model is about 2.2GB compared to 1.8GB for the default model so keep that in mind if storages space is important.
-
-Further information about this data model can be found at: https://github.com/Senzing/libpostal-data
-If you run into any issues with this model, whether they have to do with parses, installation or any other problems, then please report them at https://github.com/Senzing/libpostal-data
-
 Examples of parsing
 -------------------
 
@@ -492,8 +474,7 @@ optionally be separated so Rosenstraße and Rosen Straße are equivalent.
 "Main Street", "city": "New York", "state": "New York"}. The parser works
 for a wide variety of countries and languages, not just US/English. 
 The model is trained on over 1 billion addresses and address-like strings, using the
-templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted,
-tagged traning examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
+templates in the [OpenCage address formatting repo](https://github.com/OpenCageData/address-formatting) to construct formatted, tagged training examples for every inhabited country in the world. Many types of [normalizations](https://github.com/openvenues/libpostal/blob/master/scripts/geodata/addresses/components.py)
 are performed to make the training data resemble real messy geocoder input as closely as possible.
 
 - **Language classification**: multinomial logistic regression
@@ -644,7 +625,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
 - Confines almost all mallocs to *name*_new and all frees to *name*_destroy
 - Efficient existing implementations for simple things like hashtables
 - Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
-- Data structrues take advantage of sparsity as much as possible
+- Data structures take advantage of sparsity as much as possible
 - Efficient double-array trie implementation for most string dictionaries
 - Cross-platform as much as possible, particularly for *nix
 
@@ -691,7 +672,7 @@ ways the address parser can be improved even further (in order of difficulty):
    when creating the training data that will ensure the model is trained to
    handle your use case without you having to do any manual data entry.
    If you see a pattern of obviously bad address parses, the best thing to
-   do is post an issue to Github.
+   do is post an issue to GitHub.
 
 Contributing
 ------------

diff --git a/configure.ac b/configure.ac
@@ -60,17 +60,6 @@ AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
 AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
 AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
 
-# Senzing data
-AC_SUBST([LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING], [v1])
-
-SENZING_DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/senzing/base_data)
-SENZING_PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/parser)
-SENZING_LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/senzing/language_classifier)
-
-AC_SUBST([LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION], [$SENZING_DATA_FILE_LATEST_VERSION])
-AC_SUBST([LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION], [$SENZING_PARSER_MODEL_LATEST_VERSION])
-AC_SUBST([LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION], [$SENZING_LANG_CLASS_MODEL_LATEST_VERSION])
-
 AC_CONFIG_FILES([Makefile
                  libpostal.pc
                  src/Makefile
@@ -108,10 +97,6 @@ AC_ARG_ENABLE([data-download],
                 *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
               esac], [DOWNLOAD_DATA=true])
 
-AC_ARG_VAR(MODEL, [Option to use alternative data models. Currently available is "senzing" (MODEL=senzing). If this option is not set the default libpostal data model is used.])
-AS_VAR_IF([MODEL], [], [],
-  [AS_VAR_IF([MODEL], [senzing], [], [AC_MSG_FAILURE([Invalid MODEL value set])])])
-
 AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
 
 AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],

diff --git a/resources/boundaries/osm/nz.yaml b/resources/boundaries/osm/nz.yaml
@@ -9,7 +9,7 @@
         id:
             relation:
                 # Auckland
-                "2094141": "city"
+                "17000522": "city"
                 # Wellington
                 "4266321": "city"
                 # Christchurch
@@ -31,12 +31,12 @@
                 # Invercargill
                 "1656388": "city"
                 # Nelson
-                "4266962": "city"
+                "17000449": "city"
                 # Upper Hutt
                 "4266375": "city"
 
         use_admin_center:
-            - id: 2094141 # Auckland
+            - id: 17000522 # Auckland
               type: relation
             - id: 4266321 # Wellington
               type: relation
@@ -68,7 +68,7 @@
             - id: 1656388 # Invercargill
               type: relation
               probability: 0.7
-            - id: 4266962 # Nelson
+            - id: 17000449 # Nelson
               type: relation
             - id: 4266375 # Upper Hutt
               type: relation

diff --git a/src/libpostal_data.in b/src/libpostal_data.in
@@ -14,12 +14,10 @@ LIBPOSTAL_DATA_DIR=$3
 MB=$((1024*1024))
 CHUNK_SIZE=$((64*$MB))
 
-DATAMODEL="@MODEL@"
-
 # Not loving this approach but there appears to be no way to query the size
 # of a release asset without using the Github API
 LIBPOSTAL_DATA_FILE_CHUNKS=1
-LIBPOSTAL_PARSER_MODEL_CHUNKS=12
+LIBPOSTAL_PARSER_MODEL_CHUNKS=1
 LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
 
 LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_DATA_DIR_VERSION_STRING@"
@@ -34,21 +32,7 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
 LIBPOSTAL_PARSER_FILE="parser.tar.gz"
 LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
 
-LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"
-
-if [ "$DATAMODEL" = "senzing" ]; then
-    LIBPOSTAL_DATA_FILE_CHUNKS=1
-    LIBPOSTAL_PARSER_MODEL_CHUNKS=1
-    LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1
-
-    LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_SENZING_DATA_DIR_VERSION_STRING@"
-
-    LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_SENZING_DATA_FILE_LATEST_VERSION@"
-    LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_PARSER_MODEL_LATEST_VERSION@"
-    LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_SENZING_LANG_CLASS_MODEL_LATEST_VERSION@"
-
-    LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
-fi
+LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"
 
 LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
 LIBPOSTAL_DATA_DIR_VERSION=

diff --git a/versions/base_data b/versions/base_data
@@ -1 +1 @@
-v1.0.0
+v1.1.0
diff --git a/versions/language_classifier b/versions/language_classifier
@@ -1 +1 @@
-v1.0.0
+v1.1.0
diff --git a/versions/parser b/versions/parser
@@ -1 +1 @@
-v1.0.0
+v1.1.0
diff --git a/versions/senzing/base_data b/versions/senzing/base_data
diff --git a/versions/senzing/language_classifier b/versions/senzing/language_classifier
diff --git a/versions/senzing/parser b/versions/senzing/parser
Original file line number	Diff line number	Diff line change
Expand Up		@@ -82,3 +82,5 @@ docs/_build/

		# PyBuilder
		target/

		.history