Skip to content
This repository has been archived by the owner on Sep 2, 2024. It is now read-only.

Commit

Permalink
New index injection (#53)
Browse files Browse the repository at this point in the history
* Initial commit - unfinished.

* Tests are green

* Fixes pafAll problems

* Adds tests for the postcode split

* Modifies mapping creation JSON

* Added missing mappings.

* Added custom tokenizer.

* Added relatives. Added post mapping method.

* Added relatives. Added post mapping method.

* New lpi columns added.

* New lpi columns added.

* Makes the "c" in postcodeIn lowercase before it's too late (in the API project it's lowercase everywhere).
Removes legacy code that deals with individual paf and nag.
Fixes a bug where the name of the created index wasn't used.

* httpcomponents version lowered to match hadoop version on Cloudera.
Inline object used in preference to reading file - temporary?

* Moves the construction of the nagAll field from SQL to the scala code

* Index naming change and null pointer fix.

* relatives not indexed or searched

* Moves Hybrid Document tests from Spark Provider tests
  • Loading branch information
Mironor authored Mar 6, 2017
1 parent e8602b2 commit fc81285
Show file tree
Hide file tree
Showing 18 changed files with 1,675 additions and 454 deletions.
3 changes: 2 additions & 1 deletion batch/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ libraryDependencies ++= Seq(
"com.typesafe" % "config" % "1.3.1",
"org.elasticsearch" %% "elasticsearch-spark" % "2.4.0" excludeAll ExclusionRule(organization = "javax.servlet"),
"org.scalatest" %% "scalatest" % "3.0.0" % Test,
"org.rogach" %% "scallop" % "2.0.5"
"org.rogach" %% "scallop" % "2.0.5",
"org.apache.httpcomponents" % "httpclient" % "4.2.5"
)

scalacOptions ++= List("-unchecked", "-Xlint")
349 changes: 349 additions & 0 deletions batch/src/main/resources/es_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1,
    "analysis": {
      "analyzer": {
        "welsh_no_split_analyzer": {
          "tokenizer": "custom_keyword",
          "filter": ["asciifolding"]
        },
        "welsh_split_analyzer": {
          "tokenizer": "classic",
          "filter": ["asciifolding"]
        }
      },
      "tokenizer": {
        "custom_keyword": {
          "type": "keyword",
          "buffer_size": 128
        }
      }
    }
  },
  "mappings": {
    "address": {
      "properties": {
        "lpi": {
          "properties": {
            "addressBasePostal": {
              "type": "string",
              "index": "not_analyzed"
            },
            "classificationCode": {
              "type": "string",
              "index": "not_analyzed"
            },
            "easting": {
              "type": "float",
              "index": "not_analyzed"
            },
            "location": {
              "type": "geo_point",
              "index": "not_analyzed"
            },
            "legalName": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "level": {
              "type": "string",
              "index": "not_analyzed"
            },
            "locality": {
              "type": "string",
              "index": "not_analyzed"
            },
            "lpiLogicalStatus": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "blpuLogicalStatus": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "lpiKey": {
              "type": "string",
              "index": "not_analyzed"
            },
            "northing": {
              "type": "float",
              "index": "not_analyzed"
            },
            "officialFlag": {
              "type": "string",
              "index": "not_analyzed"
            },
            "organisation": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "paoEndNumber": {
              "type": "short",
              "index": "not_analyzed"
            },
            "paoEndSuffix": {
              "type": "string",
              "index": "not_analyzed"
            },
            "paoStartNumber": {
              "type": "short",
              "index": "not_analyzed"
            },
            "paoStartSuffix": {
              "type": "string",
              "index": "not_analyzed"
            },
            "paoText": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "postcodeLocator": {
              "type": "string",
              "index": "not_analyzed"
            },
            "saoEndNumber": {
              "type": "short",
              "index": "not_analyzed"
            },
            "saoEndSuffix": {
              "type": "string",
              "index": "not_analyzed"
            },
            "saoStartNumber": {
              "type": "short",
              "index": "not_analyzed"
            },
            "saoStartSuffix": {
              "type": "string",
              "index": "not_analyzed"
            },
            "saoText": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "streetDescriptor": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "townName": {
              "type": "string",
              "index": "not_analyzed"
            },
            "uprn": {
              "type": "long",
              "index": "not_analyzed"
            },
            "usrn": {
              "type": "integer",
              "index": "not_analyzed"
            },
            "parentUprn": {
              "type": "long",
              "index": "not_analyzed"
            },
            "multiOccCount": {
              "type": "short",
              "index": "not_analyzed"
            },
            "localCustodianCode": {
              "type": "short",
              "index": "not_analyzed"
            },
            "rpc": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "usrnMatchIndicator": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "language": {
              "type": "string",
              "index": "not_analyzed"
            },
            "streetClassification": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "classScheme": {
              "type": "string",
              "index": "not_analyzed"
            },
            "crossReference": {
              "type": "string",
              "index": "not_analyzed"
            },
            "source": {
              "type": "string",
              "index": "not_analyzed"
            },
            "nagAll": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "relatives": {
              "type": "long",
              "index": "no"
            },
            "lpiStartDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "lpiLastUpdateDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            }
          }
        },
        "paf": {
          "properties": {
            "buildingName": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "buildingNumber": {
              "type": "short",
              "index": "not_analyzed"
            },
            "changeType": {
              "type": "string",
              "index": "not_analyzed"
            },
            "deliveryPointSuffix": {
              "type": "string",
              "index": "not_analyzed"
            },
            "departmentName": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "dependentLocality": {
              "type": "string",
              "index": "not_analyzed"
            },
            "dependentThoroughfare": {
              "type": "string",
              "index": "not_analyzed"
            },
            "doubleDependentLocality": {
              "type": "string",
              "index": "not_analyzed"
            },
            "endDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "entryDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "lastUpdateDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "organizationName": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            },
            "poBoxNumber": {
              "type": "string",
              "index": "not_analyzed"
            },
            "postTown": {
              "type": "string",
              "index": "not_analyzed"
            },
            "postcode": {
              "type": "string",
              "index": "not_analyzed"
            },
            "postcodeType": {
              "type": "string",
              "index": "not_analyzed"
            },
            "proOrder": {
              "type": "long",
              "index": "not_analyzed"
            },
            "processDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "recordIdentifier": {
              "type": "byte",
              "index": "not_analyzed"
            },
            "startDate": {
              "type": "date",
              "format": "strict_date_optional_time||epoch_millis",
              "index": "not_analyzed"
            },
            "subBuildingName": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "thoroughfare": {
              "type": "string",
              "index": "not_analyzed"
            },
            "udprn": {
              "type": "integer",
              "index": "not_analyzed"
            },
            "uprn": {
              "type": "long",
              "index": "not_analyzed"
            },
            "welshDependentLocality": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "welshDependentThoroughfare": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "welshDoubleDependentLocality": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "welshPostTown": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "welshThoroughfare": {
              "type": "string",
              "analyzer": "welsh_no_split_analyzer"
            },
            "pafAll": {
              "type": "string",
              "analyzer": "welsh_split_analyzer"
            }
          }
        },
        "uprn": {
          "type": "long",
          "index": "not_analyzed"
        },
        "postcodeIn": {
          "type": "string",
          "index": "not_analyzed"
        },
        "postcodeOut": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
4 changes: 3 additions & 1 deletion batch/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,6 @@ addressindex.files.csv.lpi="batch/src/test/resources/csv/lpi/read_test.csv"
addressindex.files.csv.organisation="batch/src/test/resources/csv/organisation/read_test.csv"
addressindex.files.csv.street="batch/src/test/resources/csv/street/read_test.csv"
addressindex.files.csv.street-descriptor="batch/src/test/resources/csv/street_descriptor/read_test.csv"
addressindex.files.csv.successor="batch/src/test/resources/csv/successor/read_test.csv"
addressindex.files.csv.successor="batch/src/test/resources/csv/successor/read_test.csv"

addressindex.files.es.json="batch/src/main/resources/es_index.json"
Loading

0 comments on commit fc81285

Please sign in to comment.