diff --git a/docs/_build/html/.doctrees/api/blocker.doctree b/docs/_build/html/.doctrees/api/blocker.doctree
new file mode 100644
index 0000000..28de037
Binary files /dev/null and b/docs/_build/html/.doctrees/api/blocker.doctree differ
diff --git a/docs/_build/html/.doctrees/api/blocking_result.doctree b/docs/_build/html/.doctrees/api/blocking_result.doctree
new file mode 100644
index 0000000..23abe16
Binary files /dev/null and b/docs/_build/html/.doctrees/api/blocking_result.doctree differ
diff --git a/docs/_build/html/.doctrees/api/index.doctree b/docs/_build/html/.doctrees/api/index.doctree
new file mode 100644
index 0000000..577395e
Binary files /dev/null and b/docs/_build/html/.doctrees/api/index.doctree differ
diff --git a/docs/_build/html/.doctrees/changelog.doctree b/docs/_build/html/.doctrees/changelog.doctree
new file mode 100644
index 0000000..68be04f
Binary files /dev/null and b/docs/_build/html/.doctrees/changelog.doctree differ
diff --git a/docs/_build/html/.doctrees/environment.pickle b/docs/_build/html/.doctrees/environment.pickle
new file mode 100644
index 0000000..07cff67
Binary files /dev/null and b/docs/_build/html/.doctrees/environment.pickle differ
diff --git a/docs/_build/html/.doctrees/examples/deduplication.doctree b/docs/_build/html/.doctrees/examples/deduplication.doctree
new file mode 100644
index 0000000..e8baca9
Binary files /dev/null and b/docs/_build/html/.doctrees/examples/deduplication.doctree differ
diff --git a/docs/_build/html/.doctrees/examples/deduplication_2.doctree b/docs/_build/html/.doctrees/examples/deduplication_2.doctree
new file mode 100644
index 0000000..b1386d9
Binary files /dev/null and b/docs/_build/html/.doctrees/examples/deduplication_2.doctree differ
diff --git a/docs/_build/html/.doctrees/examples/index.doctree b/docs/_build/html/.doctrees/examples/index.doctree
new file mode 100644
index 0000000..7cf8194
Binary files /dev/null and b/docs/_build/html/.doctrees/examples/index.doctree differ
diff --git a/docs/_build/html/.doctrees/examples/record_linkage.doctree b/docs/_build/html/.doctrees/examples/record_linkage.doctree
new file mode 100644
index 0000000..2e1d926
Binary files /dev/null and b/docs/_build/html/.doctrees/examples/record_linkage.doctree differ
diff --git a/docs/_build/html/.doctrees/getting_started/index.doctree b/docs/_build/html/.doctrees/getting_started/index.doctree
new file mode 100644
index 0000000..67c5719
Binary files /dev/null and b/docs/_build/html/.doctrees/getting_started/index.doctree differ
diff --git a/docs/_build/html/.doctrees/getting_started/installation.doctree b/docs/_build/html/.doctrees/getting_started/installation.doctree
new file mode 100644
index 0000000..2501bf7
Binary files /dev/null and b/docs/_build/html/.doctrees/getting_started/installation.doctree differ
diff --git a/docs/_build/html/.doctrees/getting_started/quickstart.doctree b/docs/_build/html/.doctrees/getting_started/quickstart.doctree
new file mode 100644
index 0000000..9821c6f
Binary files /dev/null and b/docs/_build/html/.doctrees/getting_started/quickstart.doctree differ
diff --git a/docs/_build/html/.doctrees/index.doctree b/docs/_build/html/.doctrees/index.doctree
new file mode 100644
index 0000000..81f8619
Binary files /dev/null and b/docs/_build/html/.doctrees/index.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/basic_operations.doctree b/docs/_build/html/.doctrees/user_guide/basic_operations.doctree
new file mode 100644
index 0000000..b0b51f9
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/basic_operations.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/configuration_tuning.doctree b/docs/_build/html/.doctrees/user_guide/configuration_tuning.doctree
new file mode 100644
index 0000000..d822dda
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/configuration_tuning.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/core_concepts.doctree b/docs/_build/html/.doctrees/user_guide/core_concepts.doctree
new file mode 100644
index 0000000..226dad4
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/core_concepts.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/evaluation_metrics.doctree b/docs/_build/html/.doctrees/user_guide/evaluation_metrics.doctree
new file mode 100644
index 0000000..fb56d6e
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/evaluation_metrics.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/index.doctree b/docs/_build/html/.doctrees/user_guide/index.doctree
new file mode 100644
index 0000000..8b91cc3
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/index.doctree differ
diff --git a/docs/_build/html/.doctrees/user_guide/input_data_handling.doctree b/docs/_build/html/.doctrees/user_guide/input_data_handling.doctree
new file mode 100644
index 0000000..74eef27
Binary files /dev/null and b/docs/_build/html/.doctrees/user_guide/input_data_handling.doctree differ
diff --git a/docs/_build/html/_modules/blockingpy/blocker.html b/docs/_build/html/_modules/blockingpy/blocker.html
index a061e8d..90500a2 100644
--- a/docs/_build/html/_modules/blockingpy/blocker.html
+++ b/docs/_build/html/_modules/blockingpy/blocker.html
@@ -234,6 +234,9 @@
Source code for blockingpy.blocker
2. Sparse matrices (scipy.sparse.csr_matrix) as a Document-Term Matrix (DTM)
3. Dense matrices (numpy.ndarray) as a Document-Term Matrix (DTM)
+ For evaluation of larger datasets, we recommend using the separate eval() method
+ since it allows you to set the batch size for evaluation.
+
For text data, additional preprocessing is performed using the parameters in control_txt.
@@ -278,9 +281,11 @@
[docs]
-    def eval(self, blocking_result: BlockingResult, true_blocks: pd.DataFrame) -> BlockingResult:
+    def eval(
+        self, blocking_result: BlockingResult, true_blocks: pd.DataFrame, batch_size: int = 1_000
+    ) -> BlockingResult:
         """
         Evaluate blocking results against true block assignments and return new BlockingResult.

         This method calculates evaluation metrics and confusion matrix
         by comparing predicted blocks with known true blocks and returns
         a new BlockingResult instance containing the evaluation results
-        along with the original blocking results.
+        along with the original blocking results. It allows you to set
+        the batch size for evaluation of larger datasets.

         Parameters
         ----------
@@ -488,6 +485,9 @@
Source code for blockingpy.blocker
             DataFrame with true block assignments
             For deduplication: columns ['x', 'block']
             For record linkage: columns ['x', 'y', 'block']
+        batch_size : int
+            Size of the batch for evaluation. This size is applied to both datasets
+            for record linkage. Defaults to 1,000.

         Returns
         -------
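A minimal, hedged sketch of the batched evaluation flow this new signature enables (the toy `texts` and `true_blocks` below are invented for illustration; the `Blocker` calls follow the API documented in this diff):

```python
import pandas as pd

from blockingpy import Blocker

# Toy deduplication input: four short strings, two of which should match.
texts = pd.Series(["monica geller", "monika geller", "ross geller", "chandler bing"])
# Ground truth for deduplication uses columns ['x', 'block'].
true_blocks = pd.DataFrame({"x": [0, 1, 2, 3], "block": [0, 0, 1, 2]})

blocker = Blocker()
result = blocker.block(x=texts, ann="hnsw")
# Evaluate separately so batch_size can be tuned; it defaults to 1_000.
evaluated = blocker.eval(blocking_result=result, true_blocks=true_blocks, batch_size=1_000)
print(evaluated.metrics)
print(evaluated.confusion)
```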
@@ -514,82 +514,68 @@
         Whether the blocking was performed for deduplication
     true_blocks : pandas.DataFrame, optional
         DataFrame with true blocks to calculate evaluation metrics
-    len_x : int
-        Number of records in the original reference dataset
+    n_original_records : tuple[int, int]
+        Number of records in the original dataset(s)
     eval_metrics : pandas.Series, optional
         Evaluation metrics if true blocks were provided
     confusion : pandas.DataFrame, optional
@@ -116,6 +116,8 @@
Source code for blockingpy.blocking_result
         Column names used in the blocking process
     graph : bool, optional
         Whether to create a graph from the blocking results (default False)
+    reduction_ratio : float, optional
+        Pre-calculated reduction ratio (default None)

     Attributes
     ----------
@@ -133,8 +135,10 @@
Source code for blockingpy.blocking_result
         Names of columns used in blocking
     graph : networkx.Graph or None
         Network representation of blocking results if requested
-    len_x : int
-        Number of records in the original reference dataset
+    n_original_records : tuple[int, int]
+        Number of records in the original dataset(s)
+    reduction_ratio : float
+        Reduction ratio calculated for the blocking method

     Notes
     -----
@@ -148,12 +152,13 @@
             self.result.groupby("block").agg({"x": "nunique", "y": "nunique"}).sum(axis=1)
         )
         block_size_dist = Counter(block_sizes.values)
-        reduction_ratio = self._calculate_reduction_ratio()
         output = []
         output.append("=" * 56)
         output.append(f"Blocking based on the {self.method} method.")
         output.append(f"Number of blocks: {len(block_sizes)}")
         output.append(f"Number of columns used for blocking: {len(self.colnames)}")
-        output.append(f"Reduction ratio: {reduction_ratio:.4f}")
+        output.append(f"Reduction ratio: {self.reduction_ratio:.6f}")
         output.append("=" * 56)
         output.append("Distribution of the size of the blocks:")
@@ -251,13 +260,13 @@
Source code for blockingpy.blocking_result
"""ifself.deduplication:
- denominator=self.len_x*(self.len_x-1)/2
+ denominator=self.n_original_records[0]*(self.n_original_records[0]-1)/2block_sizes=self.result.groupby("block")[["x","y"]].apply(lambdax:len(pd.concat([x["x"],x["y"]]).unique()))numerator=(block_sizes*(block_sizes-1)/2).sum()iflen(block_sizes)>0else0else:
- denominator=self.len_x*len(self.result)
+ denominator=self.n_original_records[0]*self.n_original_records[1]block_comparisons=self.result.groupby("block").agg({"x":"nunique","y":"nunique"})numerator=(block_comparisons["x"]*block_comparisons["y"]).sum()
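The arithmetic above reduces to a simple ratio; a standalone sketch, assuming `_calculate_reduction_ratio()` returns `1 - numerator / denominator` (the return statement sits outside this hunk, but that formula matches the values printed in the examples below):

```python
# Deduplication case: the denominator is the all-pairs count n * (n - 1) / 2,
# with n taken from n_original_records[0]; within-block pairs form the numerator.
n = 10_000
block_sizes = [2] * 962 + [3] * 725 + [4] * 409  # sample of sizes from the example below
numerator = sum(s * (s - 1) / 2 for s in block_sizes)
denominator = n * (n - 1) / 2
print(f"Reduction ratio: {1 - numerator / denominator:.6f}")
```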
diff --git a/docs/_build/html/_sources/changelog.md.txt b/docs/_build/html/_sources/changelog.md.txt
index 72987cb..e57b0c5 100644
--- a/docs/_build/html/_sources/changelog.md.txt
+++ b/docs/_build/html/_sources/changelog.md.txt
@@ -1,5 +1,22 @@
# Changelog
+## v0.1.10
+- evaluation only for records that exist in true blocks.
+- default distance for `faiss` changed to `cosine`
+- code simplification
+- minor changes
+- fixed docs, eval and codecov configuration
+
+## v0.1.9
+- optimized evaluation part to allow batch processing
+
+## v0.1.8
+- added author Maciej Beręsewicz
+- added info about funding
+- added data inside the package
+- added new deduplication example in docs
+- minor changes
+
## v0.1.7
- added CODE_OF_CONDUCT.md
- documentation update
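Given the v0.1.10 change above, `cosine` no longer needs to be requested explicitly for `faiss`, though the `control_ann` pattern used in the examples still works; a minimal sketch with invented toy data:

```python
import pandas as pd

from blockingpy import Blocker

texts = pd.Series(["ann arbor", "ann arbour", "detroit"])  # toy data
control_ann = {"faiss": {"distance": "cosine"}}  # explicit, now equal to the default
result = Blocker().block(x=texts, ann="faiss", control_ann=control_ann)
print(result)
```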
diff --git a/docs/_build/html/_sources/examples/deduplication.md.txt b/docs/_build/html/_sources/examples/deduplication.md.txt
index 39e45f3..63d558e 100644
--- a/docs/_build/html/_sources/examples/deduplication.md.txt
+++ b/docs/_build/html/_sources/examples/deduplication.md.txt
@@ -236,6 +236,7 @@ eval_result = blocker.block(
# true_blocks=true_blocs_dedup
# )
# The rest stays the same in both cases
+# Note: We recommend using the eval() method when evaluating larger datasets,
+# since it allows you to set the batch size for currently evaluated record pairs.
print(eval_result)
print(eval_result.metrics)
diff --git a/docs/_build/html/_sources/examples/deduplication_2.md.txt b/docs/_build/html/_sources/examples/deduplication_2.md.txt
new file mode 100644
index 0000000..50b5df0
--- /dev/null
+++ b/docs/_build/html/_sources/examples/deduplication_2.md.txt
@@ -0,0 +1,241 @@
+# Deduplication No. 2
+
+In this example we'll use the dataset known as `RLdata10000`, taken from the [RecordLinkage](https://cran.r-project.org/package=RecordLinkage) R package developed by Murat Sariyar
+and Andreas Borg. It contains 10,000 records in total, some of which have been duplicated with randomly generated errors: there are 9,000 original records and 1,000 duplicates.
+
+## Data Preparation
+
+Let's install `blockingpy`:
+
+```bash
+pip install blockingpy
+```
+
+Import necessary packages and functions:
+
+```python
+import pandas as pd
+from blockingpy import Blocker
+from blockingpy.datasets import load_deduplication_data
+```
+
+Let's load the data and take a look at the first 5 rows:
+
+```python
+data = load_deduplication_data()
+data.head()
+
+# fname_c1 fname_c2 lname_c1 lname_c2 by bm bd id true_id
+# 0 FRANK NaN MUELLER NaN 1967 9 27 1 3606
+# 1 MARTIN NaN SCHWARZ NaN 1967 2 17 2 2560
+# 2 HERBERT NaN ZIMMERMANN NaN 1961 11 6 3 3892
+# 3 HANS NaN SCHMITT NaN 1945 8 14 4 329
+# 4 UWE NaN KELLER NaN 2000 7 5 5 1994
+```
+
+Now we need to prepare the `txt` column:
+
+```python
+data = data.fillna('')
+data[['by', 'bm', 'bd']] = data[['by', 'bm', 'bd']].astype('str')
+data['txt'] = (
+ data["fname_c1"] +
+ data["fname_c2"] +
+ data['lname_c1'] +
+ data['lname_c2'] +
+ data['by'] +
+ data['bm'] +
+ data['bd']
+ )
+data['txt'].head()
+
+# 0 FRANKMUELLER1967927
+# 1 MARTINSCHWARZ1967217
+# 2 HERBERTZIMMERMANN1961116
+# 3 HANSSCHMITT1945814
+# 4 UWEKELLER200075
+# Name: txt, dtype: object
+```
+
+## Basic Deduplication
+
+Let's perform basic deduplication using the `hnsw` algorithm:
+
+```python
+blocker = Blocker()
+dedup_result = blocker.block(
+ x=data['txt'],
+ ann='hnsw',
+ verbose=1,
+)
+
+# ===== creating tokens =====
+# ===== starting search (hnsw, x, y: 10000,10000, t: 674) =====
+# ===== creating graph =====
+```
+
+We can now take a look at the results:
+
+```python
+print(dedup_result)
+
+# ========================================================
+# Blocking based on the hnsw method.
+# Number of blocks: 2736
+# Number of columns used for blocking: 674
+# Reduction ratio: 0.9996
+# ========================================================
+# Distribution of the size of the blocks:
+# Block Size | Number of Blocks
+# 2 | 962
+# 3 | 725
+# 4 | 409
+# 5 | 263
+# 6 | 139
+# 7 | 89
+# 8 | 52
+# 9 | 37
+# 10 | 24
+# 11 | 14
+# 12 | 9
+# 13 | 5
+# 14 | 2
+# 15 | 1
+# 16 | 1
+# 17 | 2
+# 20 | 1
+# 64 | 1
+```
+
+and:
+
+```python
+print(dedup_result.result)
+# x y block dist
+# 0 3402 0 0 0.256839
+# 1 1179 1 1 0.331352
+# 2 2457 2 2 0.209737
+# 3 1956 3 3 0.085341
+# 4 4448 4 4 0.375000
+# ... ... ... ... ...
+# 7259 9206 9994 1981 0.390912
+# 7260 6309 9995 1899 0.268436
+# 7261 5162 9996 1742 0.188893
+# 7262 6501 9997 1293 0.245406
+# 7263 5183 9999 1273 0.209088
+```
+
+Let's take a look at the pair in block number `3`:
+
+```python
+print(data.iloc[[1956, 3], : ])
+# fname_c1 fname_c2 lname_c1 ... id true_id txt
+# 1956 HRANS SCHMITT ... 1957 329 HRANSSCHMITT1945814
+# 3 HANS SCHMITT ... 4 329 HANSSCHMITT1945814
+```
+
+## True Blocks Preparation
+
+To evaluate the blocking we need ground-truth block labels. Since `true_id` identifies the original entity behind each record, we can use it directly as the block label:
+
+```python
+df_eval = data.copy()
+df_eval['block'] = df_eval['true_id']
+df_eval['x'] = range(len(df_eval))
+```
+
+```python
+print(df_eval.head())
+# fname_c1 fname_c2 lname_c1 ... txt block x
+# 0 FRANK MUELLER ... FRANKMUELLER1967927 3606 0
+# 1 MARTIN SCHWARZ ... MARTINSCHWARZ1967217 2560 1
+# 2 HERBERT ZIMMERMANN ... HERBERTZIMMERMANN1961116 3892 2
+# 3 HANS SCHMITT ... HANSSCHMITT1945814 329 3
+# 4 UWE KELLER ... UWEKELLER200075 1994 4
+```
+
+Let's create the final `true_blocks_dedup`:
+
+```python
+true_blocks_dedup = df_eval[['x', 'block']]
+```
+
+## Evaluation
+
+Now we can evaluate our algorithm:
+
+```python
+control_ann = {
+ "faiss":{
+ "distance": "cosine"
+ }
+}
+
+blocker = Blocker()
+eval_result = blocker.block(
+ x=df_eval['txt'],
+ ann='faiss',
+ true_blocks=true_blocks_dedup,
+ verbose=1,
+ control_ann=control_ann
+)
+# ===== creating tokens =====
+# ===== starting search (faiss, x, y: 10000,10000, t: 674) =====
+# ===== creating graph =====
+```
+And the results:
+
+```python
+print(eval_result)
+print(eval_result.metrics)
+# ========================================================
+# Blocking based on the faiss method.
+# Number of blocks: 2737
+# Number of columns used for blocking: 674
+# Reduction ratio: 0.9996
+# ========================================================
+# Distribution of the size of the blocks:
+# Block Size | Number of Blocks
+# 2 | 972
+# 3 | 721
+# 4 | 423
+# 5 | 236
+# 6 | 138
+# 7 | 92
+# 8 | 62
+# 9 | 29
+# 10 | 28
+# 11 | 15
+# 12 | 8
+# 13 | 3
+# 14 | 3
+# 15 | 1
+# 16 | 1
+# 17 | 2
+# 18 | 1
+# 20 | 1
+# 67 | 1
+# ========================================================
+# Evaluation metrics (standard):
+# recall : 100.0
+# precision : 4.7651
+# fpr : 0.04
+# fnr : 0.0
+# accuracy : 99.96
+# specificity : 99.96
+# f1_score : 9.0967
+# recall 1.000000
+# precision 0.047651
+# fpr 0.000400
+# fnr 0.000000
+# accuracy 0.999600
+# specificity 0.999600
+# f1_score 0.090967
+```
+
+```python
+print(eval_result.confusion)
+# Actual Negative Actual Positive
+# Predicted Negative 49974014 0
+# Predicted Positive 19986 1000
+```
+
+The results show a high reduction ratio (`0.9996`) alongside perfect recall (`1.000`), indicating that our package handled this dataset very well.
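+
+For larger runs of this example, the separate `eval()` method exposes the batch size (see the note in the other examples); a minimal sketch reusing the objects defined above:
+
+```python
+# Block first without passing true_blocks, then evaluate in batches.
+result = blocker.block(x=df_eval['txt'], ann='faiss', control_ann=control_ann)
+evals = blocker.eval(blocking_result=result, true_blocks=true_blocks_dedup, batch_size=1_000)
+print(evals.metrics)
+```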
\ No newline at end of file
diff --git a/docs/_build/html/_sources/examples/index.md.txt b/docs/_build/html/_sources/examples/index.md.txt
index e628e1f..a8f71d7 100644
--- a/docs/_build/html/_sources/examples/index.md.txt
+++ b/docs/_build/html/_sources/examples/index.md.txt
@@ -5,4 +5,5 @@
:maxdepth: 1
record_linkage
-deduplication
\ No newline at end of file
+deduplication
+deduplication_2
\ No newline at end of file
diff --git a/docs/_build/html/_sources/examples/record_linkage.md.txt b/docs/_build/html/_sources/examples/record_linkage.md.txt
index 794e1e6..24f0983 100644
--- a/docs/_build/html/_sources/examples/record_linkage.md.txt
+++ b/docs/_build/html/_sources/examples/record_linkage.md.txt
@@ -1,13 +1,17 @@
(record_linkage)=
# Record Linkage
-This example demonstrates how to use BlockingPy for record linkage between two datasets. We'll use example data from the URos 2021 Conference tutorial which contains:
+This example demonstrates how to use BlockingPy for record linkage between two datasets. We'll use example data created by Paula McLeod, Dick Heasman and Ian Forbes, ONS,
+for the ESSnet DI on-the-job training course, Southampton,
+25-28 January 2011:
- Census: A fictional dataset representing observations from a decennial Census
- CIS: Fictional observations from Customer Information System (combined administrative data from tax and benefit systems)
Some records in the CIS dataset contain Census person IDs, which we'll use to evaluate our blocking performance.
+These datasets come with the `BlockingPy` package and can be accessed via the `load_census_cis_data` function from `blockingpy.datasets`.
+
## Setup
First, install BlockingPy:
@@ -20,6 +24,7 @@ Import required packages:
```python
from blockingpy import Blocker
+from blockingpy.datasets import load_census_cis_data
import pandas as pd
```
@@ -28,8 +33,14 @@ import pandas as pd
Download example data:
```python
-census = pd.read_csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
-cis = pd.read_csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
+census, cis = load_census_cis_data()
+```
+
+First, we select only the columns we'll need:
+
+```python
+census = census[["PERSON_ID", "PERNAME1", "PERNAME2", "SEX", "DOB_DAY", "DOB_MON", "DOB_YEAR", "ENUMCAP", "ENUMPC"]]
+cis = cis[["PERSON_ID", "PERNAME1", "PERNAME2", "SEX", "DOB_DAY", "DOB_MON", "DOB_YEAR", "ENUMCAP", "ENUMPC"]]
```
Let's take a look at the data:
@@ -37,14 +48,14 @@ Let's take a look at the data:
```python
print(census.head())
-# person_id pername1 pername2 sex dob_day dob_mon dob_year \
+# PERSON_ID PERNAME1 PERNAME2 SEX DOB_DAY DOB_MON DOB_YEAR \
# 0 DE03US001001 COUIE PRICE M 1.0 6 1960.0
# 1 DE03US001002 ABBIE PVICE F 9.0 11 1961.0
# 2 DE03US001003 LACEY PRICE F 7.0 2 1999.0
# 3 DE03US001004 SAMUEL PRICE M 13.0 4 1990.0
# 4 DE03US001005 JOSEPH PRICE M 20.0 4 1986.0
-# enumcap enumpc
+# ENUMCAP ENUMPC
# 0 1 WINDSOR ROAD DE03US
# 1 1 WINDSOR ROAD DE03US
# 2 1 WINDSOR ROAD DE03US
@@ -53,19 +64,19 @@ print(census.head())
print(cis.head())
-# person_id pername1 pername2 sex dob_day dob_mon dob_year \
-# 0 NaN HAYDEN HALL M NaN 1 NaN
-# 1 NaN SEREN ANDERSON F 1.0 1 NaN
-# 2 NaN LEWIS LEWIS M 1.0 1 NaN
-# 3 NaN HARRISON POSTER M 5.0 1 NaN
-# 4 NaN MUHAMMED WATSUN M 7.0 1 NaN
+# PERSON_ID PERNAME1 PERNAME2 SEX DOB_DAY DOB_MON DOB_YEAR \
+# 0 PO827ER091001 HAYDEN HALL M NaN 1 NaN
+# 1 LS992DB024001 SEREN ANDERSON F 1.0 1 NaN
+# 2 M432ZZ053003 LEWIS LEWIS M 1.0 1 NaN
+# 3 SW75TQ018001 HARRISON POSTER M 5.0 1 NaN
+# 4 EX527TR017006 MUHAMMED WATSUN M 7.0 1 NaN
-# enumcap enumpc
+# ENUMCAP ENUMPC
# 0 91 CLARENCE ROAD PO827ER
# 1 24 CHURCH LANE LS992DB
# 2 53 CHURCH ROAD M432ZZ
# 3 19 HIGHFIELD ROAD SW75TG
-# 4 17 VICTORIA STREET NaN
+# 4 17 VICTORIA STREET NaN
print(census.shape)
# (25343, 9)
@@ -78,21 +89,21 @@ Preprocess data and create column `txt` containing concatenated variables:
```python
# Convert numeric fields to strings
-census[['dob_day', 'dob_mon', 'dob_year']] = census[['dob_day', 'dob_mon', 'dob_year']].astype(str)
-cis[['dob_day', 'dob_mon', 'dob_year']] = cis[['dob_day', 'dob_mon', 'dob_year']].astype(str)
+census[['DOB_DAY', 'DOB_MON', 'DOB_YEAR']] = census[['DOB_DAY', 'DOB_MON', 'DOB_YEAR']].astype(str)
+cis[['DOB_DAY', 'DOB_MON', 'DOB_YEAR']] = cis[['DOB_DAY', 'DOB_MON', 'DOB_YEAR']].astype(str)
# Fill NAs with empty strings
census = census.fillna('')
cis = cis.fillna('')
# Concatenate fields
-census['txt'] = census['pername1'] + census['pername2'] + census['sex'] + \
- census['dob_day'] + census['dob_mon'] + census['dob_year'] + \
- census['enumcap'] + census['enumpc']
+census['txt'] = census['PERNAME1'] + census['PERNAME2'] + census['SEX'] + \
+ census['DOB_DAY'] + census['DOB_MON'] + census['DOB_YEAR'] + \
+ census['ENUMCAP'] + census['ENUMPC']
-cis['txt'] = cis['pername1'] + cis['pername2'] + cis['sex'] + \
- cis['dob_day'] + cis['dob_mon'] + cis['dob_year'] + \
- cis['enumcap'] + cis['enumpc']
+cis['txt'] = cis['PERNAME1'] + cis['PERNAME2'] + cis['SEX'] + \
+ cis['DOB_DAY'] + cis['DOB_MON'] + cis['DOB_YEAR'] + \
+ cis['ENUMCAP'] + cis['ENUMPC']
```
Let's see what the new column looks like:
@@ -159,7 +170,7 @@ print(rec_lin_result)
# Blocking based on the hnsw method.
# Number of blocks: 23996
# Number of columns used for blocking: 1072
-# Reduction ratio: 1.0000
+# Reduction ratio: 0.999961
# ========================================================
# Distribution of the size of the blocks:
# Block Size | Number of Blocks
@@ -182,30 +193,28 @@ Let's take a look at the pair in block `0` :
print(cis.iloc[0, :])
print(census.iloc[17339, :])
-# person_id
-# pername1 HAYDEN
-# pername2 HALL
-# sex M
-# dob_day nan
-# dob_mon 1
-# dob_year nan
-# enumcap 91 CLARENCE ROAD
-# enumpc PO827ER
+# PERSON_ID PO827ER091001
+# PERNAME1 HAYDEN
+# PERNAME2 HALL
+# SEX M
+# DOB_DAY nan
+# DOB_MON 1
+# DOB_YEAR nan
+# ENUMCAP 91 CLARENCE ROAD
+# ENUMPC PO827ER
# txt HAYDENHALLMnan1nan91 CLARENCE ROADPO827ER
-# y 0
# Name: 0, dtype: object
-
-# person_id PO827ER091001
-# pername1 HAYDEM
-# pername2 HALL
-# sex M
-# dob_day 1.0
-# dob_mon 1
-# dob_year 1957.0
-# enumcap 91 CLARENCE ROAD
-# enumpc PO827ER
+# PERSON_ID PO827ER091001
+# PERNAME1 HAYDEM
+# PERNAME2 HALL
+# SEX M
+# DOB_DAY 1.0
+# DOB_MON 1
+# DOB_YEAR 1957.0
+# ENUMCAP 91 CLARENCE ROAD
+# ENUMPC PO827ER
# txt HAYDEMHALLM1.011957.091 CLARENCE ROADPO827ER
-# x 17339
+# Name: 17339, dtype: object
```
@@ -220,16 +229,20 @@ cis['y'] = range(len(cis))
# Find true matches using person_id
matches = pd.merge(
- left=census[['person_id', 'x']],
- right=cis[['person_id', 'y']],
- on='person_id'
+ left=census[['PERSON_ID', 'x']],
+ right=cis[['PERSON_ID', 'y']],
+ on='PERSON_ID'
)
# Add block numbers
matches['block'] = range(len(matches))
matches.shape
-# (971, 4)
+# (24043, 4)
+```
+Let's sample 1,000 pairs on which to evaluate:
+```python
+matches = matches.sample(1000, random_state=42)
```
Now we can evaluate the algorithm:
@@ -260,6 +273,9 @@ eval_result = blocker.block(
# true_blocks=matches[['x', 'y', 'block']]
#)
# The procedure in both cases stays the same.
+
+# Note: We recommend using the eval() method when evaluating larger datasets,
+# since it allows you to set the batch size for currently evaluated record pairs.
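+# A sketch of that pattern with this example's objects (illustrative; assumes
+# the block() call above, i.e. x=census['txt'], y=cis['txt']):
+# result = blocker.block(x=census['txt'], y=cis['txt'], ann='hnsw')
+# evals = blocker.eval(blocking_result=result, true_blocks=matches[['x', 'y', 'block']], batch_size=1_000)
+# print(evals.metrics)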
```
and print results with evaluation metrics:
diff --git a/docs/_build/html/_sources/index.md.txt b/docs/_build/html/_sources/index.md.txt
index 378e675..d207e60 100644
--- a/docs/_build/html/_sources/index.md.txt
+++ b/docs/_build/html/_sources/index.md.txt
@@ -41,21 +41,36 @@ If you're new to BlockingPy, we recommend following these steps:
4. Explore the {ref}`user-guide` for detailed usage instructions
5. Obtain more information via {ref}`api`
+## Example Datasets
+
+BlockingPy comes with built-in example datasets:
+
+- Census-Cis dataset created by Paula McLeod, Dick Heasman and Ian Forbes, ONS,
+ for the ESSnet DI on-the-job training course, Southampton,
+ 25-28 January 2011
+
+- Deduplication dataset taken from the [RecordLinkage](https://cran.r-project.org/package=RecordLinkage) R package developed by Murat Sariyar
+  and Andreas Borg. The package is licensed under the GPL-3 license. Also known as [RLdata10000](https://www.rdocumentation.org/packages/RecordLinkage/versions/0.4-12.4/topics/RLdata).
+
## License
-BlockingPy is released under [MIT license](https://github.com/T-Strojny/BlockingPy/blob/main/LICENSE).
+BlockingPy is released under [MIT license](https://github.com/ncn-foreigners/BlockingPy/blob/main/LICENSE).
## Issues
-Feel free to report any issues, bugs, suggestions with github issues [here](https://github.com/T-Strojny/BlockingPy/issues).
+Feel free to report any issues, bugs, or suggestions via GitHub issues [here](https://github.com/ncn-foreigners/BlockingPy/issues).
## Contributing
-Please see [CONTRIBUTING.md](https://github.com/T-Strojny/BlockingPy/blob/main/CONTRIBUTING.md) for more information.
+Please see [CONTRIBUTING.md](https://github.com/ncn-foreigners/BlockingPy/blob/main/CONTRIBUTING.md) for more information.
## Code of Conduct
You can find it [here](https://github.com/ncn-foreigners/BlockingPy/blob/main/CODE_OF_CONDUCT.md).
## Acknowledgements
-This package is based on the R [blocking](https://github.com/ncn-foreigners/blocking/tree/main) package developed by [BERENZ](https://github.com/BERENZ). Special thanks to the original author for his foundational work in this area.
+This package is based on the R [blocking](https://github.com/ncn-foreigners/blocking/tree/main) package developed by [BERENZ](https://github.com/BERENZ).
+
+## Funding
+
+Work on this package is supported by the National Science Centre, OPUS 20 grant no. 2020/39/B/HS4/00941 (Towards census-like statistics for foreign-born populations -- quality, data integration and estimation)
diff --git a/docs/_build/html/_sources/user_guide/basic_operations.md.txt b/docs/_build/html/_sources/user_guide/basic_operations.md.txt
index ed1d4a9..f4d4e5e 100644
--- a/docs/_build/html/_sources/user_guide/basic_operations.md.txt
+++ b/docs/_build/html/_sources/user_guide/basic_operations.md.txt
@@ -122,11 +122,15 @@ result = blocker.block(
)
evals = blocker.eval(
blocking_result=result,
- true_blocks=true_blocks
+ true_blocks=true_blocks,
+ batch_size=100 # (default is 1,000)
)
print(evals.metrics)
print(evals.confusion)
```
+
+Note: We recommend using the eval() method when evaluating larger datasets, since it allows you to set the batch size for currently evaluated record pairs.
+
### Example ground truth for record linkage
```python
diff --git a/docs/_build/html/api/blocker.html b/docs/_build/html/api/blocker.html
index 1ba5a25..a0944cc 100644
--- a/docs/_build/html/api/blocker.html
+++ b/docs/_build/html/api/blocker.html
@@ -233,6 +233,8 @@
1. Text data (pandas.Series)
2. Sparse matrices (scipy.sparse.csr_matrix) as a Document-Term Matrix (DTM)
3. Dense matrices (numpy.ndarray) as a Document-Term Matrix (DTM)
+
+For evaluation of larger datasets, we recommend using the separate eval() method
+since it allows you to set the batch size for evaluation.
For text data, additional preprocessing is performed using
the parameters in control_txt.
Evaluate blocking results against true block assignments and return new BlockingResult.
This method calculates evaluation metrics and confusion matrix
by comparing predicted blocks with known true blocks and returns
a new BlockingResult instance containing the evaluation results
-along with the original blocking results.
+along with the original blocking results. It allows you to set
+the batch size for evaluation of larger datasets.
Parameters:
@@ -263,6 +266,8 @@
true_blocks (pandas.DataFrame) – DataFrame with true block assignments
For deduplication: columns [‘x’, ‘block’]
For record linkage: columns [‘x’, ‘y’, ‘block’]
+
+batch_size (int) – Size of the batch for evaluation. This size is applied to both datasets
+for record linkage. Defaults to 1,000.
diff --git a/docs/changelog.md b/docs/changelog.md
index fa1f265..53abdcd 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,11 @@
# Changelog
+## v0.1.10
+- evaluation only for records that exist in true blocks.
+- default distance for `faiss` changed to `cosine`
+- code simplification
+- minor changes
+
## v0.1.9
- optimized evaluation part to allow batch processing
diff --git a/docs/examples/record_linkage.md b/docs/examples/record_linkage.md
index 27b8f32..24f0983 100644
--- a/docs/examples/record_linkage.md
+++ b/docs/examples/record_linkage.md
@@ -64,19 +64,19 @@ print(census.head())
print(cis.head())
-# PERSON_ID PERNAME1 PERNAME2 SEX DOB_DAY DOB_MON DOB_YEAR \
-# 0 NaN HAYDEN HALL M NaN 1 NaN
-# 1 NaN SEREN ANDERSON F 1.0 1 NaN
-# 2 NaN LEWIS LEWIS M 1.0 1 NaN
-# 3 NaN HARRISON POSTER M 5.0 1 NaN
-# 4 NaN MUHAMMED WATSUN M 7.0 1 NaN
+# PERSON_ID PERNAME1 PERNAME2 SEX DOB_DAY DOB_MON DOB_YEAR \
+# 0 PO827ER091001 HAYDEN HALL M NaN 1 NaN
+# 1 LS992DB024001 SEREN ANDERSON F 1.0 1 NaN
+# 2 M432ZZ053003 LEWIS LEWIS M 1.0 1 NaN
+# 3 SW75TQ018001 HARRISON POSTER M 5.0 1 NaN
+# 4 EX527TR017006 MUHAMMED WATSUN M 7.0 1 NaN
# ENUMCAP ENUMPC
# 0 91 CLARENCE ROAD PO827ER
# 1 24 CHURCH LANE LS992DB
# 2 53 CHURCH ROAD M432ZZ
# 3 19 HIGHFIELD ROAD SW75TG
-# 4 17 VICTORIA STREET NaN
+# 4 17 VICTORIA STREET NaN
print(census.shape)
# (25343, 9)
@@ -170,7 +170,7 @@ print(rec_lin_result)
# Blocking based on the hnsw method.
# Number of blocks: 23996
# Number of columns used for blocking: 1072
-# Reduction ratio: 1.0000
+# Reduction ratio: 0.999961
# ========================================================
# Distribution of the size of the blocks:
# Block Size | Number of Blocks
@@ -193,7 +193,7 @@ Let's take a look at the pair in block `0` :
print(cis.iloc[0, :])
print(census.iloc[17339, :])
-# PERSON_ID
+# PERSON_ID PO827ER091001
# PERNAME1 HAYDEN
# PERNAME2 HALL
# SEX M
@@ -203,9 +203,7 @@ print(census.iloc[17339, :])
# ENUMCAP 91 CLARENCE ROAD
# ENUMPC PO827ER
# txt HAYDENHALLMnan1nan91 CLARENCE ROADPO827ER
-# y 0
# Name: 0, dtype: object
-
# PERSON_ID PO827ER091001
# PERNAME1 HAYDEM
# PERNAME2 HALL
@@ -216,7 +214,7 @@ print(census.iloc[17339, :])
# ENUMCAP 91 CLARENCE ROAD
# ENUMPC PO827ER
# txt HAYDEMHALLM1.011957.091 CLARENCE ROADPO827ER
-# x 17339
+# Name: 17339, dtype: object
```
@@ -240,7 +238,11 @@ matches = pd.merge(
matches['block'] = range(len(matches))
matches.shape
-# (971, 4)
+# (24043, 4)
+```
+Let's sample 1,000 pairs on which to evaluate:
+```python
+matches = matches.sample(1000, random_state=42)
```
Now we can evaluate the algorithm:
diff --git a/tests/test_blocking.py b/tests/test_blocking.py
index 13224c4..21e46c1 100644
--- a/tests/test_blocking.py
+++ b/tests/test_blocking.py
@@ -280,7 +280,7 @@ def test_eval_basic_functionality(small_named_txt_data):
assert eval_result.method == result_no_eval.method
assert eval_result.deduplication == result_no_eval.deduplication
- assert eval_result.len_x == result_no_eval.len_x
+ assert eval_result.n_original_records == result_no_eval.n_original_records
pd.testing.assert_frame_equal(eval_result.result, result_no_eval.result)