-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwestoxon_council_tax.R
94 lines (76 loc) · 3.17 KB
/
westoxon_council_tax.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# ------------------------------------------------------------
# Task: Extract and Process Council Tax Data from a PDF
# Purpose: Extract data tables from a PDF document, clean and format them,
# and then save the result to a CSV file.
# Steps:
# 1. Read and extract tables from the specified PDF file.
# 2. Initialize field names for the final dataframe.
# 3. Loop through each extracted table to clean and format data.
# 4. Handle parish/town column naming and format band columns.
# 5. Merge cleaned data into a final dataframe and save it as a CSV.
# ------------------------------------------------------------
# Load necessary libraries
library(tabulizer)
library(dplyr)
# Specify the path to the PDF file
pdf_file <- "./wodc-council-tax-charges-2024-to-2025.pdf"
# Extract tables from all pages of the PDF file
tables <- extract_tables(pdf_file, pages = "all", guess = TRUE)
# Initialize column names for the final dataframe
fieldnames <- c('name', 'Council', 'Band A (6/9)', 'Band B (7/9)',
'Band C (8/9)', 'Band D (9/9)', 'Band E (11/9)',
'Band F (13/9)', 'Band G (15/9)', 'Band H (18/9)')
# Create an empty dataframe with the specified column names
final_df <- data.frame(matrix(ncol = length(fieldnames), nrow = 0))
colnames(final_df) <- fieldnames
# Assume the 'Council' column value for all rows is 'West Oxfordshire District Council'
council_name <- 'West Oxfordshire District Council'
# Loop through each extracted table to process and clean the data
for (i in seq_along(tables)) {
table <- tables[[i]]
# Convert the table into a dataframe
df <- as.data.frame(table, stringsAsFactors = FALSE)
# Skip if the table is empty
if (nrow(df) == 0) {
next
}
# Set the first row as column names
colnames(df) <- df[1, ]
df <- df[-1, ] # Remove the first row
# Remove extra spaces from column names and data
colnames(df) <- trimws(colnames(df))
df <- df %>% mutate(across(everything(), ~ trimws(.)))
# Rename the 'Parish/Town' column to 'name'
if ('Parish/Town' %in% colnames(df)) {
df <- df %>% rename(name = 'Parish/Town')
} else if ('Parish' %in% colnames(df)) {
df <- df %>% rename(name = 'Parish')
} else {
# If no appropriate column is found, skip this table
next
}
# Add the 'Council' column
df$Council <- council_name
# Rename the band columns to match the specified fieldnames
band_columns <- colnames(df)[grepl("^Band", colnames(df))]
band_mapping <- c(
'Band A' = 'Band A (6/9)',
'Band B' = 'Band B (7/9)',
'Band C' = 'Band C (8/9)',
'Band D' = 'Band D (9/9)',
'Band E' = 'Band E (11/9)',
'Band F' = 'Band F (13/9)',
'Band G' = 'Band G (15/9)',
'Band H' = 'Band H (18/9)'
)
# Apply the column renaming
df <- df %>% rename_at(vars(band_columns), ~ band_mapping[.])
# Select and arrange columns to match the specified fieldnames
df <- df %>% select(all_of(fieldnames))
# Append the cleaned dataframe to the final dataframe
final_df <- bind_rows(final_df, df)
}
# Save the final dataframe to a CSV file
output_csv <- "./wodc_council_tax_data.csv"
write.csv(final_df, output_csv, row.names = FALSE)
cat("Data successfully saved to", output_csv, "\n")