From 03e65ad1966b6b704e552ae069483a652ebfbd15 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 16 Sep 2024 15:19:16 -0400 Subject: [PATCH] Add more complex v4 methods and converters --- python/akimbo_ip/accessor.py | 148 +++++++++++++++++++++++++++++++-- src/lib.rs | 157 ++++++++++++++++++++++++++++++++--- tests/test_core.py | 23 ++++- 3 files changed, 304 insertions(+), 24 deletions(-) diff --git a/python/akimbo_ip/accessor.py b/python/akimbo_ip/accessor.py index 3a34ce1..ccc3ea0 100644 --- a/python/akimbo_ip/accessor.py +++ b/python/akimbo_ip/accessor.py @@ -1,5 +1,6 @@ import ipaddress import functools +from types import UnionType import awkward as ak import numpy as np @@ -22,6 +23,11 @@ def match_ip6(arr): return arr.is_regular and arr.size == 16 and arr.content.is_leaf and arr.content.dtype.itemsize == 1 +def match_ip(arr): + """matches either v4 or v6 IPs""" + return match_ip4(arr) or match_ip6(arr) + + def match_prefix(arr): """A network prefix is always one byte""" return arr.is_leaf and arr.dtype.itemsize == 1 @@ -38,12 +44,21 @@ def match_net4(arr, address="address", prefix="prefix"): def match_net6(arr, address="address", prefix="prefix"): + """Matches a record with IP6 field and prefix field (u8)""" return ( arr.is_record and {address, prefix}.issubset(arr.fields) and match_ip6(arr[address]) and match_prefix(arr[prefix]) ) + + +def match_list_net4(arr, address="address", prefix="prefix"): + """Matches lists of ip4 network records""" + if arr.is_list: + cont = arr.content.content if arr.content.is_option else arr.content + return match_net4(cont) + return False def match_stringlike(arr): @@ -101,18 +116,103 @@ def contains4(nets, other, address="address", prefix="prefix"): def hosts4(nets, address="address", prefix="prefix"): - arr = nets[address] - if arr.is_leaf: - arr = arr.data.astype("uint32") - else: - # fixed bytestring or 4 * uint8 regular - arr = arr.content.data.view("uint32") + arr, = to_ip4(nets[address]) ips, offsets = 
lib.hosts4(arr, nets[prefix].data.astype("uint8")) return ak.contents.ListOffsetArray( ak.index.Index64(offsets), utils.u8_to_ip4(ips) ) +def network4(nets, address="address", prefix="prefix"): + arr, = to_ip4(nets[address]) + out = lib.network4(arr, nets[prefix].data.astype("uint8")) + return utils.u8_to_ip4(out) + + +def broadcast4(nets, address="address", prefix="prefix"): + arr, = to_ip4(nets[address]) + out = lib.broadcast4(arr, nets[prefix].data.astype("uint8")) + return utils.u8_to_ip4(out) + + +def hostmask4(nets, address="address", prefix="prefix"): + out = lib.hostmask4(nets[prefix].data.astype("uint8")) + return utils.u8_to_ip4(out) + + +def netmask4(nets, address="address", prefix="prefix"): + out = lib.netmask4(nets[prefix].data.astype("uint8")) + return utils.u8_to_ip4(out) + + +def trunc4(nets, address="address", prefix="prefix"): + arr, = to_ip4(nets[address]) + out = lib.trunc4(arr, nets[prefix].data.astype("uint8")) + return ak.contents.RecordArray( + [utils.u8_to_ip4(out), nets[prefix]], + fields=[address, prefix] + ) + + +def supernet4(nets, address="address", prefix="prefix"): + arr, = to_ip4(nets[address]) + out = lib.supernet4(arr, nets[prefix].data.astype("uint8")) + return ak.contents.RecordArray( + [utils.u8_to_ip4(out), ak.contents.NumpyArray(nets[prefix].data - 1)], + fields=[address, prefix] + ) + + +def subnets4(nets, new_prefix, address="address", prefix="prefix"): + arr, = to_ip4(nets[address]) + out, offsets = lib.subnets4(arr, nets[prefix].data.astype("uint8"), new_prefix) + addr = utils.u8_to_ip4(out) + return ak.contents.ListOffsetArray( + ak.index.Index64(offsets), + ak.contents.RecordArray( + [addr, + ak.contents.NumpyArray(np.full((len(addr), ), new_prefix, dtype="uint8"))], + fields=[address, prefix] + ), + ) + + +def aggregate4(net_lists, address="address", prefix="prefix"): + offsets = net_lists.offsets.data.astype("uint64") + cont = net_lists.content.content if net_lists.content.is_option else net_lists.content + arr, = 
to_ip4(cont[address]) + out_addr, out_pref, counts = lib.aggregate4(arr, offsets, cont[prefix].data) + # TODO: reassemble optional if input net_lists was list[optional[networks]] + return ak.contents.ListOffsetArray( + ak.index.Index64(counts), + ak.contents.RecordArray( + [utils.u8_to_ip4(out_addr), ak.contents.NumpyArray(out_pref)], + fields=[address, prefix] + ) + ) + + +def to_int_list(arr): + if (arr.is_leaf and arr.dtype.itemsize == 4): + out = ak.contents.RegularArray( + ak.contents.NumpyArray(arr.data.view('uint8')), + size=4 + ) + else: + out = ak.copy(arr) + out.parameters.pop('__array__') + return out + + +def to_bytestring(arr): + if (arr.is_leaf and arr.dtype.itemsize == 4): + out = utils.u8_to_ip4(arr) + else: + out = ak.copy(arr) + out.parameters['__array__'] = "bytestring" + out.content.parameters["__array__"] = "byte" + return out + def to_ip4(arr): if arr.is_leaf: @@ -121,7 +221,6 @@ def to_ip4(arr): # bytestring or 4 * uint8 regular return arr.content.data.view("uint32"), - def to_ip6(arr): # always pass as bytes, and assume length is mod 16 in rust return arr.content.data.view("uint8"), @@ -139,6 +238,33 @@ class IPAccessor: def __init__(self, accessor) -> None: self.accessor = accessor + # TODO: bitwise_or and bitwise_and methods and their overrides + def __eq__(self, other): + arr = self.accessor.array + if isinstance(other, (str, int)): + arr2 = ak.Array([ipaddress.ip_address(other).packed]) + + return self.accessor.to_output(arr == arr2) + else: + raise ValueError + + def bitwise_or(self, other): + raise NotImplementedError("Will allow arr[ip] | mask") + + __or__ = bitwise_or + def __ror__(self, value): + return self.__or__(value) + + def bitwise_and(self, other): + raise NotImplementedError("Will allow arr[ip] & mask") + + __and__ = bitwise_and + def __rand__(self, value): + return self.__and__(value) + + to_int_list = dec(to_int_list, inmode="ak", match=match_ip) + to_bytestring = dec(to_bytestring, inmode="ak", match=match_ip) + is_unspecified4 = 
dec_ip(lib.is_unspecified4) is_broadcast4 = dec_ip(lib.is_broadcast4) is_global4 = dec_ip(lib.is_global4) @@ -155,6 +281,14 @@ def __init__(self, accessor) -> None: parse_address4 = dec(parse_address4, inmode="ak", match=match_stringlike) parse_net4 = dec(parse_net4, inmode="ak", match=match_stringlike) + network4 = dec(network4, inmode="ak", match=match_net4) + hostmask4 = dec(hostmask4, inmode="ak", match=match_net4) + netmask4 = dec(netmask4, inmode="ak", match=match_net4) + broadcast4 = dec(broadcast4, inmode="ak", match=match_net4) + trunc4 = dec(trunc4, inmode="ak", match=match_net4) + supernet4 = dec(supernet4, inmode="ak", match=match_net4) + subnets4 = dec(subnets4, inmode="ak", match=match_net4) + aggregate4 = dec(aggregate4, inmode="ak", match=match_list_net4) contains4 = dec(contains4, inmode="ak", match=match_net4) diff --git a/src/lib.rs b/src/lib.rs index e9611a0..ecf7fd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,27 +6,17 @@ use std::net::Ipv6Addr; use std::str::{self, FromStr}; use ipnet::Ipv4Net; use numpy::pyo3::Python; -use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; +use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1, PyUntypedArrayMethods}; pub fn netmask_to_prefix4(mask: u32) -> u8 { mask.leading_ones() as u8 } -pub fn prefix_to_netmask4(prefix: u8) -> u32 { - // TODO: check for prefix >= 32 .checked_shl(prefix).unwrap_or(0) - 0xffffffff << prefix -} - pub fn netmask_to_prefix6(mask: u128) -> u8 { mask.leading_ones() as u8 } -pub fn prefix_to_netmask6(prefix: u8) -> u128 { - // TODO: check for prefix >= 128 .checked_shl(prefix).unwrap_or(0) - 0xffffffffffffffffffffffffffffffff << prefix -} - #[pyfunction] fn to_text4<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u32>) @@ -122,12 +112,12 @@ fn contains_one4<'py>(py: Python<'py>, } +// list of IP4 addresses indicated by each network #[pyfunction] fn hosts4<'py>(py: Python<'py>, addr: PyReadonlyArray1<'py, u32>, pref: PyReadonlyArray1<'py, u8>, ) -> PyResult<(Bound<'py, PyArray1>, 
Bound<'py, PyArray1>)> { -// returns IP4 data as uint32 and array of offsets (same length as input) let mut out: Vec = Vec::new(); let mut offsets: Vec = Vec::from([0]); for (&add, &pre) in addr.as_array().iter().zip(pref.as_array()) { @@ -138,6 +128,139 @@ fn hosts4<'py>(py: Python<'py>, Ok((out.into_pyarray_bound(py), offsets.into_pyarray_bound(py))) } +/// the hostmask implied by the given network prefix +#[pyfunction] +fn hostmask4<'py>(py: Python<'py>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + let out: Vec<u32> = pref.as_array().iter().map( + |x| u32::max_value() >> x + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + + +/// the netmask implied by the given network prefix +#[pyfunction] +fn netmask4<'py>(py: Python<'py>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + // TODO: handle prefix == 0 (shift by 32 overflows) and prefix > 32, e.g. .checked_shl(32 - prefix).unwrap_or(0) + let out: Vec<u32> = pref.as_array().iter().map( + |x| u32::max_value() << (32 - x) + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + +/// the base network address of the given network values +#[pyfunction] +fn network4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + let out: Vec<u32> = addr.as_array().iter().zip(pref.as_array().iter()).map( + | (&add, &pre) | Ipv4Net::new(Ipv4Addr::from_bits(add), pre).unwrap().network().to_bits() + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + + +/// the highest address of the given network values +#[pyfunction] +fn broadcast4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + let out: Vec<u32> = addr.as_array().iter().zip(pref.as_array().iter()).map( + | (&add, &pre) | Ipv4Net::new(Ipv4Addr::from_bits(add), pre).unwrap().broadcast().to_bits() + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn trunc4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + let out: Vec<u32> = 
addr.as_array().iter().zip(pref.as_array().iter()).map( + | (&add, &pre) | Ipv4Net::new(Ipv4Addr::from_bits(add), pre).unwrap().trunc().addr().to_bits() + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn supernet4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<Bound<'py, PyArray1<u32>>> { + let out: Vec<u32> = addr.as_array().iter().zip(pref.as_array().iter()).map( + | (&add, &pre) | Ipv4Net::new(Ipv4Addr::from_bits(add), pre).unwrap().supernet().unwrap().addr().to_bits() + ).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn subnets4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + pref: PyReadonlyArray1<'py, u8>, + new_pref: u8 +) -> PyResult<(Bound<'py, PyArray1<u32>>, Bound<'py, PyArray1<u64>>)> { + let mut out: Vec<u32> = Vec::new(); + let mut counts: Vec<u64> = Vec::with_capacity(pref.len()); + let mut count: u64 = 0; + counts.push(0); + addr.as_array().iter().zip(pref.as_array().iter()).for_each( + | (&add, &pre) | { + Ipv4Net::new(Ipv4Addr::from_bits(add), pre).unwrap().subnets(new_pref).unwrap().for_each( + |x|{ + count += 1; + out.push(x.addr().to_bits()) + } + ); + counts.push(count); + } + + ); + Ok((out.into_pyarray_bound(py), counts.into_pyarray_bound(py))) +} + +#[pyfunction] +fn aggregate4<'py>(py: Python<'py>, + addr: PyReadonlyArray1<'py, u32>, + offsets: PyReadonlyArray1<'py, u64>, + pref: PyReadonlyArray1<'py, u8>, +) -> PyResult<(Bound<'py, PyArray1<u32>>, Bound<'py, PyArray1<u8>>, Bound<'py, PyArray1<u64>>)> { + let mut out_addr: Vec<u32> = Vec::new(); + let mut out_pref: Vec<u8> = Vec::new(); + let mut counts: Vec<u64> = Vec::with_capacity(pref.len()); + let mut count: u64 = 0; + let mut count_in: u64 = 0; + let mut networks: Vec<Ipv4Net> = Vec::new(); + + let off_arr = offsets.as_array(); + let offs = off_arr.as_slice().unwrap(); + let ad_arr = addr.as_array(); + let mut ad_slice = ad_arr.as_slice().unwrap().iter(); + let pr_arr = pref.as_array(); + let mut pr_slice = pr_arr.as_slice().unwrap().iter(); + + for w in 
offs { + networks.clear(); + while count_in < *w { + networks.push(Ipv4Net::new(Ipv4Addr::from_bits(*ad_slice.next().unwrap()), *pr_slice.next().unwrap()).unwrap()); + count_in += 1; + }; + Ipv4Net::aggregate(&networks).iter().for_each( + |x| { + out_addr.push(x.addr().to_bits()); + out_pref.push(x.prefix_len()); + count += 1; + }); + counts.push(count); + } + Ok((out_addr.into_pyarray_bound(py), out_pref.into_pyarray_bound(py), counts.into_pyarray_bound(py))) +} + #[pyfunction] fn is_broadcast4<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u32>) -> PyResult>> { @@ -287,7 +410,7 @@ fn is_unspecified6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResu #[pyfunction] fn to_ipv6_mapped<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u32>) -> PyResult>> { - let mut out: Vec = Vec::with_capacity(x.len().unwrap() * 16); + let mut out: Vec = Vec::with_capacity(x.len() * 16); for &x in x.as_array().iter() { let bit = Ipv4Addr::from(x).to_ipv6_mapped().octets(); out.extend(bit); @@ -314,6 +437,14 @@ fn akimbo_ip(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(contains_one4, m)?)?; m.add_function(wrap_pyfunction!(to_ipv6_mapped, m)?)?; m.add_function(wrap_pyfunction!(hosts4, m)?)?; + m.add_function(wrap_pyfunction!(hostmask4, m)?)?; + m.add_function(wrap_pyfunction!(netmask4, m)?)?; + m.add_function(wrap_pyfunction!(network4, m)?)?; + m.add_function(wrap_pyfunction!(broadcast4, m)?)?; + m.add_function(wrap_pyfunction!(trunc4, m)?)?; + m.add_function(wrap_pyfunction!(supernet4, m)?)?; + m.add_function(wrap_pyfunction!(subnets4, m)?)?; + m.add_function(wrap_pyfunction!(aggregate4, m)?)?; m.add_function(wrap_pyfunction!(is_benchmarking6, m)?)?; m.add_function(wrap_pyfunction!(is_documentation6, m)?)?; diff --git a/tests/test_core.py b/tests/test_core.py index 8572a39..72fc2d5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2,10 +2,6 @@ import pandas as pd import pytest -import akimbo.pandas # registers .ak on pandas - -import 
akimbo_ip # registers .ip subaccessor - bytestring4 = pd.ArrowDtype(pa.binary(4)) bytestring16 = pd.ArrowDtype(pa.binary(16)) @@ -35,6 +31,25 @@ def test_simple6(): assert out3[1] == s1[1] +def test_to_lists(): + s1 = pd.Series([b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"], + dtype=bytestring16) + out = s1.ak.ip.to_int_list() + assert out.to_list() == [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] + ] + out2 = out.ak.ip.to_bytestring() + assert s1.to_list() == out2.to_list() + + s2 = pd.Series([0, 1], dtype="uint32") + out = s2.ak.ip.to_int_list() + assert out.to_list() == [[0, 0, 0, 0], [1, 0, 0, 0]] + out2 = out.ak.ip.to_bytestring() + assert out2.to_list() == [b'\x00\x00\x00\x00', b'\x01\x00\x00\x00'] + + def test_nested(): s = pd.DataFrame({"a": [0], "b": [0]}).ak.merge() out = s.ak.ip.is_global4(where="b")