
Commit

Merge pull request #11 from SunDoge/feat-use-uv
Feat use uv
SunDoge authored Dec 21, 2024
2 parents af37387 + 00110ac commit 643665c
Showing 18 changed files with 168 additions and 94 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
 *.model
 *.zip
 .vscode/
+__pycache__/
+*.so
13 changes: 0 additions & 13 deletions bytepiece-py/.gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion bytepiece-py/.python-version
@@ -1 +1 @@
-3.12.2
+3.11
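Pinning a minor version ("3.11") rather than an exact patch release ("3.12.2") lets uv pick any installed 3.11.x interpreter; uv reads .python-version the same way rye did.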
10 changes: 6 additions & 4 deletions bytepiece-py/Cargo.toml
@@ -1,13 +1,15 @@
 [package]
-name = "bytepiece-py"
+name = "bytepiece_py"
 version = "0.1.0"
 edition = "2021"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [lib]
-name = "bytepiece_py"
+name = "_core"
+# "cdylib" is necessary to produce a shared library for Python to import from.
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = "0.21"
+# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
+# "abi3-py39" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.9
+pyo3 = { version = "0.23.3", features = ["extension-module", "abi3-py39"] }
 bytepiece = { workspace = true }
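Renaming the library target to `_core` and enabling `abi3-py39` means maturin now builds a single stable-ABI extension (e.g. `_core.abi3.so` on Linux) that works on every CPython from 3.9 up, instead of one wheel per interpreter version. A minimal import smoke test, assuming the extension has already been built into the active environment with `maturin develop`:

# Smoke test (assumes `maturin develop` or an installed wheel has placed
# the compiled extension inside the bytepiece_py package).
from bytepiece_py import _core

print(_core.__name__)  # expected: "bytepiece_py._core"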
4 changes: 1 addition & 3 deletions bytepiece-py/README.md
@@ -1,3 +1 @@
-# bytepiece-py
-
-Describe your project here.
+?
28 changes: 18 additions & 10 deletions scripts/bench.py → bytepiece-py/examples/bench.py
@@ -1,4 +1,11 @@
-import bytepiece
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "rs-bytepiece",
+#     "bytepiece-py",
+# ]
+# ///
+# import bytepiece
 import timeit
 import bytepiece_py
 import rs_bytepiece
@@ -7,21 +14,22 @@
 TEXT = "BytePiece是一个Byte-based的Unigram分词器,纯Python实现,更加易读和易拓展。由于采用了新的训练算法,所以压缩率通常比现有Tokenizer更高,同时支持多进程加速训练。此外,它直接操作文本的UTF-8 Bytes,几乎不进行任何的预处理,所以更加纯粹和语言无关。"
 
 
-MODEL = "models/bytepiece_80k.model"
+MODEL = "../models/bytepiece_80k.model"
 
-t1 = bytepiece.Tokenizer(MODEL)
+# t1 = bytepiece.Tokenizer(MODEL)
 t2 = bytepiece_py.Tokenizer(MODEL)
 t3 = rs_bytepiece.Tokenizer(MODEL)
 
-assert t1.encode(TEXT) == t2.encode(TEXT)
-assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT))
-print(t1.encode(TEXT))
+# assert t1.encode(TEXT) == t2.encode(TEXT)
+# assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT))
+# print(t1.encode(TEXT))
 print(t2.encode(TEXT))
 print(t3.encode(TEXT))
 
 
-print('bytepiece:')
-print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000))
-print('bytepiece-py (ours)')
+# print('bytepiece:')
+# print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000))
+print("bytepiece-py (ours)")
 print(timeit.timeit("t2.encode(TEXT)", globals=globals(), number=10000))
-print('rs-bytepiece')
+print("rs-bytepiece")
 print(timeit.timeit("t3.encode(TEXT)", globals=globals(), number=10000))
29 changes: 13 additions & 16 deletions bytepiece-py/pyproject.toml
@@ -1,28 +1,25 @@
 [project]
 name = "bytepiece-py"
-version = "0.2.1"
+version = "0.1.1"
 description = "Add your description here"
+readme = "README.md"
 authors = [
     { name = "SunDoge", email = "[email protected]" }
 ]
+requires-python = ">=3.8"
 dependencies = []
-readme = "README.md"
-requires-python = ">= 3.8"
+
+[tool.maturin]
+module-name = "bytepiece_py._core"
+python-packages = ["bytepiece_py"]
+python-source = "src"
 
 [build-system]
-requires = ["maturin>=1.2,<2.0"]
+requires = ["maturin>=1.0,<2.0"]
 build-backend = "maturin"
 
-[tool.rye]
-managed = true
-dev-dependencies = [
-    "pip>=24.0",
+[dependency-groups]
+dev = [
+    "maturin>=1.7.8",
     "rs-bytepiece>=0.2.2",
 ]
-
-[tool.maturin]
-python-source = "python"
-module-name = "bytepiece_py._lowlevel"
-features = ["pyo3/extension-module"]
-
-[tool.rye.scripts]
-dev = "maturin develop --skip-install"
11 changes: 0 additions & 11 deletions bytepiece-py/requirements-dev.lock

This file was deleted.

10 changes: 0 additions & 10 deletions bytepiece-py/requirements.lock

This file was deleted.

@@ -1,7 +1,7 @@
 import unicodedata
 from typing import Dict, List, Tuple, Union
 
-from bytepiece_py import _lowlevel
+from bytepiece_py import _core
 
 
 def normalize(text: str) -> bytes:
@@ -11,9 +11,9 @@ def normalize(text: str) -> bytes:
 class Tokenizer:
     def __init__(self, pieces: Union[str, Dict[str, Tuple[str, int, str]]]) -> None:
         if isinstance(pieces, str):
-            self._tokenizer = _lowlevel._Tokenizer.from_path(pieces)
+            self._tokenizer = _core._Tokenizer.from_path(pieces)
         else:
-            self._tokenizer = _lowlevel._Tokenizer(pieces)
+            self._tokenizer = _core._Tokenizer(pieces)
 
     def encode(
         self,
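With the low-level module renamed from `_lowlevel` to `_core`, the public wrapper keeps the same surface. A short usage sketch ("models/bytepiece_80k.model" is a placeholder path; the encode/decode calls mirror examples/bench.py above):

# Usage sketch; the model path is a placeholder.
from bytepiece_py import Tokenizer

tok = Tokenizer("models/bytepiece_80k.model")
ids = tok.encode("BytePiece is a byte-based unigram tokenizer.")
print(ids)              # token ids
print(tok.decode(ids))  # round-trips to the original text's bytes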
3 changes: 3 additions & 0 deletions bytepiece-py/src/bytepiece_py/_core.pyi
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+def hello_from_bin() -> str: ...
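The committed stub still carries the project template's `hello_from_bin` placeholder, while lib.rs actually exports `_Tokenizer`. A stub that mirrors src/tokenizer.rs would look roughly like the sketch below; the signatures are inferred from the Rust code in this diff, so treat them as assumptions:

from __future__ import annotations

class _Tokenizer:
    # Signatures inferred from src/tokenizer.rs; treat as assumptions.
    def __new__(cls, pieces: dict[str, tuple[str, int, str]]) -> _Tokenizer: ...
    @staticmethod
    def from_path(path: str) -> _Tokenizer: ...
    def tokenize(self, text: str, alpha: float = -1.0) -> list[bytes]: ...
    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False, alpha: float = -1.0) -> list[int]: ...
    def decode(self, ids: list[int]) -> bytes: ...
    def id_to_piece(self, id: int) -> bytes: ...
    def piece_to_id(self, piece: bytes) -> int: ...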
Empty file.
9 changes: 5 additions & 4 deletions bytepiece-py/src/lib.rs
@@ -2,11 +2,12 @@ mod error;
 mod tokenizer;
 
 use pyo3::prelude::*;
-use tokenizer::_Tokenizer;
 
-/// A Python module implemented in Rust.
+/// A Python module implemented in Rust. The name of this function must match
+/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
+/// import the module.
 #[pymodule]
-fn _lowlevel(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<_Tokenizer>()?;
+fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<tokenizer::_Tokenizer>()?;
     Ok(())
 }
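The expanded doc comment states the key constraint: the `#[pymodule]` function's name becomes the exported `PyInit_*` symbol, so it must equal `lib.name` from Cargo.toml (here `_core`); maturin's `module-name = "bytepiece_py._core"` then controls where that extension file lands inside the wheel.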
9 changes: 3 additions & 6 deletions bytepiece-py/src/tokenizer.rs
@@ -37,10 +37,7 @@ impl _Tokenizer {
     ) -> Vec<Bound<'py, PyBytes>> {
         let bs = text.as_bytes();
         let tokens = py.allow_threads(|| self.inner.tokenize(&bs, alpha));
-        tokens
-            .into_iter()
-            .map(|bs| PyBytes::new_bound(py, bs))
-            .collect()
+        tokens.into_iter().map(|bs| PyBytes::new(py, bs)).collect()
     }
 
     #[pyo3(signature = (text, add_bos = false, add_eos = false, alpha = -1.0))]
@@ -58,11 +55,11 @@ impl _Tokenizer {
 
     pub fn decode<'py>(&self, py: Python<'py>, ids: Vec<usize>) -> Result<Bound<'py, PyBytes>> {
         let res = py.allow_threads(|| self.inner.decode(&ids))?;
-        Ok(PyBytes::new_bound(py, &res))
+        Ok(PyBytes::new(py, &res))
     }
 
     pub fn id_to_piece<'py>(&self, py: Python<'py>, id: usize) -> Bound<'py, PyBytes> {
-        PyBytes::new_bound(py, self.inner.id_to_piece(id))
+        PyBytes::new(py, self.inner.id_to_piece(id))
     }
 
     pub fn piece_to_id(&self, piece: &Bound<PyBytes>) -> usize {
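The `PyBytes::new_bound` → `PyBytes::new` changes follow from the pyo3 0.23 bump in Cargo.toml: 0.23 removed the legacy GIL-Ref API, and the `*_bound` constructors took back the plain names while still returning `Bound<'py, PyBytes>`.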
