Commit 0b13062

Python: Add ruff rules for asyncio and performance
1 parent a05b60c commit 0b13062

6 files changed (+17, -11 lines)

bindings/python/benches/test_tiktoken.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     os.environ["RAYON_NUM_THREADS"] = str(num_threads)
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
-    print(f"==============")
+    print("==============")
     print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
     filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
     mergeable_ranks = load_tiktoken_bpe(filename)
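
An f-string with no placeholders is just a constant string (the kind of pattern ruff's F541 flags), so dropping the unused f prefix is behavior-preserving. A one-line check, purely illustrative:

    assert f"==============" == "=============="  # no placeholders, so the f prefix adds nothing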

bindings/python/examples/using_the_visualizer.ipynb

Lines changed: 1 addition & 1 deletion
@@ -552,7 +552,7 @@
     }
    ],
    "source": [
-    "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
+    "funnyAnnotations = [{\"startPlace\": i, \"endPlace\": i + 3, \"theTag\": str(i)} for i in range(0, 20, 4)]\n",
     "funnyAnnotations"
    ]
   },
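
The notebook change follows the flake8-comprehensions (C4) preference for dict literals over dict() calls with keyword arguments (likely rule C408). A minimal sketch of the equivalence, reusing the notebook's annotation keys purely for illustration:

    # Both build the same list of annotation dicts; the literal form is what C4 suggests.
    with_call = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]
    with_literal = [{"startPlace": i, "endPlace": i + 3, "theTag": str(i)} for i in range(0, 20, 4)]
    assert with_call == with_literal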

bindings/python/py_src/tokenizers/tools/visualizer.py

Lines changed: 1 addition & 1 deletion
@@ -164,7 +164,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
     """
     if len(annotations) == 0:
         return {}
-    labels = set(map(lambda x: x.label, annotations))
+    labels = {x.label for x in annotations}
     num_labels = len(labels)
     h_step = int(255 / num_labels)
     if h_step < 20:
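
Replacing set(map(lambda ...)) with a set comprehension is the rewrite the C4 family suggests for map() over a lambda (likely rule C417); it produces the same set without the extra call layer. A self-contained sketch with a stand-in annotation type (the Annotation class below is hypothetical, not the visualizer's):

    from dataclasses import dataclass

    @dataclass
    class Annotation:  # stand-in with just the attribute the comprehension needs
        label: str

    annotations = [Annotation("PER"), Annotation("LOC"), Annotation("PER")]
    # Both expressions collect the same unique labels.
    assert set(map(lambda x: x.label, annotations)) == {x.label for x in annotations} == {"PER", "LOC"}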

bindings/python/pyproject.toml

Lines changed: 5 additions & 0 deletions
@@ -53,6 +53,11 @@ target-version = ["py35"]
 [tool.ruff]
 line-length = 119
 target-version = "py311"
+lint.extend-select = [
+    "ASYNC",
+    "C4",
+    "PERF",
+]
 lint.ignore = [
     # a == None in tests vs is None.
     "E711",

bindings/python/scripts/sentencepiece_extractor.py

Lines changed: 1 addition & 1 deletion
@@ -138,7 +138,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
 
             # Save content
             dump(vocab, vocab_f)
-            merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges))
+            merges_f.writelines((f"{x[0]} {x[1]}{linesep}" for x in merges))
         finally:
             # If model was downloaded from internet we need to cleanup the tmp folder.
             if hasattr(args, "remote_model") and exists(args.model):
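
The rewrite hands a generator expression straight to writelines, which accepts any iterable of strings, so each merge pair is formatted lazily instead of going through map() with a lambda. A minimal sketch with made-up merge pairs and an in-memory buffer:

    import io
    from os import linesep

    merges = [("he", "llo"), ("wor", "ld")]  # hypothetical merge pairs
    buf = io.StringIO()
    buf.writelines(f"{a} {b}{linesep}" for a, b in merges)
    assert buf.getvalue() == f"he llo{linesep}wor ld{linesep}"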

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 8 additions & 7 deletions
@@ -3,6 +3,7 @@
 import pytest
 import numpy as np
 import asyncio
+from time import perf_counter
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
 from tokenizers.models import BPE, Model, Unigram

@@ -341,7 +342,7 @@ def test_padding(self):
 
         # Can pad to the longest in a batch
         output = tokenizer.encode_batch(["my name", "my name is john"])
-        assert all([len(encoding) == 4 for encoding in output])
+        assert all(len(encoding) == 4 for encoding in output)
 
         # Can pad to the specified length otherwise
         tokenizer.enable_padding(length=4)

@@ -950,21 +951,21 @@ async def encode_async(_):
         # Measure sync performance with pre-initialized executor
         # Warm up
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(10)])
-        time.sleep(0.03)
+        asyncio.sleep(0.03)
         # Actual measurement
-        start = time.perf_counter()
+        start = perf_counter()
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(n_tasks)])
-        sync_time = time.perf_counter() - start
+        sync_time = perf_counter() - start
 
         # Measure async performance
         # Warm up
         await asyncio.gather(*[encode_async(i) for i in range(10)])
 
         # Actual measurement
-        time.sleep(0.03)
-        start = time.perf_counter()
+        asyncio.sleep(0.03)
+        start = perf_counter()
         await asyncio.gather(*[encode_async(i) for i in range(n_tasks)])
-        async_time = time.perf_counter() - start
+        async_time = perf_counter() - start
 
         # Log times
         print(f"sync vs async processing times: {sync_time:.4f}s vs {async_time:.4f}s for {n_tasks} tasks")
