Skip to content

Commit ec32ec9

Browse files
committed
Update KVS and its clients [WIP]
1 parent 23ac4bc commit ec32ec9

13 files changed

+448
-178
lines changed

src/crawlee/_utils/file.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import asyncio
44
import contextlib
5-
import io
65
import json
76
import mimetypes
87
import os
@@ -83,21 +82,6 @@ def determine_file_extension(content_type: str) -> str | None:
8382
return ext[1:] if ext is not None else ext
8483

8584

86-
def is_file_or_bytes(value: Any) -> bool:
87-
"""Determine if the input value is a file-like object or bytes.
88-
89-
This function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).
90-
The method is simplified for common use cases and may not cover all edge cases.
91-
92-
Args:
93-
value: The value to be checked.
94-
95-
Returns:
96-
True if the value is either a file-like object or bytes, False otherwise.
97-
"""
98-
return isinstance(value, (bytes, bytearray, io.IOBase))
99-
100-
10185
async def json_dumps(obj: Any) -> str:
10286
"""Serialize an object to a JSON-formatted string with specific settings.
10387

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def item_count(self) -> int:
5656
@abstractmethod
5757
async def open(
5858
cls,
59+
*,
5960
id: str | None,
6061
name: str | None,
6162
storage_dir: Path,
@@ -82,7 +83,7 @@ async def drop(self) -> None:
8283
"""
8384

8485
@abstractmethod
85-
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
86+
async def push_data(self, *, data: list[Any] | dict[str, Any]) -> None:
8687
"""Push data to the dataset.
8788
8889
The backend method for the `Dataset.push_data` call.

src/crawlee/storage_clients/_base/_key_value_store_client.py

Lines changed: 79 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,103 +6,139 @@
66
from crawlee._utils.docs import docs_group
77

88
if TYPE_CHECKING:
9-
from contextlib import AbstractAsyncContextManager
9+
from datetime import datetime
10+
from pathlib import Path
1011

11-
from httpx import Response
12+
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreRecord
1213

13-
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
14+
# Properties:
15+
# - id
16+
# - name
17+
# - created_at
18+
# - accessed_at
19+
# - modified_at
20+
21+
# Methods:
22+
# - open
23+
# - drop
24+
# - get_value
25+
# - set_value
26+
# - delete_value
27+
# - iterate_keys
28+
# - get_public_url
1429

1530

1631
@docs_group('Abstract classes')
1732
class KeyValueStoreClient(ABC):
18-
"""An abstract class for key-value store resource clients.
33+
"""An abstract class for key-value store (KVS) resource clients.
1934
2035
These clients are specific to the type of resource they manage and operate under a designated storage
2136
client, like a memory storage client.
2237
"""
2338

39+
@property
2440
@abstractmethod
25-
async def get(self) -> KeyValueStoreMetadata | None:
26-
"""Get metadata about the key-value store being managed by this client.
27-
28-
Returns:
29-
An object containing the key-value store's details, or None if the key-value store does not exist.
30-
"""
41+
def id(self) -> str:
42+
"""The ID of the key-value store."""
3143

44+
@property
3245
@abstractmethod
33-
async def delete(self) -> None:
34-
"""Permanently delete the key-value store managed by this client."""
46+
def name(self) -> str | None:
47+
"""The name of the key-value store."""
3548

49+
@property
3650
@abstractmethod
37-
async def list_keys(
38-
self,
39-
*,
40-
limit: int = 1000,
41-
exclusive_start_key: str | None = None,
42-
) -> KeyValueStoreListKeysPage:
43-
"""List the keys in the key-value store.
51+
def created_at(self) -> datetime:
52+
"""The time at which the key-value store was created."""
4453

45-
Args:
46-
limit: Number of keys to be returned. Maximum value is 1000.
47-
exclusive_start_key: All keys up to this one (including) are skipped from the result.
54+
@property
55+
@abstractmethod
56+
def accessed_at(self) -> datetime:
57+
"""The time at which the key-value store was last accessed."""
4858

49-
Returns:
50-
The list of keys in the key-value store matching the given arguments.
51-
"""
59+
@property
60+
@abstractmethod
61+
def modified_at(self) -> datetime:
62+
"""The time at which the key-value store was last modified."""
5263

64+
@classmethod
5365
@abstractmethod
54-
async def get_record(self, key: str) -> KeyValueStoreRecord | None:
55-
"""Retrieve the given record from the key-value store.
66+
async def open(
67+
cls,
68+
*,
69+
id: str | None,
70+
name: str | None,
71+
storage_dir: Path,
72+
) -> KeyValueStoreClient:
73+
"""Open existing or create a new key-value store client.
74+
75+
If a key-value store with the given name already exists, the appropriate key-value store client is returned.
76+
Otherwise, a new key-value store is created and a client for it is returned.
5677
5778
Args:
58-
key: Key of the record to retrieve.
79+
id: The ID of the key-value store.
80+
name: The name of the key-value store.
81+
storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
5982
6083
Returns:
61-
The requested record, or None, if the record does not exist
84+
A key-value store client.
6285
"""
6386

6487
@abstractmethod
65-
async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord[bytes] | None:
66-
"""Retrieve the given record from the key-value store, without parsing it.
88+
async def drop(self) -> None:
89+
"""Drop the whole key-value store and remove all its items.
6790
68-
Args:
69-
key: Key of the record to retrieve.
70-
71-
Returns:
72-
The requested record, or None, if the record does not exist
91+
The backend method for the `KeyValueStore.drop` call.
7392
"""
7493

7594
@abstractmethod
76-
async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]:
77-
"""Retrieve the given record from the key-value store, as a stream.
95+
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
96+
"""Retrieve the given record from the key-value store.
7897
7998
Args:
8099
key: Key of the record to retrieve.
81100
82101
Returns:
83-
The requested record as a context-managed streaming Response, or None, if the record does not exist
102+
The requested record, or None if the record does not exist.
84103
"""
85104

86105
@abstractmethod
87-
async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
106+
async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
88107
"""Set a value to the given record in the key-value store.
89108
90109
Args:
91110
key: The key of the record to save the value to.
92111
value: The value to save into the record.
93-
content_type: The content type of the saved value.
112+
content_type: The MIME content type string. If not provided, it is inferred from the value.
94113
"""
95114

96115
@abstractmethod
97-
async def delete_record(self, key: str) -> None:
116+
async def delete_value(self, *, key: str) -> None:
98117
"""Delete the specified record from the key-value store.
99118
100119
Args:
101120
key: The key of the record to delete.
102121
"""
103122

104123
@abstractmethod
105-
async def get_public_url(self, key: str) -> str:
124+
async def iterate_keys(
125+
self,
126+
*,
127+
exclusive_start_key: str | None = None,
128+
limit: int = 1000,
129+
) -> KeyValueStoreListKeysPage:
130+
"""List the keys in the key-value store.
131+
132+
Args:
133+
exclusive_start_key: All keys up to and including this one are skipped from the result.
134+
limit: Number of keys to be returned.
135+
136+
Returns:
137+
The list of keys in the key-value store matching the given arguments.
138+
"""
139+
140+
@abstractmethod
141+
async def get_public_url(self, *, key: str) -> str:
106142
"""Get the public URL for the given key.
107143
108144
Args:

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@
1010
from pydantic import ValidationError
1111
from typing_extensions import override
1212

13-
from crawlee._consts import METADATA_FILENAME
1413
from crawlee._utils.crypto import crypto_random_object_id
1514
from crawlee.storage_clients._base import DatasetClient
1615
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
1716

18-
from ._utils import json_dumps
17+
from ._utils import METADATA_FILENAME, json_dumps
1918

2019
if TYPE_CHECKING:
2120
from collections.abc import AsyncIterator

src/crawlee/storage_clients/_file_system/_key_value_store.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

0 commit comments

Comments
 (0)