This repository was archived by the owner on Jul 29, 2024. It is now read-only.

Commit cea0d82

Merge pull request #2 from Labelbox/raphaeljafriLB-annotation-support
Placeholder update to Client.py
2 parents 7a1742f + b30c843 commit cea0d82

File tree

5 files changed: +380 −170 lines changed


labelpandas/batches.py (new file, +48)
@@ -0,0 +1,48 @@
"""
batches.py holds the function create_batches_dict() -- which creates the following style dictionary:
{
    project_id :
        [
            data_row_id,
            data_row_id,
            data_row_id
        ],
    project_id :
        [
            data_row_id,
            data_row_id,
            data_row_id
        ]
}
This is the format that labelbase.uploader.batch_rows_to_project() expects
"""
import pandas as pd
import labelpandas


def create_batches_dict(table:pd.core.frame.DataFrame, table_dict:dict,
                        global_key_col:str, project_id_col:str,
                        project_id:str, global_key_to_data_row_id:dict):
    """ From a Pandas DataFrame, creates a dictionary where {key=project_id : value=list_of_data_row_ids}
    Args:
        table                     : Required (pandas.core.frame.DataFrame) - Pandas DataFrame
        table_dict                : Required (dict) - Pandas DataFrame as dict with df.to_dict("records")
        global_key_col            : Required (str) - Column name containing the data row global key
        project_id_col            : Required (str) - Column name containing the project ID to batch a given row to
        project_id                : Required (str) - Labelbox project ID to add data rows to - only necessary if no "project_id" column exists
        global_key_to_data_row_id : Required (dict) - Dictionary where {key=global_key : value=data_row_id}
    Returns:
        Dictionary where {key=project_id : value=list_of_data_row_ids}, along with any errors encountered
    """
    if project_id:
        # A single project ID was passed in - batch every row to that project
        project_id_to_batch_dict = {project_id : []}
    else:
        # Otherwise, batch rows to whatever project IDs appear in the project_id column
        project_ids = labelpandas.connector.get_unique_values_function(table, project_id_col)
        project_id_to_batch_dict = {id : [] for id in project_ids}
    errors = []
    try:
        for row in table_dict:
            id = project_id if project_id else row[project_id_col]
            data_row_id = global_key_to_data_row_id[row[global_key_col]]
            project_id_to_batch_dict[id].append(data_row_id)
    except Exception as e:
        errors = e
    return project_id_to_batch_dict, errors
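
A minimal sketch of how create_batches_dict() might be called on its own. The column names, global keys, and data row IDs below are invented for illustration, and the global_key_to_data_row_id mapping would normally come from labelbase.uploader.create_global_key_to_data_row_dict(), as client.py does further down.

import pandas as pd
import labelpandas

# Hypothetical table: two data rows destined for two different projects
table = pd.DataFrame({
    "global_key": ["image-1.jpg", "image-2.jpg"],
    "project_id": ["project_a", "project_b"],
})

# Normally built via labelbase.uploader.create_global_key_to_data_row_dict()
global_key_to_data_row_id = {"image-1.jpg": "datarow_1", "image-2.jpg": "datarow_2"}

project_id_to_batch_dict, errors = labelpandas.batches.create_batches_dict(
    table=table, table_dict=table.to_dict("records"),
    global_key_col="global_key", project_id_col="project_id",
    project_id="", global_key_to_data_row_id=global_key_to_data_row_id,
)
# Expected result: {"project_a": ["datarow_1"], "project_b": ["datarow_2"]}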

labelpandas/client.py (+96 −50)
@@ -1,9 +1,9 @@
 from labelbox import Client as labelboxClient
 from labelbox.schema.dataset import Dataset as labelboxDataset
-from labelbase.metadata import sync_metadata_fields
-from labelbase.uploader import batch_create_data_rows
+import labelpandas
+import labelbase
 import pandas as pd
-from labelpandas import connector
+

 class Client():
     """
@@ -27,64 +27,110 @@ def __init__(
         # return table

     def create_data_rows_from_table(
-        self, table:pd.core.frame.DataFrame, lb_dataset:labelboxDataset, row_data_col:str, global_key_col=None, external_id_col=None,
-        metadata_index:dict={}, local_files:bool=False, skip_duplicates:bool=False, verbose:bool=False, divider="___"):
+        self, table:pd.core.frame.DataFrame, dataset_id:str="", project_id:str="", priority:int=5,
+        upload_method:str="", skip_duplicates:bool=False, verbose:bool=False, divider="///"):
         """ Creates Labelbox data rows given a Pandas table and a Labelbox Dataset
         Args:
-            table           : Required (pandas.core.frame.DataFrame) - Pandas DataFrame
-            lb_dataset      : Required (labelbox.schema.dataset.Dataset) - Labelbox dataset to add data rows to
-            row_data_col    : Required (str) - Column containing asset URL or file path
-            local_files     : Required (bool) - Determines how to handle row_data_col values
-                                If True, treats row_data_col values as file paths and uploads the local files to Labelbox
-                                If False, treats row_data_col values as URLs (assuming delegated access is set up)
-            global_key_col  : Optional (str) - Column name containing the data row global key - defaults to row_data_col
-            external_id_col : Optional (str) - Column name containing the data row external ID - defaults to global_key_col
-            metadata_index  : Required (dict) - Dictionary where {key=column_name : value=metadata_type}
-                                metadata_type must be either "enum", "string", "datetime" or "number"
-            skip_duplicates : Optional (bool) - Determines how to handle if a global key to-be-uploaded is already in use
-                                If True, will skip duplicate global_keys and not upload them
-                                If False, will generate a unique global_key with a suffix "_1", "_2" and so on
-            verbose         : Required (bool) - If True, prints details about code execution; if False, prints minimal information
-            divider         : Optional (str) - String delimiter for all name keys generated for parent/child schemas
-        Returns:
-            A dictionary with "upload_results" and "conversion_errors" keys
-            - "upload_results" key pertains to the results of the data row upload itself
-            - "conversion_errors" key pertains to any errors related to data row conversion
-        """
-
-        # Ensure all your metadata_index keys are metadata fields in Labelbox and that your Pandas DataFrame has all the right columns
-        table = sync_metadata_fields(
-            client=self.lb_client, table=table, get_columns_function=connector.get_columns_function, add_column_function=connector.add_column_function,
-            get_unique_values_function=connector.get_unique_values_function, metadata_index=metadata_index, verbose=verbose
+            table           : Required (pandas.core.frame.DataFrame) - Pandas DataFrame
+            dataset_id      : Required (str) - Labelbox dataset ID to add data rows to - only necessary if no "dataset_id" column exists
+            project_id      : Required (str) - Labelbox project ID to add data rows to - only necessary if no "project_id" column exists
+            priority        : Optional (int) - Between 1 and 5, what priority to give to data row batches sent to projects
+            upload_method   : Optional (str) - Either "mal" or "import" - required to upload annotations (otherwise leave as "")
+            skip_duplicates : Optional (bool) - Determines how to handle if a global key to-be-uploaded is already in use
+                                If True, will skip duplicate global_keys and not upload them
+                                If False, will generate a unique global_key with a suffix {divider} + "1", "2" and so on
+            verbose         : Required (bool) - If True, prints details about code execution; if False, prints minimal information
+            divider         : Optional (str) - String delimiter for schema name keys and the suffix added to duplicate global keys
+        """
+        # Create a metadata_index, attachment_index, and annotation_index from the table's columns
+        # row_data_col     : column with name "row_data"
+        # global_key_col   : column with name "global_key" - defaults to row_data_col
+        # external_id_col  : column with name "external_id" - defaults to global_key_col
+        # project_id_col   : column with name "project_id" - defaults to "" (requires project_id input argument if no "project_id" column exists)
+        # dataset_id_col   : column with name "dataset_id" - defaults to "" (requires dataset_id input argument if no "dataset_id" column exists)
+        # metadata_index   : Dictionary where {key=column_name : value=metadata_type}
+        # attachment_index : Dictionary where {key=column_name : value=attachment_type}
+        # annotation_index : Dictionary where {key=column_name : value=top_level_feature_name}
+        row_data_col, global_key_col, external_id_col, project_id_col, dataset_id_col, metadata_index, attachment_index, annotation_index = labelbase.connector.validate_columns(
+            table=table,
+            get_columns_function=labelpandas.connector.get_columns_function,
+            get_unique_values_function=labelpandas.connector.get_unique_values_function,
+            divider=divider,
+            verbose=verbose,
+            extra_client=None
         )

-        # If df returns False, the sync failed - terminate the upload
-        if type(table) == bool:
-            return {"upload_results" : [], "conversion_errors" : []}
+        # Iterating over your pandas DataFrame is faster once converted to a list of dictionaries where {key=column_name : value=row_value}
+        table_dict = table.to_dict('records')

-        # Create a dictionary where {key=global_key : value=labelbox_upload_dictionary} - this is unique to Pandas
-        global_key_to_upload_dict, conversion_errors = connector.create_upload_dict(
-            table=table, lb_client=self.lb_client,
-            row_data_col=row_data_col, global_key_col=global_key_col, external_id_col=external_id_col,
-            metadata_index=metadata_index, local_files=local_files, divider=divider, verbose=verbose
-        )
+        if (dataset_id_col=="") and (dataset_id==""):
+            raise ValueError(f"To create data rows, please provide either a `dataset_id` column or a Labelbox dataset ID to argument `dataset_id`")

-        # If there are conversion errors, let the user know; if there are no successful conversions, terminate the upload
-        if conversion_errors:
-            print(f'There were {len(conversion_errors)} errors in creating your upload list - see result["conversion_errors"] for more information')
-            if global_key_to_upload_dict:
-                print(f'Data row upload will continue')
-            else:
-                print(f'Data row upload will not continue')
-                return {"upload_results" : [], "conversion_errors" : errors}
+        if (upload_method!="") and (project_id_col=="") and (project_id=="") and (annotation_index!={}):
+            raise ValueError(f"To upload annotations, please provide either a `project_id` column or a Labelbox project ID to argument `project_id`")
+
+        # Create a dictionary where {key=dataset_id : value={key=global_key : value=data_row_upload_dict}} - this is unique to Pandas
+        dataset_to_global_key_to_upload_dict = labelpandas.data_rows.create_data_row_upload_dict(
+            client=self.lb_client, table=table, table_dict=table_dict,
+            row_data_col=row_data_col, global_key_col=global_key_col, external_id_col=external_id_col, dataset_id_col=dataset_id_col,
+            dataset_id=dataset_id, metadata_index=metadata_index, attachment_index=attachment_index,
+            divider=divider, verbose=verbose, extra_client=None
+        )

         # Upload your data rows to Labelbox
-        upload_results = batch_create_data_rows(
-            client=self.lb_client, dataset=lb_dataset, global_key_to_upload_dict=global_key_to_upload_dict,
+        data_row_upload_results = labelbase.uploader.batch_create_data_rows(
+            client=self.lb_client, dataset_to_global_key_to_upload_dict=dataset_to_global_key_to_upload_dict,
             skip_duplicates=skip_duplicates, divider=divider, verbose=verbose
         )

-        return {"upload_results" : upload_results, "conversion_errors" : conversion_errors}
+        # If project IDs are provided, we can batch data rows to projects
+        if project_id or project_id_col:
+
+            # Create a dictionary where {key=global_key : value=data_row_id}
+            global_key_to_data_row_id = labelbase.uploader.create_global_key_to_data_row_dict(
+                client=self.lb_client, global_keys=labelpandas.connector.get_unique_values_function(table, global_key_col)
+            )
+
+            # Create a dictionary where {key=project_id : value=list_of_data_row_ids}, if applicable
+            project_id_to_batch_dict, batch_conversion_errors = labelpandas.batches.create_batches_dict(
+                table=table, table_dict=table_dict,
+                global_key_col=global_key_col, project_id_col=project_id_col,
+                project_id=project_id, global_key_to_data_row_id=global_key_to_data_row_id
+            )
+
+            # Batch data rows to projects, if applicable
+            batch_to_project_results = labelbase.uploader.batch_rows_to_project(
+                client=self.lb_client, project_id_to_batch_dict=project_id_to_batch_dict, priority=priority
+            )
+
+            if (upload_method in ["mal", "import"]) and (annotation_index!={}):
+
+                # Create a dictionary where {key=project_id : value=annotation_upload_list}, if applicable
+                project_id_to_upload_dict = labelpandas.connector.create_annotation_upload_dict(
+                    client=self.lb_client, table=table, table_dict=table_dict,
+                    row_data_col=row_data_col, global_key_col=global_key_col, project_id_col=project_id_col,
+                    project_id=project_id, annotation_index=annotation_index, global_key_to_data_row_id=global_key_to_data_row_id,
+                    divider=divider, verbose=verbose
+                )
+
+                # Upload your annotations to Labelbox, if applicable
+                annotation_upload_results = labelbase.uploader.batch_upload_annotations(
+                    client=self.lb_client, project_id_to_upload_dict=project_id_to_upload_dict, how=upload_method, verbose=verbose
+                )
+
+            else: # If no proper upload_method is provided or no annotation_index is generated, we don't upload annotations
+                annotation_upload_results = []
+
+        else: # If project IDs are not provided, we don't batch data rows to projects or upload annotations
+            batch_to_project_results = []
+            annotation_upload_results = []
+
+        return {
+            "data_row_upload_results" : data_row_upload_results,
+            "batch_to_project_results" : batch_to_project_results,
+            "annotation_upload_results" : annotation_upload_results
+        }

     # def upsert_table_metadata():
     #     return table
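
A rough usage sketch of the reworked create_data_rows_from_table() flow. The DataFrame column names follow the naming convention documented in the comments above ("row_data", "global_key"); the API key, dataset ID, and project ID values are placeholders, and the Client constructor argument is assumed rather than taken from this diff.

import pandas as pd
import labelpandas

df = pd.DataFrame({
    # Asset URLs (assumes delegated access or publicly readable URLs)
    "row_data": ["https://storage.example.com/image-1.jpg",
                 "https://storage.example.com/image-2.jpg"],
    # Unique identifier per data row
    "global_key": ["image-1.jpg", "image-2.jpg"],
})

client = labelpandas.Client(lb_api_key="YOUR_LABELBOX_API_KEY")  # assumed constructor argument

results = client.create_data_rows_from_table(
    table=df,
    dataset_id="PLACEHOLDER_DATASET_ID",   # needed because df has no "dataset_id" column
    project_id="PLACEHOLDER_PROJECT_ID",   # optional - enables batching data rows to a project
    upload_method="",                      # no annotation columns here, so no annotation upload
    skip_duplicates=True,
    verbose=True,
)

# results is a dictionary with "data_row_upload_results",
# "batch_to_project_results" and "annotation_upload_results" keys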
