Skip to content

Commit 72c0df4

Browse files
Spaarsh and alamb authored
Made imdb download (data_imdb) function atomic (apache#14225)
* Made imdb download (data_imdb) function atomic
* Removed numfmt
* Removed numfmt
* Fix bug, add imdb to list of benchmarks in help

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 44cf77f commit 72c0df4

File tree

1 file changed

+42
-12
lines changed

1 file changed

+42
-12
lines changed

benchmarks/bench.sh

Lines changed: 42 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ external_aggr: External aggregation benchmark
8383
h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
8484
h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
8585
h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
86+
imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
8687
8788
**********
8889
* Supported Configuration (Environment Variables)
@@ -536,23 +537,52 @@ data_imdb() {
536537
done
537538

538539
if [ "$convert_needed" = true ]; then
539-
if [ ! -f "${imdb_dir}/imdb.tgz" ]; then
540-
echo "Downloading IMDB dataset..."
540+
# Expected size of the dataset
541+
expected_size="1263193115" # bytes, ~1.18 GiB (1.26 GB)
542+
543+
echo -n "Looking for imdb.tgz... "
544+
if [ -f "${imdb_temp_gz}" ]; then
545+
echo "found"
546+
echo -n "Checking size... "
547+
OUTPUT_SIZE=$(wc -c "${imdb_temp_gz}" 2>/dev/null | awk '{print $1}' || true)
541548

549+
#Checking the size of the existing file
550+
if [ "${OUTPUT_SIZE}" = "${expected_size}" ]; then
551+
# Existing file is of the expected size, no need for download
552+
echo "OK ${OUTPUT_SIZE}"
553+
else
554+
# Existing file's size does not match (e.g. an interrupted download); remove it and start a fresh download
555+
echo "MISMATCH"
556+
echo "Size less than expected: ${OUTPUT_SIZE} found, ${expected_size} required"
557+
echo "Downloading IMDB dataset..."
558+
rm -f "${imdb_temp_gz}"
559+
560+
# Download the dataset
561+
curl -o "${imdb_temp_gz}" "${imdb_url}"
562+
563+
# Size check of the installed file
564+
DOWNLOADED_SIZE=$(wc -c "${imdb_temp_gz}" | awk '{print $1}')
565+
if [ "${DOWNLOADED_SIZE}" != "${expected_size}" ]; then
566+
echo "Error: Download size mismatch"
567+
echo "Expected: ${expected_size}"
568+
echo "Got: ${DOWNLOADED_SIZE}"
569+
echo "Please re-initiate the download"
570+
return 1
571+
fi
572+
fi
573+
else
574+
# No existing file found, initiate a new download
575+
echo "not found"
576+
echo "Downloading IMDB dataset (${expected_size} expected)..."
542577
# Download the dataset
543578
curl -o "${imdb_temp_gz}" "${imdb_url}"
544-
545-
# Extract the dataset
546-
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
547-
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
548-
else
549-
echo "IMDB.tgz already exists."
550-
551-
# Extract the dataset
552-
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
553-
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
554579
fi
580+
581+
# Extract the dataset
582+
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
583+
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
555584
echo "IMDB dataset downloaded and extracted."
585+
556586
else
557587
echo "IMDB dataset already exists and contains required parquet files."
558588
fi

0 commit comments

Comments
 (0)