@@ -83,6 +83,7 @@ external_aggr: External aggregation benchmark
83
83
h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
84
84
h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
85
85
h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
86
+ imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
86
87
87
88
**********
88
89
* Supported Configuration (Environment Variables)
@@ -536,23 +537,52 @@ data_imdb() {
536
537
done
537
538
538
539
if [ " $convert_needed " = true ]; then
539
- if [ ! -f " ${imdb_dir} /imdb.tgz" ]; then
540
- echo " Downloading IMDB dataset..."
540
+ # Expected size of the dataset
541
+ expected_size=" 1263193115" # 1.18 GB
542
+
543
+ echo -n " Looking for imdb.tgz... "
544
+ if [ -f " ${imdb_temp_gz} " ]; then
545
+ echo " found"
546
+ echo -n " Checking size... "
547
+ OUTPUT_SIZE=$( wc -c " ${imdb_temp_gz} " 2> /dev/null | awk ' {print $1}' || true)
541
548
549
+ # Checking the size of the existing file
550
+ if [ " ${OUTPUT_SIZE} " = " ${expected_size} " ]; then
551
+ # Existing file is of the expected size, no need for download
552
+ echo " OK ${OUTPUT_SIZE} "
553
+ else
554
+ # Existing file does not have the expected size (likely a partial download), remove it and initiate a new download
555
+ echo " MISMATCH"
556
+ echo " Size differs from expected: ${OUTPUT_SIZE} found, ${expected_size} required"
557
+ echo " Downloading IMDB dataset..."
558
+ rm -f " ${imdb_temp_gz} "
559
+
560
+ # Download the dataset
561
+ curl -o " ${imdb_temp_gz} " " ${imdb_url} "
562
+
563
+ # Size check of the installed file
564
+ DOWNLOADED_SIZE=$( wc -c " ${imdb_temp_gz} " | awk ' {print $1}' )
565
+ if [ " ${DOWNLOADED_SIZE} " != " ${expected_size} " ]; then
566
+ echo " Error: Download size mismatch"
567
+ echo " Expected: ${expected_size} "
568
+ echo " Got: ${DOWNLOADED_SIZE} "
569
+ echo " Please re-initiate the download"
570
+ return 1
571
+ fi
572
+ fi
573
+ else
574
+ # No existing file found, initiate a new download
575
+ echo " not found"
576
+ echo " Downloading IMDB dataset (${expected_size} expected)..."
542
577
# Download the dataset
543
578
curl -o " ${imdb_temp_gz} " " ${imdb_url} "
544
-
545
- # Extract the dataset
546
- tar -xzvf " ${imdb_temp_gz} " -C " ${imdb_dir} "
547
- $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
548
- else
549
- echo " IMDB.tgz already exists."
550
-
551
- # Extract the dataset
552
- tar -xzvf " ${imdb_temp_gz} " -C " ${imdb_dir} "
553
- $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
554
579
fi
580
+
581
+ # Extract the dataset
582
+ tar -xzvf " ${imdb_temp_gz} " -C " ${imdb_dir} "
583
+ $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
555
584
echo " IMDB dataset downloaded and extracted."
585
+
556
586
else
557
587
echo " IMDB dataset already exists and contains required parquet files."
558
588
fi
0 commit comments