From d50c6d588bbd9485d0bc7fe3e4cc6fb11a506884 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Fri, 18 Oct 2024 00:04:39 +0200 Subject: [PATCH 01/61] Minor fix --- .../pixels/core/PixelsReaderStreamImpl.java | 24 +++++++++---------- .../src/main/resources/log4j2.properties | 0 2 files changed, 12 insertions(+), 12 deletions(-) rename pixels-turbo/{pixels-worker-vhive => pixels-worker-common}/src/main/resources/log4j2.properties (100%) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 581da9b4f..d6f5e5631 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -151,13 +151,6 @@ public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) ", partition ID header: " + req.headers().get("X-Partition-Id") + ", HTTP request object body total length: " + req.content().readableBytes()); - // schema packet: only 1 packet expected, so close the connection immediately - // partitioned mode: close the connection if all partitions received - // else (non-partitioned mode, data packet): close connection if empty packet received - boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || - (partitioned && numPartitionsReceived.get() == numPartitions) || - (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && - req.content().readableBytes() == 0); ByteBuf byteBuf = req.content(); try { @@ -178,7 +171,7 @@ public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) catch (IOException e) { logger.error("Invalid stream header values: ", e); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } } @@ -193,7 +186,7 @@ else if (partitioned) catch (InvalidProtocolBufferException | 
IndexOutOfBoundsException e) { logger.error("Malformed or corrupted stream header", e); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } @@ -209,7 +202,7 @@ else if (partitioned) if (partitionId < 0 || partitionId >= numPartitions) { logger.warn("Client sent invalid partitionId value: " + partitionId); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } byteBufBlockingMap.put(partitionId, byteBuf); @@ -222,6 +215,13 @@ else if (partitioned) } } + // schema packet: only 1 packet expected, so close the connection immediately + // partitioned mode: close the connection if all partitions received + // else (non-partitioned mode, data packet): close connection if empty packet received + boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || + (partitioned && numPartitionsReceived.get() == numPartitions) || + (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && + req.content().readableBytes() == 0); sendResponseAndClose(ctx, req, HttpResponseStatus.OK, needCloseParentChannel); } @@ -539,11 +539,11 @@ public void close() try { - if (!this.httpServerFuture.isDone()) this.httpServerFuture.get(5, TimeUnit.SECONDS); + if (!this.httpServerFuture.isDone()) this.httpServerFuture.get(300, TimeUnit.SECONDS); } catch (TimeoutException e) { - logger.warn("In close(), HTTP server did not shut down in 5 seconds, doing forceful shutdown"); + logger.warn("In close(), HTTP server did not shut down in 300 seconds, doing forceful shutdown"); this.httpServerFuture.cancel(true); } catch (InterruptedException | ExecutionException e) diff --git a/pixels-turbo/pixels-worker-vhive/src/main/resources/log4j2.properties b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties similarity index 100% rename from 
pixels-turbo/pixels-worker-vhive/src/main/resources/log4j2.properties rename to pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties From 39a70a3d435b9d0355dba9df2ecbc041a7736c09 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:21:08 +0200 Subject: [PATCH 02/61] log in PixelsReaderStreamImpl --- .../java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index d6f5e5631..4c8125c8f 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -84,6 +84,7 @@ public class PixelsReaderStreamImpl implements PixelsReader // In partitioned mode, we use byteBufBlockingMap to map hash value to corresponding ByteBuf private final BlockingMap byteBufBlockingMap; private final boolean partitioned; + private final int httpPort; private final AtomicReference numPartitionsReceived = new AtomicReference<>(0); private final List recordReaders; @@ -113,7 +114,7 @@ public PixelsReaderStreamImpl(String endpoint, boolean partitioned, int numParti this.streamHeader = null; URI uri = new URI(endpoint); String IP = uri.getHost(); - int httpPort = uri.getPort(); + this.httpPort = uri.getPort(); logger.debug("In Pixels stream reader constructor, IP: " + IP + ", port: " + httpPort + ", partitioned: " + partitioned + ", numPartitions: " + numPartitions); if (!Objects.equals(IP, "127.0.0.1") && !Objects.equals(IP, "localhost")) @@ -543,7 +544,7 @@ public void close() } catch (TimeoutException e) { - logger.warn("In close(), HTTP server did not shut down in 300 seconds, doing forceful shutdown"); + logger.warn("In close(), HTTP server on port " + httpPort + " did not shut down in 300 seconds, doing forceful shutdown"); 
this.httpServerFuture.cancel(true); } catch (InterruptedException | ExecutionException e) From 2f76bd6505b7483549c89f7c3d90cccec56a7ff6 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:38:32 +0200 Subject: [PATCH 03/61] Add isSmallTable in partition input, to fix a hardcode in base partition stream worker --- .../io/pixelsdb/pixels/planner/PixelsPlanner.java | 14 ++++++++++++++ .../plan/physical/input/PartitionInput.java | 14 ++++++++++++++ .../worker/common/BasePartitionStreamWorker.java | 7 +------ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index dcc8d817e..57f8503db 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -963,6 +963,10 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) rightTable, rightInputSplits, rightKeyColumnIds, rightPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + rightTable.getTableName() + "/"); + for (PartitionInput rightPartitionInput : rightPartitionInputs) + { + rightPartitionInput.setSmallTable(join.getJoinEndian() != JoinEndian.SMALL_LEFT); + } PartitionedTableInfo rightTableInfo = getPartitionedTableInfo( rightTable, rightKeyColumnIds, rightPartitionInputs, rightPartitionProjection); @@ -998,6 +1002,11 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) leftTable, leftInputSplits, leftKeyColumnIds, leftPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + leftTable.getTableName() + "/"); + for (PartitionInput leftPartitionInput : leftPartitionInputs) + { + leftPartitionInput.setSmallTable(join.getJoinEndian() == JoinEndian.SMALL_LEFT); + } + 
PartitionedTableInfo leftTableInfo = getPartitionedTableInfo( leftTable, leftKeyColumnIds, leftPartitionInputs, leftPartitionProjection); @@ -1006,6 +1015,11 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) rightTable, rightInputSplits, rightKeyColumnIds, rightPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + rightTable.getTableName() + "/"); + for (PartitionInput rightPartitionInput : rightPartitionInputs) + { + rightPartitionInput.setSmallTable(join.getJoinEndian() != JoinEndian.SMALL_LEFT); + } + PartitionedTableInfo rightTableInfo = getPartitionedTableInfo( rightTable, rightKeyColumnIds, rightPartitionInputs, rightPartitionProjection); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index d8c7dd868..71b844a08 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -51,6 +51,10 @@ public class PartitionInput extends Input * The information about the hash partitioning. */ private PartitionInfo partitionInfo; + /** + * Whether this table is the small table in a join. + */ + private boolean isSmallTable; /** * Default constructor for Jackson. 
@@ -109,4 +113,14 @@ public void setPartitionInfo(PartitionInfo partitionInfo) { this.partitionInfo = partitionInfo; } + + public boolean isSmallTable() + { + return isSmallTable; + } + + public void setSmallTable(boolean isSmallTable) + { + this.isSmallTable = isSmallTable; + } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 414d42a8a..1b8c13cff 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -183,14 +183,9 @@ public PartitionOutput process(PartitionInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" - + (Objects.equals(event.getTableInfo().getTableName(), "part") ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + (event.isSmallTable() ? "18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); - // todo: Need to pass whether the table is the large table or the small table here into the partition worker. - // Perhaps add a boolean field in the PartitionInput class. - // Currently, we hardcode the table name for TPC-H Q14 - the large table (rightTable for join) uses port 18686 - // while the small table (leftTable for join) uses port 18688. 
StreamWorkerCommon.passSchemaToNextLevel(writerSchema.get(), outputStorageInfo, outputEndpoints); PixelsWriter pixelsWriter = StreamWorkerCommon.getWriter(writerSchema.get(), From bd559fa2bd98cf9cfee48970f271eaeab7d25718 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:42:41 +0200 Subject: [PATCH 04/61] Fix log4j2.properties in pixels-turbo/pixels-worker-common --- .../src/main/resources/log4j2.properties | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties index fe8abaa4b..61c9e552b 100644 --- a/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties +++ b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties @@ -1,11 +1,11 @@ -name=pixels-worker-vhive +name=pixels-worker-common status=warn shutdownHook=disable rootLogger.level=info rootLogger.appenderRef.stdout.ref=STDOUT rootLogger.appenderRef.log.ref=log -filter.threshold.type=ThresholdFilter -filter.threshold.level=info +logger.pixelsdb.name=io.pixelsdb.pixels +logger.pixelsdb.level=info appender.console.type=Console appender.console.name=STDOUT appender.console.layout.type=PatternLayout @@ -13,7 +13,7 @@ appender.console.layout.pattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%c]-[%p] %m%n appender.rolling.type=File appender.rolling.name=log appender.rolling.append=true -appender.rolling.fileName=pixels-worker-vhive.log +appender.rolling.fileName=pixels-worker-common.log appender.rolling.layout.type=PatternLayout appender.rolling.layout.pattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%c]-[%p] %m%n From 5d9be0d1da45511313c87a9b63ef7c50dc5ae81d Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 18:15:05 +0200 Subject: [PATCH 05/61] Also fix hardcode in BasePartitionStreamWorker --- .../pixels/worker/common/BasePartitionStreamWorker.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 1b8c13cff..f67e68f50 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -69,10 +69,8 @@ public BasePartitionStreamWorker(WorkerContext context) super(context); this.logger = context.getLogger(); this.workerMetrics = context.getWorkerMetrics(); - this.workerCoordinateService = new WorkerCoordinateService("128.110.218.225", 18894); - // Hardcoded for Cloudlab. todo: Need to figure out how to get the daemon IP dynamically. - // Perhaps add a field in the WorkerContext class to store the daemon IP, - // or to have the Pixels planner pass the daemon IP in the Input. + this.workerCoordinateService = new WorkerCoordinateService( + StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); } @Override From 643e6c21300b50e8fc3f5eb00d35d11101cc3dfc Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sun, 27 Oct 2024 19:02:13 +0100 Subject: [PATCH 06/61] Fix bug in BasePartitionedJoinStreamWorker --- .../BasePartitionedJoinStreamWorker.java | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 6036f61ca..f17660a88 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -527,17 +527,8 @@ 
protected static int joinWithRightTable( pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - Set rightHashValues = new HashSet<>(numPartition); - for (int hashValue = 0; hashValue < numPartition; ++hashValue) - { - rightHashValues.add(hashValue); - } for (int hashValue : hashValues) { - if (!rightHashValues.contains(hashValue)) - { - continue; - } PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, hashValue, numPartition); VectorizedRowBatch rowBatch; @@ -646,17 +637,8 @@ protected static int joinWithRightTableAndPartition( pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - Set rightHashValues = new HashSet<>(numPartition); - for (int hashValue = 0; hashValue < numPartition; ++hashValue) - { - rightHashValues.add(hashValue); - } for (int hashValue : hashValues) { - if (!rightHashValues.contains(hashValue)) - { - continue; - } PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, hashValue, numPartition); VectorizedRowBatch rowBatch; From c9039e8d8dfcaffe546566696e54e9e2806f0a5d Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sun, 27 Oct 2024 19:06:00 +0100 Subject: [PATCH 07/61] Use -2 as numPartitions when getting schema readers --- .../java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 4c8125c8f..9cac57272 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ 
b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -99,7 +99,7 @@ public class PixelsReaderStreamImpl implements PixelsReader public PixelsReaderStreamImpl(String endpoint) throws Exception { - this(endpoint, false, -1); + this(endpoint, false, -2); } public PixelsReaderStreamImpl(int port) throws Exception From 43712363a3568bad63d87fe7c6d642e198d88106 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 29 Oct 2024 15:48:29 +0100 Subject: [PATCH 08/61] Also fix hardcode in BasePartitionedJoinStreamWorker --- .../pixels/worker/common/BasePartitionedJoinStreamWorker.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index f17660a88..d7304c74a 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -66,7 +66,8 @@ public BasePartitionedJoinStreamWorker(WorkerContext context) // this.logger = context.getLogger(); this.workerMetrics = context.getWorkerMetrics(); this.workerMetrics.clear(); - this.workerCoordinateService = new WorkerCoordinateService("128.110.218.225", 18894); + this.workerCoordinateService = new WorkerCoordinateService( + StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); } @Override From a76601ace133d20e45cc88199cab692142000835 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 19:56:27 +0100 Subject: [PATCH 09/61] Add postPartitionId and postPartitionIsSmallTable in JoinInfo, to extend streaming support to post partition of joins --- .../pixels/planner/PixelsPlanner.java | 7 +++-- .../PartitionedJoinStreamOperator.java | 3 +- 
.../plan/physical/domain/JoinInfo.java | 29 +++++++++++++++++++ .../physical/domain/PartitionedJoinInfo.java | 12 ++++++++ .../BasePartitionedJoinStreamWorker.java | 19 +++++++++--- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 57f8503db..1cb264885 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1345,6 +1345,7 @@ private List getPartitionedJoinInputs( { boolean postPartition = false; PartitionInfo postPartitionInfo = null; + boolean postPartitionIsSmallTable = false; if (parent.isPresent() && parent.get().getJoin().getJoinAlgo() == JoinAlgorithm.PARTITIONED) { postPartition = true; @@ -1359,10 +1360,12 @@ private List getPartitionedJoinInputs( if (joinedTable == parent.get().getJoin().getLeftTable()) { postPartitionInfo = new PartitionInfo(parent.get().getJoin().getLeftKeyColumnIds(), numPostPartition); + postPartitionIsSmallTable = parent.get().getJoin().getJoinEndian() == JoinEndian.SMALL_LEFT; } else { postPartitionInfo = new PartitionInfo(parent.get().getJoin().getRightKeyColumnIds(), numPostPartition); + postPartitionIsSmallTable = parent.get().getJoin().getJoinEndian() != JoinEndian.SMALL_LEFT; } } @@ -1392,7 +1395,7 @@ private List getPartitionedJoinInputs( { PartitionedJoinInfo joinInfo = new PartitionedJoinInfo(joinedTable.getJoin().getJoinType(), joinedTable.getJoin().getLeftColumnAlias(), joinedTable.getJoin().getRightColumnAlias(), - leftProjection, rightProjection, postPartition, postPartitionInfo, numPartition, ImmutableList.of(i)); + leftProjection, rightProjection, postPartition, postPartitionInfo, postPartitionIsSmallTable, numPartition, ImmutableList.of(i)); joinInput = new PartitionedJoinInput(transId, leftTableInfo, rightTableInfo, joinInfo, false, 
null, output); } @@ -1400,7 +1403,7 @@ private List getPartitionedJoinInputs( { PartitionedJoinInfo joinInfo = new PartitionedJoinInfo(joinedTable.getJoin().getJoinType().flip(), joinedTable.getJoin().getRightColumnAlias(), joinedTable.getJoin().getLeftColumnAlias(), - rightProjection, leftProjection, postPartition, postPartitionInfo, numPartition, ImmutableList.of(i)); + rightProjection, leftProjection, postPartition, postPartitionInfo, postPartitionIsSmallTable, numPartition, ImmutableList.of(i)); joinInput = new PartitionedJoinInput(transId, rightTableInfo, leftTableInfo, joinInfo, false, null, output); } diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index b8d16747b..f17a1f9cd 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -27,6 +27,7 @@ import io.pixelsdb.pixels.planner.coordinate.PlanCoordinatorFactory; import io.pixelsdb.pixels.planner.plan.physical.input.JoinInput; import io.pixelsdb.pixels.planner.plan.physical.input.PartitionInput; +import io.pixelsdb.pixels.planner.plan.physical.input.PartitionedJoinInput; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -107,7 +108,7 @@ else if (smallChild != null) for (PartitionInput partitionInput : largePartitionInputs) { largePartitionOutputs[i++] = InvokerFactory.Instance() - .getInvoker(WorkerType.PARTITION_STREAMING).invoke((partitionInput)); + .getInvoker(WorkerType.PARTITION_STREAMING).invoke(partitionInput); } logger.debug("invoke large partition of " + this.getName()); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java index cbc6aea5a..a7cce5b7d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java @@ -59,6 +59,15 @@ public class JoinInfo * The partition information of the output if outputPartitioned is true. */ private PartitionInfo postPartitionInfo; + /** + * The partition id of this worker in post partition if outputPartitioned is true. + */ + private int postPartitionId; + /** + * Whether this table is the small table in the next-level join. This is used to determine the HTTP port + * when using streaming. + */ + private boolean postPartitionIsSmallTable; /** * Default constructor for Jackson. @@ -147,4 +156,24 @@ public void setPostPartitionInfo(PartitionInfo postPartitionInfo) { this.postPartitionInfo = postPartitionInfo; } + + public int getPostPartitionId() + { + return postPartitionId; + } + + public void setPostPartitionId(int postPartitionId) + { + this.postPartitionId = postPartitionId; + } + + public boolean getPostPartitionIsSmallTable() + { + return postPartitionIsSmallTable; + } + + public void setPostPartitionIsSmallTable(boolean postPartitionIsSmallTable) + { + this.postPartitionIsSmallTable = postPartitionIsSmallTable; + } } diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java index d5f939ff2..eb9dad94d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java @@ -53,6 +53,18 @@ public PartitionedJoinInfo(JoinType joinType, String[] smallColumnAlias, String[ this.hashValues = hashValues; } + public 
PartitionedJoinInfo(JoinType joinType, String[] smallColumnAlias, String[] largeColumnAlias, + boolean[] smallProjection, boolean[] largeProjection, boolean postPartition, + PartitionInfo postPartitionInfo, boolean postPartitionIsSmallTable, + int numPartition, List hashValues) + { + super(joinType, smallColumnAlias, largeColumnAlias, smallProjection, largeProjection, + postPartition, postPartitionInfo); + this.numPartition = numPartition; + this.hashValues = hashValues; + this.setPostPartitionIsSmallTable(postPartitionIsSmallTable); + } + public int getNumPartition() { return numPartition; diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index d7304c74a..bb0623065 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -19,6 +19,7 @@ */ package io.pixelsdb.pixels.worker.common; +import com.google.common.collect.ImmutableList; import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.core.PixelsReader; import io.pixelsdb.pixels.core.PixelsWriter; @@ -296,11 +297,20 @@ public JoinOutput process(PartitionedJoinInput event) PixelsWriter pixelsWriter; if (partitionOutput) { + List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) + .stream() + .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) + .collect(ImmutableList.toImmutableList()); + List outputEndpoints = downStreamWorkers.stream() + .map(CFWorkerInfo::getIp) + .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") + // .map(URI::create) + .collect(Collectors.toList()); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, - encoding, true, -1, Arrays.stream( + encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( outputPartitionInfo.getKeyColumnIds()).boxed(). - collect(Collectors.toList())); + collect(Collectors.toList()), outputEndpoints, false); for (int hash = 0; hash < outputPartitionInfo.getNumPartition(); ++hash) { ConcurrentLinkedQueue batches = result.get(hash); @@ -346,9 +356,10 @@ public JoinOutput process(PartitionedJoinInput event) requireNonNull(outputPartitionInfo, "outputPartitionInfo is null"); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, - encoding, true, -1, Arrays.stream( + encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( outputPartitionInfo.getKeyColumnIds()).boxed(). - collect(Collectors.toList())); + collect(Collectors.toList())); // , outputEndpoints, false); + // TODO: Adapt the left-outer tail to streaming mode. 
joiner.writeLeftOuterAndPartition(pixelsWriter, StreamWorkerCommon.rowBatchSize, outputPartitionInfo.getNumPartition(), outputPartitionInfo.getKeyColumnIds()); } From dea7cf2f767c2dfdafdfd2154492ed6951a1a1b4 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:03:45 +0100 Subject: [PATCH 10/61] Add postPartitionId and postPartitionIsSmallTable in JoinInfo, to extend streaming support to post partition of joins --- .../planner/plan/physical/PartitionedJoinStreamOperator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index f17a1f9cd..e4145ca63 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -63,7 +63,8 @@ public CompletableFuture[]> execute() { JoinInput joinInput = joinInputs.get(i); joinInput.setSmallPartitionWorkerNum(smallPartitionInputs.size()); // XXX: could be 0 - joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); + joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); // XXX: Can do this in PixelsPlanner + ((PartitionedJoinInput)joinInput).getJoinInfo().setPostPartitionId(i); if (joinAlgo == JoinAlgorithm.PARTITIONED) { joinOutputs[i] = InvokerFactory.Instance() From bddce98a4e55ba9d0724e110bc82815985e88e89 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:05:22 +0100 Subject: [PATCH 11/61] Comments --- .../pixels/planner/plan/physical/input/PartitionInput.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index 71b844a08..43501789d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -52,7 +52,8 @@ public class PartitionInput extends Input */ private PartitionInfo partitionInfo; /** - * Whether this table is the small table in a join. + * Whether this table is the small table in the next-level join. This is used to determine the HTTP port + * when using streaming. */ private boolean isSmallTable; From 0d9054ef58a655da49ca09a0d1f7801a9cd52ebb Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:05:38 +0100 Subject: [PATCH 12/61] Comments --- .../pixels/worker/common/BasePartitionStreamWorker.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index f67e68f50..a058a4aeb 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -71,6 +71,10 @@ public BasePartitionStreamWorker(WorkerContext context) this.workerMetrics = context.getWorkerMetrics(); this.workerCoordinateService = new WorkerCoordinateService( StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); + // In cloud functions, configuration files "pixels.properties" are not present, and so the pre-packaged + // configuration file "pixels-common/src/main/resources/pixels.properties" will be used during runtime. 
+ // Therefore, you need to modify the coordinator host and port in the pre-packaged configuration file on localhost + // where you rebuild the Docker image. } @Override From 09bc92c95313242b87c17586cf452ae2ed2089cf Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:09:39 +0100 Subject: [PATCH 13/61] Interconnection between workers and stream workers, by modifying storage infos in PixelsPlanner --- .../pixels/planner/PixelsPlanner.java | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 1cb264885..fa9d87841 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -62,6 +62,7 @@ public class PixelsPlanner private static final Logger logger = LogManager.getLogger(PixelsPlanner.class); private static final StorageInfo InputStorageInfo; private static final StorageInfo IntermediateStorageInfo; + private static final StorageInfo IntermediateStreamStorageInfo; // Streaming only used between partition worker -> partitioned join worker private static final String IntermediateFolder; private static final int IntraWorkerParallelism; private static final ExchangeMethod EnabledExchangeMethod; @@ -90,10 +91,10 @@ public class PixelsPlanner ConfigFactory.Instance().getProperty("executor.input.storage.scheme")); InputStorageInfo = StorageInfoBuilder.BuildFromConfig(inputStorageScheme); - Storage.Scheme interStorageScheme = EnabledExchangeMethod == ExchangeMethod.batch ? 
- Storage.Scheme.from(ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")) : - Storage.Scheme.valueOf("httpstream"); + Storage.Scheme interStorageScheme = Storage.Scheme.from( + ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")); IntermediateStorageInfo = StorageInfoBuilder.BuildFromConfig(interStorageScheme); + IntermediateStreamStorageInfo = StorageInfoBuilder.BuildFromConfig(Storage.Scheme.valueOf("httpstream")); String interStorageFolder = ConfigFactory.Instance().getProperty("executor.intermediate.folder"); if (!interStorageFolder.endsWith("/")) { @@ -281,7 +282,7 @@ else if (originTable.getTableType() == Table.TableType.JOINED) joinInput.setPartialAggregationInfo(partialAggregationInfo); String fileName = "partial_aggr_" + outputId++; MultiOutputInfo outputInfo = joinInput.getOutput(); - outputInfo.setStorageInfo(IntermediateStorageInfo); + outputInfo.setStorageInfo(IntermediateStorageInfo); // IntermediateStreamStorageInfo? outputInfo.setPath(intermediateBase); outputInfo.setFileNames(ImmutableList.of(fileName)); aggrInputFilesBuilder.add(intermediateBase + fileName); @@ -521,12 +522,14 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : IntermediateStorageInfo, + leftIsBase ? InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean rightIsBase = rightTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo rightTableInfo = new PartitionedTableInfo( rightTable.getTableName(), rightIsBase, rightTable.getColumnNames(), - rightIsBase ? InputStorageInfo : IntermediateStorageInfo, + rightIsBase ? 
InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), rightPartitionedFiles, IntraWorkerParallelism, rightKeyColumnIds); int numPartition = PlanOptimizer.Instance().getJoinNumPartition( @@ -954,7 +957,8 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : IntermediateStorageInfo, + leftIsBase ? InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), // ??? leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean[] rightPartitionProjection = getPartitionProjection(rightTable, join.getRightProjection()); @@ -1271,13 +1275,19 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStreamStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { return new PartitionedTableInfo(table.getTableName(), false, - newColumnsToRead, IntermediateStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); + // XXX: This only applies to joined tables, when the current join reads table from a post partition of a previous join. + // If the table type is AGGREGATED, we should use IntermediateStorageInfo. 
} } @@ -1312,11 +1322,13 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, + postPartition && EnabledExchangeMethod == ExchangeMethod.stream ? IntermediateStreamStorageInfo : IntermediateStorageInfo, + true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From 5dfb29678c4ce148d405ea92f2ee62064bfd8ec5 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 16:51:17 +0100 Subject: [PATCH 14/61] VhiveInvoker should not call blocking GRPC in its constructor --- .../pixels/invoker/vhive/VhiveInvoker.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java b/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java index cc2428be1..246dc16ff 100644 --- a/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java +++ b/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java @@ -33,21 +33,23 @@ public abstract class VhiveInvoker implements Invoker { private static final Logger logger = LogManager.getLogger(VhiveInvoker.class); private final String functionName; - private final int memoryMB; + private int memoryMB; protected VhiveInvoker(String functionName) { this.functionName = functionName; - int memoryMB = 0; - try - { - TurboProto.GetMemoryResponse response = 
Vhive.Instance().getAsyncClient().getMemory().get(); - memoryMB = (int) response.getMemoryMB(); - } catch (Exception e) - { - logger.error("failed to get memory: " + e); - } - this.memoryMB = memoryMB; + new Thread(() -> { + int memoryMB = 0; + try + { + TurboProto.GetMemoryResponse response = Vhive.Instance().getAsyncClient().getMemory().get(); + memoryMB = (int) response.getMemoryMB(); + } catch (Exception e) + { + logger.error("failed to get memory: " + e); + } + this.memoryMB = memoryMB; + }).start(); } @Override From 30baa67f9059a7c7f251044d05eb8a025f83cfbd Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:16:30 +0100 Subject: [PATCH 15/61] Fix smallPartitionWorkerNum and largePartitionWorkerNum in PartitionedJoinStreamOperator --- .../PartitionedJoinStreamOperator.java | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index e4145ca63..ad979f423 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -59,11 +59,33 @@ public CompletableFuture[]> execute() { // First, bootstrap the join workers. 
joinOutputs = new CompletableFuture[joinInputs.size()]; + int smallPartitionWorkerNum; + if (!smallPartitionInputs.isEmpty()) + { + smallPartitionWorkerNum = smallPartitionInputs.size(); + } + else if (smallChild != null) { + smallPartitionWorkerNum = smallChild.getJoinInputs().size(); + } + else { + throw new IllegalStateException("smallPartitionInputs and smallChild are both null"); + } + int largePartitionWorkerNum; + if (!largePartitionInputs.isEmpty()) + { + largePartitionWorkerNum = largePartitionInputs.size(); + } + else if (largeChild != null) { + largePartitionWorkerNum = largeChild.getJoinInputs().size(); + } + else { + throw new IllegalStateException("largePartitionInputs and largeChild are both null"); + } for (int i = 0; i < joinInputs.size(); ++i) { JoinInput joinInput = joinInputs.get(i); - joinInput.setSmallPartitionWorkerNum(smallPartitionInputs.size()); // XXX: could be 0 - joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); // XXX: Can do this in PixelsPlanner + joinInput.setSmallPartitionWorkerNum(smallPartitionWorkerNum); + joinInput.setLargePartitionWorkerNum(largePartitionWorkerNum); // XXX: Can do this in PixelsPlanner ((PartitionedJoinInput)joinInput).getJoinInfo().setPostPartitionId(i); if (joinAlgo == JoinAlgorithm.PARTITIONED) { From 66c9c4720795f2f4b6a4e7c7dcc5a899f3f20a34 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:20:46 +0100 Subject: [PATCH 16/61] Bug fix, support multiple partition workers --- .../pixels/core/PixelsReaderStreamImpl.java | 6 +- .../BasePartitionedJoinStreamWorker.java | 324 +++++++----------- 2 files changed, 134 insertions(+), 196 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 9cac57272..5078434a0 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ 
b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -216,12 +216,12 @@ else if (partitioned) } } - // schema packet: only 1 packet expected, so close the connection immediately + // schema reader: only 1 packet expected, so close the connection immediately // partitioned mode: close the connection if all partitions received // else (non-partitioned mode, data packet): close connection if empty packet received - boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || + boolean needCloseParentChannel = numPartitions == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || (partitioned && numPartitionsReceived.get() == numPartitions) || - (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && + (numPartitions == -1 && Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && req.content().readableBytes() == 0); sendResponseAndClose(ctx, req, HttpResponseStatus.OK, needCloseParentChannel); } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index bb0623065..7fc83cbaa 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -96,7 +96,11 @@ public JoinOutput process(PartitionedJoinInput event) List leftPartitioned = event.getSmallTable().getInputFiles(); requireNonNull(leftPartitioned, "leftPartitioned is null"); checkArgument(leftPartitioned.size() > 0, "leftPartitioned is empty"); - int leftParallelism = event.getSmallTable().getParallelism(); + int leftParallelism = 1; // event.getSmallTable().getParallelism(); + // todo: Intra-worker parallelism support in streaming mode + // 
Currently, we only support an intra-worker parallelism of 1 (no parallelism) in streaming mode. + // Need to allow each join worker to use multiple ports to read input in parallel, so as to + // build the hash table in parallel, thus achieving intra-worker parallelism. checkArgument(leftParallelism > 0, "leftParallelism is not positive"); String[] leftColumnsToRead = event.getSmallTable().getColumnsToRead(); int[] leftKeyColumnIds = event.getSmallTable().getKeyColumnIds(); @@ -106,7 +110,7 @@ public JoinOutput process(PartitionedJoinInput event) List rightPartitioned = event.getLargeTable().getInputFiles(); requireNonNull(rightPartitioned, "rightPartitioned is null"); checkArgument(rightPartitioned.size() > 0, "rightPartitioned is empty"); - int rightParallelism = event.getLargeTable().getParallelism(); + int rightParallelism = 1; // event.getLargeTable().getParallelism(); checkArgument(rightParallelism > 0, "rightParallelism is not positive"); String[] rightColumnsToRead = event.getLargeTable().getColumnsToRead(); int[] rightKeyColumnIds = event.getLargeTable().getKeyColumnIds(); @@ -202,8 +206,6 @@ public JoinOutput process(PartitionedJoinInput event) for (int i = 0; i < leftPartitioned.size(); i += leftSplitSize) { List parts = new LinkedList<>(); - // XXX: Can allow 1 join worker to use multiple ports to read input in parallel, so as to - // build the hash table in parallel. for (int j = i; j < i + leftSplitSize && j < leftPartitioned.size(); ++j) { parts.add(leftPartitioned.get(j)); @@ -306,6 +308,7 @@ public JoinOutput process(PartitionedJoinInput event) .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); + StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( @@ -428,73 +431,51 @@ protected static void buildHashTable(long transId, Joiner joiner, List l WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!leftParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = leftParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + for (int hashValue : hashValues) { - String leftPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try - { - pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) - { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, leftCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - // XXX: perhaps do not need to re-initialize the record reader for each hash value. 
- if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do - { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) - { - joiner.populateLeftTable(rowBatch); - } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - leftPartitioned + "' and build the hash table", e); - } - finally + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, leftCols, pixelsReader, + hashValue, numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + // XXX: perhaps do not need to re-initialize the record reader for each hash value. 
+ if (recordReader == null) continue; + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - if (pixelsReader != null) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - logger.debug("closing pixels reader"); - pixelsReader.close(); + joiner.populateLeftTable(rowBatch); } - } + } while (!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!leftParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and build the hash table", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } workerMetrics.addReadBytes(readBytes); @@ -527,79 +508,57 @@ protected static int joinWithRightTable( WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!rightParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = rightParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + for (int hashValue : hashValues) { - String rightPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, + hashValue, 
numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do + List joinedBatches = joiner.join(rowBatch); + for (VectorizedRowBatch joined : joinedBatches) { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) + if (!joined.isEmpty()) { - List joinedBatches = joiner.join(rowBatch); - for (VectorizedRowBatch joined : joinedBatches) - { - if (!joined.isEmpty()) - { - joinResult.add(joined); - joinedRows += joined.size; - } - } + joinResult.add(joined); + joinedRows += joined.size; } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - rightPartitioned + "' and do the join", e); - } - finally - { - if (pixelsReader != null) - { - logger.debug("closing pixels reader"); - pixelsReader.close(); + } } - } + } while 
(!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!rightParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and do the join", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } workerMetrics.addReadBytes(readBytes); @@ -637,84 +596,63 @@ protected static int joinWithRightTableAndPartition( WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!rightParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = rightParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + // XXX: check that the hashValue in row group headers match the hashValue assigned to this worker + for (int hashValue : hashValues) { - String rightPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, + hashValue, numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + if (recordReader == null) continue; + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - pixelsReader = 
StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do + List joinedBatches = joiner.join(rowBatch); + for (VectorizedRowBatch joined : joinedBatches) { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) + if (!joined.isEmpty()) { - List joinedBatches = joiner.join(rowBatch); - for (VectorizedRowBatch joined : joinedBatches) + Map parts = partitioner.partition(joined); + for (Map.Entry entry : parts.entrySet()) { - if (!joined.isEmpty()) - { - Map parts = partitioner.partition(joined); - for (Map.Entry entry : parts.entrySet()) - { - partitionResult.get(entry.getKey()).add(entry.getValue()); - } - joinedRows += joined.size; - } + partitionResult.get(entry.getKey()).add(entry.getValue()); } + joinedRows += joined.size; } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - rightPartitioned + "' and do the join", e); - } - finally - { - if (pixelsReader != null) - { - logger.debug("closing pixels reader"); - 
pixelsReader.close(); + } } - } + } while (!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!rightParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and do the join", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } From 76cad0a6323eca60cd3902efe49a6d9f2cac7a59 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:21:24 +0100 Subject: [PATCH 17/61] Minor fix --- .../src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index fa9d87841..c24158da0 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -994,7 +994,7 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) new PartitionedJoinBatchOperator(joinedTable.getTableName(), rightPartitionInputs, null, joinInputs, joinAlgo) : new PartitionedJoinStreamOperator(joinedTable.getTableName(), - null, rightPartitionInputs, joinInputs, joinAlgo); + rightPartitionInputs, null, joinInputs, joinAlgo); joinOperator.setLargeChild(childOperator); } } From 901ab03a80be7516feacb3c3929b7cddd444dedb Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:02:50 +0100 Subject: [PATCH 18/61] Bug fix in pixels stream reader --- 
.../pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 10 +--------- .../core/reader/PixelsRecordReaderStreamImpl.java | 8 ++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 5078434a0..a369a2d67 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -438,15 +438,7 @@ public int getRowGroupNum() @Override public boolean isPartitioned() { - try - { - streamHeaderLatch.await(); - } - catch (InterruptedException e) - { - logger.error("Interrupted while waiting for stream header", e); - } - return this.streamHeader.hasPartitioned() && this.streamHeader.getPartitioned(); + return partitioned; } /** diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java index 0588cbd47..3eee68314 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java @@ -37,6 +37,8 @@ import java.util.*; import java.util.concurrent.BlockingQueue; +import static com.google.common.base.Preconditions.checkArgument; + /** * PixelsRecordReaderStreamImpl is the variant of {@link PixelsRecordReaderImpl} for streaming mode. *

@@ -153,6 +155,12 @@ public PixelsRecordReaderStreamImpl(boolean partitioned, */ public void lateInitialization(PixelsStreamProto.StreamHeader streamHeader) throws IOException { + if (this.streamHeader != null) + { + checkArgument(this.streamHeader == streamHeader, + "streamHeader used for lateInitialization() is not the same as the one in the RecordReader."); + return; + } this.streamHeader = streamHeader; checkBeforeRead(); } From e51859952b111a955eb85f138f4682c54d3dc730 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:04:07 +0100 Subject: [PATCH 19/61] Bug fix pixels stream reader --- .../worker/common/BasePartitionedJoinStreamWorker.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 7fc83cbaa..0912793d0 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -447,7 +447,9 @@ protected static void buildHashTable(long transId, Joiner joiner, List l PixelsRecordReader recordReader = pixelsReader.read(option); // XXX: perhaps do not need to re-initialize the record reader for each hash value. if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); + // We no longer check the validity of the record reader here, because the record reader + // might not have been initialized yet due to the absence of the stream header. 
+ // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do @@ -522,7 +524,7 @@ protected static int joinWithRightTable( hashValue, numPartition); VectorizedRowBatch rowBatch; PixelsRecordReader recordReader = pixelsReader.read(option); - checkArgument(recordReader.isValid(), "failed to get record reader"); + // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do @@ -612,7 +614,7 @@ protected static int joinWithRightTableAndPartition( VectorizedRowBatch rowBatch; PixelsRecordReader recordReader = pixelsReader.read(option); if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); + // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do From 616e307d932fef445a9878562d020f2bb880890c Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:13:35 +0100 Subject: [PATCH 20/61] Comments and logs --- .../io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 2 +- .../worker/common/BasePartitionedJoinStreamWorker.java | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index a369a2d67..1bd7ab81e 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -192,7 +192,7 @@ else if (partitioned) } // We only need to put the byteBuf into the blocking queue to pass it to the recordReader, if the - // client is a data writer rather than a schema writer. In the latter case, + // packet is a data packet rather than a schema packet. Because in the latter case, // the schema packet has been processed when parsing the stream header above. 
if (partitionId != PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER) { diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 0912793d0..06973e4c2 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -308,6 +308,9 @@ public JoinOutput process(PartitionedJoinInput event) .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); + // In partitioned mode, the schema is sent in an over-replicated manner: + // every previous-stage worker (rather than one of them) sends a schema packet + // before sending its intermediate data, to prevent errors from possibly out-of-order packet arrivals. 
StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, @@ -476,7 +479,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on port 18688"); pixelsReader.close(); } } @@ -559,7 +562,7 @@ protected static int joinWithRightTable( { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on port 18686"); pixelsReader.close(); } } @@ -653,7 +656,7 @@ protected static int joinWithRightTableAndPartition( { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on 18686"); pixelsReader.close(); } } From 2952fc3478ac3b88c440f5fd6c81c9922de18de1 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:18:58 +0100 Subject: [PATCH 21/61] Bug fix in stream reader and writer: process empty partition results --- .../pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 7 +++++-- .../pixelsdb/pixels/core/PixelsWriterStreamImpl.java | 11 ++++++++--- .../core/reader/PixelsRecordReaderStreamImpl.java | 10 ++++++++++ .../io/pixelsdb/pixels/core/utils/BlockingMap.java | 2 +- .../worker/common/BasePartitionStreamWorker.java | 5 ++++- .../common/BasePartitionedJoinStreamWorker.java | 11 ++++++++--- 6 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 1bd7ab81e..98f41029f 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -179,8 +179,10 @@ public void 
channelRead0(ChannelHandlerContext ctx, HttpObject msg) else if (partitioned) { // In partitioned mode, every packet brings a streamHeader to prevent errors from possibly - // out-of-order packet arrivals, so we need to parse it, but do not need the return value - // (except for the first incoming packet processed above). + // out-of-order packet arrivals, so we need to parse it, but do not need the return value + // (except for the first incoming packet processed above). + // XXX: Now we have each worker pass the schema in a separate packet, so this is no longer + // necessary. We can remove this block of code in PixelsWriterStreamImpl. parseStreamHeader(byteBuf); } } @@ -516,6 +518,7 @@ public PixelsProto.Footer getFooter() public void close() throws IOException { + logger.debug("Closing PixelsReaderStreamImpl"); new Thread(() -> { // Conditions for closing: // 1. streamHeaderLatch.await() to ensure that the stream header has been received diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java index 127aeb1f0..53359f237 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java @@ -512,6 +512,7 @@ public void addRowBatch(VectorizedRowBatch rowBatch, int hashValue) throws IOExc currHashValue = hashValue; hashValueIsSet = true; curRowGroupDataLength = 0; + if (rowBatch == null) return; curRowGroupNumOfRows += rowBatch.size; writeColumnVectors(rowBatch.cols, rowBatch.size); } @@ -550,8 +551,9 @@ public void close() { try { - if (curRowGroupNumOfRows != 0) + if (partitioned || curRowGroupNumOfRows != 0) { + // In partitioned mode, even an empty row group has to be sent to the server. 
writeRowGroup(); } // If the outgoing stream is empty (addRowBatch() and thus writeRowGroup() never called), we artificially @@ -609,7 +611,9 @@ else if (isFirstRowGroup) private void writeRowGroup() throws IOException { - if (isFirstRowGroup || partitioned) + // XXX: Now that we have each worker pass the schema in a separate packet in partitioned mode, it is no longer + // necessary to add a stream header to every packet. We can modify this block of code. + if (isFirstRowGroup || partitioned) // if (isFirstRowGroup) { writeStreamHeader(); isFirstRowGroup = false; @@ -769,7 +773,8 @@ private void writeRowGroup() throws IOException uri = URI.create(fileNameToUri(fileName)); } String reqUri = partitioned ? uris.get(currHashValue).toString() : uri.toString(); - logger.debug("Sending row group with length: " + byteBuf.writerIndex() + " to endpoint: " + reqUri); + logger.debug("Sending row group to endpoint: " + reqUri + ", length: " + byteBuf.writerIndex() + + ", partitionId: " + partitionId); Request req = httpClient.preparePost(reqUri) .setBody(byteBuf.nioBuffer()) .addHeader("X-Partition-Id", String.valueOf(partitionId)) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java index 3eee68314..4d1dab6f4 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java @@ -434,6 +434,7 @@ public int prepareBatch(int batchSize) */ private VectorizedRowBatch createEmptyEOFRowBatch(int size) { + logger.debug("In createEmptyEOFRowBatch(), size = " + size); TypeDescription resultSchema = TypeDescription.createSchema(new ArrayList<>()); VectorizedRowBatch resultRowBatch = resultSchema.createRowBatch(0); resultRowBatch.projectionSize = 0; @@ -502,6 +503,14 @@ public VectorizedRowBatch readBatch(int 
batchSize, boolean reuse) } int rgRowCount = (int) curRowGroupStreamFooter.getNumberOfRows(); + if (rgRowCount == 0) + { + // Empty row group, mark the current row group as unreadable. + curRowGroupByteBuf.readerIndex(curRowGroupByteBuf.readerIndex() + curRowGroupByteBuf.readableBytes()); + curRGIdx++; + return resultSchema.createRowBatch(0, resultColumnsEncoded); + } + int curBatchSize; ColumnVector[] columnVectors = resultRowBatch.cols; @@ -710,6 +719,7 @@ private void acquireNewRowGroup(boolean reuse) throws IOException else // incoming byteBuf unreadable, must be end of stream { + logger.debug("In acquireNewRowGroup(), end of file"); // checkValid = false; // Issue #105: to reject continuous read. if (reuse && resultRowBatch != null) // XXX: Before we implement necessary checks, the close() below might be called before our readBatch() diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java index c1f0107fc..1505e937d 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java @@ -51,7 +51,7 @@ public void put(K key, V value) public V get(K key) throws InterruptedException { - V ret = getQueue(key).poll(60, TimeUnit.SECONDS); + V ret = getQueue(key).poll(300, TimeUnit.SECONDS); if (ret == null) { throw new RuntimeException("BlockingMap.get() timed out"); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index a058a4aeb..087737878 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ 
-205,8 +205,11 @@ public PartitionOutput process(PartitionInput event) { pixelsWriter.addRowBatch(batch, hash); } - hashValues.add(hash); } + else { + pixelsWriter.addRowBatch(null, hash); + } + hashValues.add(hash); } partitionOutput.addOutput(outputPath); partitionOutput.setHashValues(hashValues); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 06973e4c2..825751af0 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -243,8 +243,10 @@ public JoinOutput process(PartitionedJoinInput event) } // scan the right table and do the join. - if (joiner.getSmallTableSize() > 0) - { + // We no longer check this condition in streaming mode, because even if the joiner is empty, + // we have to read from the right table to enforce the streaming protocol. 
+// if (joiner.getSmallTableSize() > 0) +// { int rightSplitSize = rightPartitioned.size() / rightParallelism; if (rightPartitioned.size() % rightParallelism > 0) { @@ -290,7 +292,7 @@ public JoinOutput process(PartitionedJoinInput event) { throw new WorkerException("error occurred threads, please check the stacktrace before this log record"); } - } +// } String outputPath = outputFolder + outputInfo.getFileNames().get(0); try @@ -327,6 +329,9 @@ public JoinOutput process(PartitionedJoinInput event) pixelsWriter.addRowBatch(batch, hash); } } + else { + pixelsWriter.addRowBatch(null, hash); + } } } else From d9cef74db385f40b50a5f92a938be3ebae1837cd Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:21:45 +0100 Subject: [PATCH 22/61] Bug fix in PartitionedJoinOperator --- .../pixels/planner/plan/physical/PartitionedJoinOperator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java index a76a65773..619b065a9 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java @@ -192,6 +192,7 @@ public void initPlanCoordinator(PlanCoordinator planCoordinator, int parentStage partitionInput.getTableInfo().setInputSplits(ImmutableList.of(inputSplit)); tasks.add(new Task(taskId++, JSON.toJSONString(partitionInput))); } + partitionInput.getTableInfo().setInputSplits(inputSplits); } StageCoordinator partitionStageCoordinator = new StageCoordinator(smallPartitionStageId, tasks); planCoordinator.addStageCoordinator(partitionStageCoordinator, partitionStageDependency); @@ -215,6 +216,7 @@ public void initPlanCoordinator(PlanCoordinator planCoordinator, int parentStage 
partitionInput.getTableInfo().setInputSplits(ImmutableList.of(inputSplit)); tasks.add(new Task(taskId++, JSON.toJSONString(partitionInput))); } + partitionInput.getTableInfo().setInputSplits(inputSplits); // restore the input splits after modifying partitionInput as a temporary variable } StageCoordinator partitionStageCoordinator = new StageCoordinator(largePartitionStageId, tasks); planCoordinator.addStageCoordinator(partitionStageCoordinator, partitionStageDependency); From 65d45d31338942be8a5ad62916e89863e0f08fa9 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:23:39 +0100 Subject: [PATCH 23/61] Revert "Interconnection between workers and stream workers, by modifying storage infos in PixelsPlanner" This reverts commit 09bc92c95313242b87c17586cf452ae2ed2089cf. --- .../pixels/planner/PixelsPlanner.java | 38 ++++++------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index c24158da0..bde19dd02 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -62,7 +62,6 @@ public class PixelsPlanner private static final Logger logger = LogManager.getLogger(PixelsPlanner.class); private static final StorageInfo InputStorageInfo; private static final StorageInfo IntermediateStorageInfo; - private static final StorageInfo IntermediateStreamStorageInfo; // Streaming only used between partition worker -> partitioned join worker private static final String IntermediateFolder; private static final int IntraWorkerParallelism; private static final ExchangeMethod EnabledExchangeMethod; @@ -91,10 +90,10 @@ public class PixelsPlanner ConfigFactory.Instance().getProperty("executor.input.storage.scheme")); InputStorageInfo = StorageInfoBuilder.BuildFromConfig(inputStorageScheme); - 
Storage.Scheme interStorageScheme = Storage.Scheme.from( - ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")); + Storage.Scheme interStorageScheme = EnabledExchangeMethod == ExchangeMethod.batch ? + Storage.Scheme.from(ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")) : + Storage.Scheme.valueOf("httpstream"); IntermediateStorageInfo = StorageInfoBuilder.BuildFromConfig(interStorageScheme); - IntermediateStreamStorageInfo = StorageInfoBuilder.BuildFromConfig(Storage.Scheme.valueOf("httpstream")); String interStorageFolder = ConfigFactory.Instance().getProperty("executor.intermediate.folder"); if (!interStorageFolder.endsWith("/")) { @@ -282,7 +281,7 @@ else if (originTable.getTableType() == Table.TableType.JOINED) joinInput.setPartialAggregationInfo(partialAggregationInfo); String fileName = "partial_aggr_" + outputId++; MultiOutputInfo outputInfo = joinInput.getOutput(); - outputInfo.setStorageInfo(IntermediateStorageInfo); // IntermediateStreamStorageInfo? + outputInfo.setStorageInfo(IntermediateStorageInfo); outputInfo.setPath(intermediateBase); outputInfo.setFileNames(ImmutableList.of(fileName)); aggrInputFilesBuilder.add(intermediateBase + fileName); @@ -522,14 +521,12 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), + leftIsBase ? InputStorageInfo : IntermediateStorageInfo, leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean rightIsBase = rightTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo rightTableInfo = new PartitionedTableInfo( rightTable.getTableName(), rightIsBase, rightTable.getColumnNames(), - rightIsBase ? 
InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), + rightIsBase ? InputStorageInfo : IntermediateStorageInfo, rightPartitionedFiles, IntraWorkerParallelism, rightKeyColumnIds); int numPartition = PlanOptimizer.Instance().getJoinNumPartition( @@ -957,8 +954,7 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), // ??? + leftIsBase ? InputStorageInfo : IntermediateStorageInfo, leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean[] rightPartitionProjection = getPartitionProjection(rightTable, join.getRightProjection()); @@ -1275,19 +1271,13 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, - EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStreamStorageInfo, - rightPartitionedFiles.build(), + newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { return new PartitionedTableInfo(table.getTableName(), false, - newColumnsToRead, - EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo, - rightPartitionedFiles.build(), + newColumnsToRead, IntermediateStorageInfo, rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); - // XXX: This only applies to joined tables, when the current join reads table from a post partition of a previous join. - // If the table type is AGGREAGATED, we should use IntermediateStorageInfo. 
} } @@ -1322,13 +1312,11 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, - postPartition && EnabledExchangeMethod == ExchangeMethod.stream ? IntermediateStreamStorageInfo : IntermediateStorageInfo, - true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From 260dc70c9a5c6b4a5ab7946437db958bd09f02f0 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 20:20:10 +0100 Subject: [PATCH 24/61] Add partitionId in PartitionInput, to fix a hardcode in BasePartitionStreamWorker --- .../physical/PartitionedJoinStreamOperator.java | 13 ++++++++++++- .../plan/physical/input/PartitionInput.java | 14 ++++++++++++++ .../worker/common/BasePartitionStreamWorker.java | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index ad979f423..8f1f7c9d3 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -63,6 +63,11 @@ public CompletableFuture[]> execute() if (!smallPartitionInputs.isEmpty()) { smallPartitionWorkerNum = smallPartitionInputs.size(); + for (int i = 0; i < smallPartitionInputs.size(); ++i) + { + PartitionInput partitionInput = smallPartitionInputs.get(i); + 
partitionInput.setPartitionId(i); + } } else if (smallChild != null) { smallPartitionWorkerNum = smallChild.getJoinInputs().size(); @@ -74,6 +79,11 @@ else if (smallChild != null) { if (!largePartitionInputs.isEmpty()) { largePartitionWorkerNum = largePartitionInputs.size(); + for (int i = 0; i < largePartitionInputs.size(); ++i) + { + PartitionInput partitionInput = largePartitionInputs.get(i); + partitionInput.setPartitionId(i); + } } else if (largeChild != null) { largePartitionWorkerNum = largeChild.getJoinInputs().size(); @@ -179,7 +189,8 @@ else if (largeChild != null) logger.debug("invoke large partition of " + this.getName()); } - // todo: Finally, wait for the readiness of the partition operators + // todo: Finally, wait for the readiness of the partition workers + // (need to modify the partition workers to pull tasks from the worker coordinator server). return joinOutputs; }); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index 313013c5c..8301ee1ce 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -56,6 +56,10 @@ public class PartitionInput extends Input * when using streaming. */ private boolean isSmallTable; + /** + * The id of this partition in the current stage. + */ + private int partitionId; /** * Default constructor for Jackson. 
@@ -124,4 +128,14 @@ public void setSmallTable(boolean isSmallTable) { this.isSmallTable = isSmallTable; } + + public int getPartitionId() + { + return partitionId; + } + + public void setPartitionId(int partitionId) + { + this.partitionId = partitionId; + } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 7a30cd545..9c6e7e805 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -193,7 +193,7 @@ public PartitionOutput process(PartitionInput event) StreamWorkerCommon.passSchemaToNextLevel(writerSchema.get(), outputStorageInfo, outputEndpoints); PixelsWriter pixelsWriter = StreamWorkerCommon.getWriter(writerSchema.get(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, - true, 0, // todo: hardcoded for only 1 partition worker scenario; need to pass the actual value + true, event.getPartitionId(), Arrays.stream(keyColumnIds).boxed().collect(Collectors.toList()), outputEndpoints, false); Set hashValues = new HashSet<>(numPartition); From b630c486aebaee3db1bffd906169f424929a6058 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 22:15:02 +0100 Subject: [PATCH 25/61] Adapt PixelsPlanner to support streaming mode --- .../io/pixelsdb/pixels/planner/PixelsPlanner.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 8565c67cf..479a81eb7 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1278,7 +1278,9 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { @@ -1324,7 +1326,9 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, + postPartition && EnabledExchangeMethod != ExchangeMethod.batch ? IntermediateStorageInfo : InputStorageInfo, + true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? 
joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From b048d6c971a61b9a4f817de9f89ffaad18830571 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 12 Nov 2024 23:57:31 +0100 Subject: [PATCH 26/61] Optimization in partitioned join stream worker --- .../BasePartitionedJoinStreamWorker.java | 69 ++++++++++--------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 825751af0..ff2f59b2d 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -165,7 +165,14 @@ public JoinOutput process(PartitionedJoinInput event) AtomicReference leftSchema = new AtomicReference<>(); AtomicReference rightSchema = new AtomicReference<>(); - // `registerWorker()` might awake the dependent workers, so it should be called just before + // Bootstrap the readers at once which is up all the time during the worker's lifetime, + // to ensure immediate reception of intermediate data and avoid retries on the writer side. + PixelsReader leftPixelsReader = StreamWorkerCommon.getReader( leftInputStorageInfo.getScheme(), + "http://localhost:18688/", true, event.getSmallPartitionWorkerNum()); + PixelsReader rightPixelsReader = StreamWorkerCommon.getReader(rightInputStorageInfo.getScheme(), + "http://localhost:18686/", true, event.getLargePartitionWorkerNum()); + + // `registerWorker()` might awake the dependent workers, so it should be called just before / after // the current worker listens on its HTTP port and is ready to receive streaming packets. 
CFWorkerInfo workerInfo = new CFWorkerInfo( InetAddress.getLocalHost().getHostAddress(), -1, @@ -178,14 +185,9 @@ public JoinOutput process(PartitionedJoinInput event) logger.debug("getSchemaFromPaths, left input: " + leftPartitioned + ", right input: " + rightPartitioned); - StreamWorkerCommon.getSchemaFromPaths(threadPool, - StreamWorkerCommon.getStorage(leftInputStorageInfo.getScheme()), - StreamWorkerCommon.getStorage(rightInputStorageInfo.getScheme()), - leftSchema, rightSchema, - Collections.singletonList("http://localhost:18688/"), - Collections.singletonList("http://localhost:18686/")); - // XXX: Better to ensure the subsequent data reader is up immediately after the schema is ready, - // to avoid retries on the writer side. + // XXX: StreamWorkerCommon.getSchemaFromPaths() can be removed + leftSchema.set ( leftPixelsReader.getFileSchema()); + rightSchema.set(rightPixelsReader.getFileSchema()); /* * Issue #450: * For the left and the right partial partitioned files, the file schema is equal to the columns to read in normal cases. @@ -196,6 +198,20 @@ public JoinOutput process(PartitionedJoinInput event) leftColAlias, leftProjection, leftKeyColumnIds, StreamWorkerCommon.getResultSchema(rightSchema.get(), rightColumnsToRead), rightColAlias, rightProjection, rightKeyColumnIds); + List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) + .stream() + .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) + .collect(ImmutableList.toImmutableList()); + List outputEndpoints = downStreamWorkers.stream() + .map(CFWorkerInfo::getIp) + .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") + // .map(URI::create) + .collect(Collectors.toList()); + if (partitionOutput) + { + StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); + } + // build the hash table for the left table. 
List leftFutures = new ArrayList<>(leftPartitioned.size()); int leftSplitSize = leftPartitioned.size() / leftParallelism; @@ -214,7 +230,7 @@ public JoinOutput process(PartitionedJoinInput event) try { buildHashTable(transId, joiner, parts, leftColumnsToRead, leftInputStorageInfo.getScheme(), - hashValues, event.getSmallPartitionWorkerNum(), workerMetrics); + hashValues, event.getSmallPartitionWorkerNum(), workerMetrics, leftPixelsReader); } catch (Throwable e) { @@ -267,10 +283,10 @@ public JoinOutput process(PartitionedJoinInput event) joinWithRightTableAndPartition( transId, joiner, parts, rightColumnsToRead, rightInputStorageInfo.getScheme(), hashValues, - event.getLargePartitionWorkerNum(), outputPartitionInfo, result, workerMetrics) : + event.getLargePartitionWorkerNum(), outputPartitionInfo, result, workerMetrics, rightPixelsReader) : joinWithRightTable(transId, joiner, parts, rightColumnsToRead, rightInputStorageInfo.getScheme(), hashValues, - event.getLargePartitionWorkerNum(), result.get(0), workerMetrics); + event.getLargePartitionWorkerNum(), result.get(0), workerMetrics, rightPixelsReader); } catch (Throwable e) { @@ -299,21 +315,13 @@ public JoinOutput process(PartitionedJoinInput event) { WorkerMetrics.Timer writeCostTimer = new WorkerMetrics.Timer().start(); PixelsWriter pixelsWriter; + // XXX: The post partition code below is adapted to the streaming protocol. + // Consider modifying the reader and writer code instead (good practice of layering) if (partitionOutput) { - List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) - .stream() - .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) - .collect(ImmutableList.toImmutableList()); - List outputEndpoints = downStreamWorkers.stream() - .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") - // .map(URI::create) - .collect(Collectors.toList()); // In partitioned mode, the schema is sent in an over-replicated manner: // every previous-stage worker (rather than one of them) sends a schema packet // before sending its intermediate data, to prevent errors from possibly out-of-order packet arrivals. - StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( @@ -431,7 +439,7 @@ public JoinOutput process(PartitionedJoinInput event) */ protected static void buildHashTable(long transId, Joiner joiner, List leftParts, String[] leftCols, Storage.Scheme leftScheme, List hashValues, int numPartition, - WorkerMetrics workerMetrics) throws IOException + WorkerMetrics workerMetrics, PixelsReader leftPixelsReader) throws IOException { // In streaming mode, numPartition is the total number of partition workers, i.e. the number of incoming packets. 
logger.debug("building hash table for the left table, partition paths: " + leftParts); @@ -441,10 +449,9 @@ protected static void buildHashTable(long transId, Joiner joiner, List l int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = leftPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); for (int hashValue : hashValues) @@ -511,7 +518,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l protected static int joinWithRightTable( long transId, Joiner joiner, List rightParts, String[] rightCols, Storage.Scheme rightScheme, List hashValues, int numPartition, ConcurrentLinkedQueue joinResult, - WorkerMetrics workerMetrics) throws IOException + WorkerMetrics workerMetrics, PixelsReader rightPixelsReader) throws IOException { int joinedRows = 0; WorkerMetrics.Timer readCostTimer = new WorkerMetrics.Timer(); @@ -520,10 +527,9 @@ protected static int joinWithRightTable( int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = rightPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); for (int hashValue : hashValues) @@ -545,7 +551,7 @@ protected static int joinWithRightTable( { if (!joined.isEmpty()) { - joinResult.add(joined); + joinResult.add(joined); // XXX: Can modify this into PixelsWriter.addRowBatch(), to further exploit the parallelism. 
joinedRows += joined.size; } } @@ -596,7 +602,7 @@ protected static int joinWithRightTable( protected static int joinWithRightTableAndPartition( long transId, Joiner joiner, List rightParts, String[] rightCols, Storage.Scheme rightScheme, List hashValues, int numPartition, PartitionInfo postPartitionInfo, - List> partitionResult, WorkerMetrics workerMetrics) throws IOException + List> partitionResult, WorkerMetrics workerMetrics, PixelsReader rightPixelsReader) throws IOException { requireNonNull(postPartitionInfo, "outputPartitionInfo is null"); Partitioner partitioner = new Partitioner(postPartitionInfo.getNumPartition(), @@ -608,10 +614,9 @@ protected static int joinWithRightTableAndPartition( int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = rightPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); // XXX: check that the hashValue in row group headers match the hashValue assigned to this worker From b398d10c10cae32e759a4ac1a332e69a71d55482 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 00:07:32 +0100 Subject: [PATCH 27/61] Fix hardcode of streaming port numbers --- .../worker/common/BasePartitionStreamWorker.java | 3 ++- .../common/BasePartitionedJoinStreamWorker.java | 14 ++++++++------ .../pixels/worker/common/StreamWorkerCommon.java | 3 +++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 9c6e7e805..1a3763fc1 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ 
b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -186,7 +186,8 @@ public PartitionOutput process(PartitionInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.isSmallTable() ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + + (event.isSmallTable() ? StreamWorkerCommon.STREAM_PORT_SMALL_TABLE : StreamWorkerCommon.STREAM_PORT_LARGE_TABLE)) // .map(URI::create) .collect(Collectors.toList()); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index ff2f59b2d..a6fa28a6f 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -168,9 +168,9 @@ public JoinOutput process(PartitionedJoinInput event) // Bootstrap the readers at once which is up all the time during the worker's lifetime, // to ensure immediate reception of intermediate data and avoid retries on the writer side. 
PixelsReader leftPixelsReader = StreamWorkerCommon.getReader( leftInputStorageInfo.getScheme(), - "http://localhost:18688/", true, event.getSmallPartitionWorkerNum()); + "http://localhost:" + StreamWorkerCommon.STREAM_PORT_SMALL_TABLE, true, event.getSmallPartitionWorkerNum()); PixelsReader rightPixelsReader = StreamWorkerCommon.getReader(rightInputStorageInfo.getScheme(), - "http://localhost:18686/", true, event.getLargePartitionWorkerNum()); + "http://localhost:" + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE, true, event.getLargePartitionWorkerNum()); // `registerWorker()` might awake the dependent workers, so it should be called just before / after // the current worker listens on its HTTP port and is ready to receive streaming packets. @@ -204,7 +204,9 @@ public JoinOutput process(PartitionedJoinInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
+ StreamWorkerCommon.STREAM_PORT_SMALL_TABLE : StreamWorkerCommon.STREAM_PORT_LARGE_TABLE)) // .map(URI::create) .collect(Collectors.toList()); if (partitionOutput) @@ -491,7 +493,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l { if (pixelsReader != null) { - logger.debug("closing pixels reader on port 18688"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_SMALL_TABLE); pixelsReader.close(); } } @@ -573,7 +575,7 @@ protected static int joinWithRightTable( { if (pixelsReader != null) { - logger.debug("closing pixels reader on port 18686"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE); pixelsReader.close(); } } @@ -666,7 +668,7 @@ protected static int joinWithRightTableAndPartition( { if (pixelsReader != null) { - logger.debug("closing pixels reader on 18686"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE); pixelsReader.close(); } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java index dd6ce3c10..e0d9d4bbf 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java @@ -46,6 +46,9 @@ public class StreamWorkerCommon extends WorkerCommon private static final Logger logger = LogManager.getLogger(StreamWorkerCommon.class); private static final Storage http = null; // placeholder. todo: modularize into a pixels-storage-stream module. 
+ public static final int STREAM_PORT_SMALL_TABLE = 18688; + public static final int STREAM_PORT_LARGE_TABLE = 18686; + public static void initStorage(StorageInfo storageInfo, Boolean isOutput) throws IOException { if (storageInfo.getScheme() == Storage.Scheme.httpstream) From 4d0fc87d36e9f4da9fd29be264ab2fbf9ae8c4e2 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 00:20:47 +0100 Subject: [PATCH 28/61] Format --- .../plan/physical/PartitionedJoinStreamOperator.java | 12 ++++++++---- .../worker/common/BasePartitionStreamWorker.java | 3 ++- .../common/BasePartitionedJoinStreamWorker.java | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index 8f1f7c9d3..5dd2f8a23 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -69,10 +69,12 @@ public CompletableFuture[]> execute() partitionInput.setPartitionId(i); } } - else if (smallChild != null) { + else if (smallChild != null) + { smallPartitionWorkerNum = smallChild.getJoinInputs().size(); } - else { + else + { throw new IllegalStateException("smallPartitionInputs and smallChild are both null"); } int largePartitionWorkerNum; @@ -85,10 +87,12 @@ else if (smallChild != null) { partitionInput.setPartitionId(i); } } - else if (largeChild != null) { + else if (largeChild != null) + { largePartitionWorkerNum = largeChild.getJoinInputs().size(); } - else { + else + { throw new IllegalStateException("largePartitionInputs and largeChild are both null"); } for (int i = 0; i < joinInputs.size(); ++i) diff --git 
a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 1a3763fc1..c5990bab2 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -208,7 +208,8 @@ public PartitionOutput process(PartitionInput event) pixelsWriter.addRowBatch(batch, hash); } } - else { + else + { pixelsWriter.addRowBatch(null, hash); } hashValues.add(hash); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index a6fa28a6f..0ece6d760 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -339,7 +339,8 @@ public JoinOutput process(PartitionedJoinInput event) pixelsWriter.addRowBatch(batch, hash); } } - else { + else + { pixelsWriter.addRowBatch(null, hash); } } From 958dd513ad9a18c8107a8499b02f3f1f82713ef8 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 22:19:26 +0100 Subject: [PATCH 29/61] Modify Pixels stream writer to retry connection at 100ms interval --- .../java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java index 53359f237..91cdcf2f5 100644 --- 
a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java @@ -791,8 +791,8 @@ private void writeRowGroup() throws IOException try { outstandingHTTPRequestSemaphore.acquire(); - int maxAttempts = 30000; - long backoffMillis = 10; + int maxAttempts = 3000; + long backoffMillis = 100; int attempt = 0; boolean success = false; From 001e3ca8f56f16699735d1334431673c619d138a Mon Sep 17 00:00:00 2001 From: jasha64 Date: Fri, 18 Oct 2024 00:04:39 +0200 Subject: [PATCH 30/61] Minor fix --- .../pixels/core/PixelsReaderStreamImpl.java | 24 +++++++++---------- .../src/main/resources/log4j2.properties | 0 2 files changed, 12 insertions(+), 12 deletions(-) rename pixels-turbo/{pixels-worker-vhive => pixels-worker-common}/src/main/resources/log4j2.properties (100%) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 581da9b4f..d6f5e5631 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -151,13 +151,6 @@ public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) ", partition ID header: " + req.headers().get("X-Partition-Id") + ", HTTP request object body total length: " + req.content().readableBytes()); - // schema packet: only 1 packet expected, so close the connection immediately - // partitioned mode: close the connection if all partitions received - // else (non-partitioned mode, data packet): close connection if empty packet received - boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || - (partitioned && numPartitionsReceived.get() == numPartitions) || - (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && - req.content().readableBytes() == 0); ByteBuf 
byteBuf = req.content(); try { @@ -178,7 +171,7 @@ public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) catch (IOException e) { logger.error("Invalid stream header values: ", e); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } } @@ -193,7 +186,7 @@ else if (partitioned) catch (InvalidProtocolBufferException | IndexOutOfBoundsException e) { logger.error("Malformed or corrupted stream header", e); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } @@ -209,7 +202,7 @@ else if (partitioned) if (partitionId < 0 || partitionId >= numPartitions) { logger.warn("Client sent invalid partitionId value: " + partitionId); - sendResponseAndClose(ctx, req, BAD_REQUEST, needCloseParentChannel); + sendResponseAndClose(ctx, req, BAD_REQUEST, false); return; } byteBufBlockingMap.put(partitionId, byteBuf); @@ -222,6 +215,13 @@ else if (partitioned) } } + // schema packet: only 1 packet expected, so close the connection immediately + // partitioned mode: close the connection if all partitions received + // else (non-partitioned mode, data packet): close connection if empty packet received + boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || + (partitioned && numPartitionsReceived.get() == numPartitions) || + (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && + req.content().readableBytes() == 0); sendResponseAndClose(ctx, req, HttpResponseStatus.OK, needCloseParentChannel); } @@ -539,11 +539,11 @@ public void close() try { - if (!this.httpServerFuture.isDone()) this.httpServerFuture.get(5, TimeUnit.SECONDS); + if (!this.httpServerFuture.isDone()) this.httpServerFuture.get(300, TimeUnit.SECONDS); } catch (TimeoutException e) { - logger.warn("In close(), HTTP server did not shut down in 5 seconds, doing forceful shutdown"); + 
logger.warn("In close(), HTTP server did not shut down in 300 seconds, doing forceful shutdown"); this.httpServerFuture.cancel(true); } catch (InterruptedException | ExecutionException e) diff --git a/pixels-turbo/pixels-worker-vhive/src/main/resources/log4j2.properties b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties similarity index 100% rename from pixels-turbo/pixels-worker-vhive/src/main/resources/log4j2.properties rename to pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties From 2bc7ce209468e7f5d304b40acd2bbd4650b11edd Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:21:08 +0200 Subject: [PATCH 31/61] log in PixelsReaderStreamImpl --- .../java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index d6f5e5631..4c8125c8f 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -84,6 +84,7 @@ public class PixelsReaderStreamImpl implements PixelsReader // In partitioned mode, we use byteBufBlockingMap to map hash value to corresponding ByteBuf private final BlockingMap byteBufBlockingMap; private final boolean partitioned; + private final int httpPort; private final AtomicReference numPartitionsReceived = new AtomicReference<>(0); private final List recordReaders; @@ -113,7 +114,7 @@ public PixelsReaderStreamImpl(String endpoint, boolean partitioned, int numParti this.streamHeader = null; URI uri = new URI(endpoint); String IP = uri.getHost(); - int httpPort = uri.getPort(); + this.httpPort = uri.getPort(); logger.debug("In Pixels stream reader constructor, IP: " + IP + ", port: " + httpPort + ", partitioned: " + partitioned + ", numPartitions: " + 
numPartitions); if (!Objects.equals(IP, "127.0.0.1") && !Objects.equals(IP, "localhost")) @@ -543,7 +544,7 @@ public void close() } catch (TimeoutException e) { - logger.warn("In close(), HTTP server did not shut down in 300 seconds, doing forceful shutdown"); + logger.warn("In close(), HTTP server on port " + httpPort + " did not shut down in 300 seconds, doing forceful shutdown"); this.httpServerFuture.cancel(true); } catch (InterruptedException | ExecutionException e) From 95c982024e147c6741c9e390a723e6fb0e2c23b4 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:38:32 +0200 Subject: [PATCH 32/61] Add isSmallTable in partition input, to fix a hardcode in base partition stream worker --- .../io/pixelsdb/pixels/planner/PixelsPlanner.java | 14 ++++++++++++++ .../plan/physical/input/PartitionInput.java | 14 ++++++++++++++ .../worker/common/BasePartitionStreamWorker.java | 7 +------ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index e9817470e..8f3fa7ff6 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -970,6 +970,10 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) rightTable, rightInputSplits, rightKeyColumnIds, rightPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + rightTable.getTableName() + "/"); + for (PartitionInput rightPartitionInput : rightPartitionInputs) + { + rightPartitionInput.setSmallTable(join.getJoinEndian() != JoinEndian.SMALL_LEFT); + } PartitionedTableInfo rightTableInfo = getPartitionedTableInfo( rightTable, rightKeyColumnIds, rightPartitionInputs, rightPartitionProjection); @@ -1005,6 +1009,11 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) 
leftTable, leftInputSplits, leftKeyColumnIds, leftPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + leftTable.getTableName() + "/"); + for (PartitionInput leftPartitionInput : leftPartitionInputs) + { + leftPartitionInput.setSmallTable(join.getJoinEndian() == JoinEndian.SMALL_LEFT); + } + PartitionedTableInfo leftTableInfo = getPartitionedTableInfo( leftTable, leftKeyColumnIds, leftPartitionInputs, leftPartitionProjection); @@ -1013,6 +1022,11 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) rightTable, rightInputSplits, rightKeyColumnIds, rightPartitionProjection, numPartition, getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/" + rightTable.getTableName() + "/"); + for (PartitionInput rightPartitionInput : rightPartitionInputs) + { + rightPartitionInput.setSmallTable(join.getJoinEndian() != JoinEndian.SMALL_LEFT); + } + PartitionedTableInfo rightTableInfo = getPartitionedTableInfo( rightTable, rightKeyColumnIds, rightPartitionInputs, rightPartitionProjection); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index 99999e59b..2a86df184 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -51,6 +51,10 @@ public class PartitionInput extends Input * The information about the hash partitioning. */ private PartitionInfo partitionInfo; + /** + * Whether this table is the small table in a join. + */ + private boolean isSmallTable; /** * Default constructor for Jackson. 
@@ -109,4 +113,14 @@ public void setPartitionInfo(PartitionInfo partitionInfo) { this.partitionInfo = partitionInfo; } + + public boolean isSmallTable() + { + return isSmallTable; + } + + public void setSmallTable(boolean isSmallTable) + { + this.isSmallTable = isSmallTable; + } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 35a5e4b23..fdcc096f1 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -184,14 +184,9 @@ public PartitionOutput process(PartitionInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" - + (Objects.equals(event.getTableInfo().getTableName(), "part") ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + (event.isSmallTable() ? "18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); - // todo: Need to pass whether the table is the large table or the small table here into the partition worker. - // Perhaps add a boolean field in the PartitionInput class. - // Currently, we hardcode the table name for TPC-H Q14 - the large table (rightTable for join) uses port 18686 - // while the small table (leftTable for join) uses port 18688. 
StreamWorkerCommon.passSchemaToNextLevel(writerSchema.get(), outputStorageInfo, outputEndpoints); PixelsWriter pixelsWriter = StreamWorkerCommon.getWriter(writerSchema.get(), From ad1a97c06c7983d12cc903beee61e0fcdb8fc7b6 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 13:42:41 +0200 Subject: [PATCH 33/61] Fix log4j2.properties in pixels-turbo/pixels-worker-common --- .../src/main/resources/log4j2.properties | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties index fe8abaa4b..61c9e552b 100644 --- a/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties +++ b/pixels-turbo/pixels-worker-common/src/main/resources/log4j2.properties @@ -1,11 +1,11 @@ -name=pixels-worker-vhive +name=pixels-worker-common status=warn shutdownHook=disable rootLogger.level=info rootLogger.appenderRef.stdout.ref=STDOUT rootLogger.appenderRef.log.ref=log -filter.threshold.type=ThresholdFilter -filter.threshold.level=info +logger.pixelsdb.name=io.pixelsdb.pixels +logger.pixelsdb.level=info appender.console.type=Console appender.console.name=STDOUT appender.console.layout.type=PatternLayout @@ -13,7 +13,7 @@ appender.console.layout.pattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%c]-[%p] %m%n appender.rolling.type=File appender.rolling.name=log appender.rolling.append=true -appender.rolling.fileName=pixels-worker-vhive.log +appender.rolling.fileName=pixels-worker-common.log appender.rolling.layout.type=PatternLayout appender.rolling.layout.pattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%c]-[%p] %m%n From a3a4da7fb6c650b2a91931bb5a2869ca37d87afd Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 24 Oct 2024 18:15:05 +0200 Subject: [PATCH 34/61] Also fix hardcode in BasePartitionStreamWorker --- .../pixels/worker/common/BasePartitionStreamWorker.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index fdcc096f1..892760231 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -69,10 +69,8 @@ public BasePartitionStreamWorker(WorkerContext context) super(context); this.logger = context.getLogger(); this.workerMetrics = context.getWorkerMetrics(); - this.workerCoordinateService = new WorkerCoordinateService("128.110.218.225", 18894); - // Hardcoded for Cloudlab. todo: Need to figure out how to get the daemon IP dynamically. - // Perhaps add a field in the WorkerContext class to store the daemon IP, - // or to have the Pixels planner pass the daemon IP in the Input. + this.workerCoordinateService = new WorkerCoordinateService( + StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); } @Override From 28142da67ecdd89ca0ab0d6e042dfd8ff41e25a2 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sun, 27 Oct 2024 19:02:13 +0100 Subject: [PATCH 35/61] Fix bug in BasePartitionedJoinStreamWorker --- .../BasePartitionedJoinStreamWorker.java | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 6036f61ca..f17660a88 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -527,17 +527,8 @@ 
protected static int joinWithRightTable( pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - Set rightHashValues = new HashSet<>(numPartition); - for (int hashValue = 0; hashValue < numPartition; ++hashValue) - { - rightHashValues.add(hashValue); - } for (int hashValue : hashValues) { - if (!rightHashValues.contains(hashValue)) - { - continue; - } PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, hashValue, numPartition); VectorizedRowBatch rowBatch; @@ -646,17 +637,8 @@ protected static int joinWithRightTableAndPartition( pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - Set rightHashValues = new HashSet<>(numPartition); - for (int hashValue = 0; hashValue < numPartition; ++hashValue) - { - rightHashValues.add(hashValue); - } for (int hashValue : hashValues) { - if (!rightHashValues.contains(hashValue)) - { - continue; - } PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, hashValue, numPartition); VectorizedRowBatch rowBatch; From 4622503e443908009d87a9bbefb9af4defa42170 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sun, 27 Oct 2024 19:06:00 +0100 Subject: [PATCH 36/61] Use -2 as numPartitions when getting schema readers --- .../java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 4c8125c8f..9cac57272 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ 
b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -99,7 +99,7 @@ public class PixelsReaderStreamImpl implements PixelsReader public PixelsReaderStreamImpl(String endpoint) throws Exception { - this(endpoint, false, -1); + this(endpoint, false, -2); } public PixelsReaderStreamImpl(int port) throws Exception From 5f3346a673d600388aff2c7f0685b73500f1df6e Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 29 Oct 2024 15:48:29 +0100 Subject: [PATCH 37/61] Also fix hardcode in BasePartitionedJoinStreamWorker --- .../pixels/worker/common/BasePartitionedJoinStreamWorker.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index f17660a88..d7304c74a 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -66,7 +66,8 @@ public BasePartitionedJoinStreamWorker(WorkerContext context) // this.logger = context.getLogger(); this.workerMetrics = context.getWorkerMetrics(); this.workerMetrics.clear(); - this.workerCoordinateService = new WorkerCoordinateService("128.110.218.225", 18894); + this.workerCoordinateService = new WorkerCoordinateService( + StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); } @Override From 189a751a81e6886e63c37e50418dea0250f72144 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 19:56:27 +0100 Subject: [PATCH 38/61] Add postPartitionId and postPartitionIsSmallTable in JoinInfo, to extend streaming support to post partition of joins --- .../pixels/planner/PixelsPlanner.java | 7 +++-- .../PartitionedJoinStreamOperator.java | 3 +- 
.../plan/physical/domain/JoinInfo.java | 29 +++++++++++++++++++ .../physical/domain/PartitionedJoinInfo.java | 12 ++++++++ .../BasePartitionedJoinStreamWorker.java | 19 +++++++++--- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 8f3fa7ff6..4099f1620 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1353,6 +1353,7 @@ private List getPartitionedJoinInputs( { boolean postPartition = false; PartitionInfo postPartitionInfo = null; + boolean postPartitionIsSmallTable = false; if (parent.isPresent() && parent.get().getJoin().getJoinAlgo() == JoinAlgorithm.PARTITIONED) { postPartition = true; @@ -1367,10 +1368,12 @@ private List getPartitionedJoinInputs( if (joinedTable == parent.get().getJoin().getLeftTable()) { postPartitionInfo = new PartitionInfo(parent.get().getJoin().getLeftKeyColumnIds(), numPostPartition); + postPartitionIsSmallTable = parent.get().getJoin().getJoinEndian() == JoinEndian.SMALL_LEFT; } else { postPartitionInfo = new PartitionInfo(parent.get().getJoin().getRightKeyColumnIds(), numPostPartition); + postPartitionIsSmallTable = parent.get().getJoin().getJoinEndian() != JoinEndian.SMALL_LEFT; } } @@ -1400,7 +1403,7 @@ private List getPartitionedJoinInputs( { PartitionedJoinInfo joinInfo = new PartitionedJoinInfo(joinedTable.getJoin().getJoinType(), joinedTable.getJoin().getLeftColumnAlias(), joinedTable.getJoin().getRightColumnAlias(), - leftProjection, rightProjection, postPartition, postPartitionInfo, numPartition, ImmutableList.of(i)); + leftProjection, rightProjection, postPartition, postPartitionInfo, postPartitionIsSmallTable, numPartition, ImmutableList.of(i)); joinInput = new PartitionedJoinInput(transId, timestamp, leftTableInfo, rightTableInfo, 
joinInfo, false, null, output); } @@ -1408,7 +1411,7 @@ private List getPartitionedJoinInputs( { PartitionedJoinInfo joinInfo = new PartitionedJoinInfo(joinedTable.getJoin().getJoinType().flip(), joinedTable.getJoin().getRightColumnAlias(), joinedTable.getJoin().getLeftColumnAlias(), - rightProjection, leftProjection, postPartition, postPartitionInfo, numPartition, ImmutableList.of(i)); + rightProjection, leftProjection, postPartition, postPartitionInfo, postPartitionIsSmallTable, numPartition, ImmutableList.of(i)); joinInput = new PartitionedJoinInput(transId, timestamp, rightTableInfo, leftTableInfo, joinInfo, false, null, output); } diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index b8d16747b..f17a1f9cd 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -27,6 +27,7 @@ import io.pixelsdb.pixels.planner.coordinate.PlanCoordinatorFactory; import io.pixelsdb.pixels.planner.plan.physical.input.JoinInput; import io.pixelsdb.pixels.planner.plan.physical.input.PartitionInput; +import io.pixelsdb.pixels.planner.plan.physical.input.PartitionedJoinInput; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -107,7 +108,7 @@ else if (smallChild != null) for (PartitionInput partitionInput : largePartitionInputs) { largePartitionOutputs[i++] = InvokerFactory.Instance() - .getInvoker(WorkerType.PARTITION_STREAMING).invoke((partitionInput)); + .getInvoker(WorkerType.PARTITION_STREAMING).invoke(partitionInput); } logger.debug("invoke large partition of " + this.getName()); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java index cbc6aea5a..a7cce5b7d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/JoinInfo.java @@ -59,6 +59,15 @@ public class JoinInfo * The partition information of the output if outputPartitioned is true. */ private PartitionInfo postPartitionInfo; + /** + * The partition id of this worker in post partition if outputPartitioned is true. + */ + private int postPartitionId; + /** + * Whether this table is the small table in the next-level join. This is used to determine the HTTP port + * when using streaming. + */ + private boolean postPartitionIsSmallTable; /** * Default constructor for Jackson. @@ -147,4 +156,24 @@ public void setPostPartitionInfo(PartitionInfo postPartitionInfo) { this.postPartitionInfo = postPartitionInfo; } + + public int getPostPartitionId() + { + return postPartitionId; + } + + public void setPostPartitionId(int postPartitionId) + { + this.postPartitionId = postPartitionId; + } + + public boolean getPostPartitionIsSmallTable() + { + return postPartitionIsSmallTable; + } + + public void setPostPartitionIsSmallTable(boolean postPartitionIsSmallTable) + { + this.postPartitionIsSmallTable = postPartitionIsSmallTable; + } } diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java index d5f939ff2..eb9dad94d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/domain/PartitionedJoinInfo.java @@ -53,6 +53,18 @@ public PartitionedJoinInfo(JoinType joinType, String[] smallColumnAlias, String[ this.hashValues = hashValues; } + public 
PartitionedJoinInfo(JoinType joinType, String[] smallColumnAlias, String[] largeColumnAlias, + boolean[] smallProjection, boolean[] largeProjection, boolean postPartition, + PartitionInfo postPartitionInfo, boolean postPartitionIsSmallTable, + int numPartition, List hashValues) + { + super(joinType, smallColumnAlias, largeColumnAlias, smallProjection, largeProjection, + postPartition, postPartitionInfo); + this.numPartition = numPartition; + this.hashValues = hashValues; + this.setPostPartitionIsSmallTable(postPartitionIsSmallTable); + } + public int getNumPartition() { return numPartition; diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index d7304c74a..bb0623065 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -19,6 +19,7 @@ */ package io.pixelsdb.pixels.worker.common; +import com.google.common.collect.ImmutableList; import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.core.PixelsReader; import io.pixelsdb.pixels.core.PixelsWriter; @@ -296,11 +297,20 @@ public JoinOutput process(PartitionedJoinInput event) PixelsWriter pixelsWriter; if (partitionOutput) { + List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) + .stream() + .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) + .collect(ImmutableList.toImmutableList()); + List outputEndpoints = downStreamWorkers.stream() + .map(CFWorkerInfo::getIp) + .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") + // .map(URI::create) + .collect(Collectors.toList()); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, - encoding, true, -1, Arrays.stream( + encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( outputPartitionInfo.getKeyColumnIds()).boxed(). - collect(Collectors.toList())); + collect(Collectors.toList()), outputEndpoints, false); for (int hash = 0; hash < outputPartitionInfo.getNumPartition(); ++hash) { ConcurrentLinkedQueue batches = result.get(hash); @@ -346,9 +356,10 @@ public JoinOutput process(PartitionedJoinInput event) requireNonNull(outputPartitionInfo, "outputPartitionInfo is null"); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, - encoding, true, -1, Arrays.stream( + encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( outputPartitionInfo.getKeyColumnIds()).boxed(). - collect(Collectors.toList())); + collect(Collectors.toList())); // , outputEndpoints, false); + // TODO: Adapt the left-outer tail to streaming mode. 
joiner.writeLeftOuterAndPartition(pixelsWriter, StreamWorkerCommon.rowBatchSize, outputPartitionInfo.getNumPartition(), outputPartitionInfo.getKeyColumnIds()); } From 222145c536094560a1053eda6c29b13fa0b7215e Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:03:45 +0100 Subject: [PATCH 39/61] Add postPartitionId and postPartitionIsSmallTable in JoinInfo, to extend streaming support to post partition of joins --- .../planner/plan/physical/PartitionedJoinStreamOperator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index f17a1f9cd..e4145ca63 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -63,7 +63,8 @@ public CompletableFuture[]> execute() { JoinInput joinInput = joinInputs.get(i); joinInput.setSmallPartitionWorkerNum(smallPartitionInputs.size()); // XXX: could be 0 - joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); + joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); // XXX: Can do this in PixelsPlanner + ((PartitionedJoinInput)joinInput).getJoinInfo().setPostPartitionId(i); if (joinAlgo == JoinAlgorithm.PARTITIONED) { joinOutputs[i] = InvokerFactory.Instance() From 2bf5f52d213464f38209c13c30ca6d921f52f71b Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:05:22 +0100 Subject: [PATCH 40/61] Comments --- .../pixels/planner/plan/physical/input/PartitionInput.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index 2a86df184..313013c5c 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -52,7 +52,8 @@ public class PartitionInput extends Input */ private PartitionInfo partitionInfo; /** - * Whether this table is the small table in a join. + * Whether this table is the small table in the next-level join. This is used to determine the HTTP port + * when using streaming. */ private boolean isSmallTable; From 83bce2c08c91a38db8c0098d590709b30b8aa428 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:05:38 +0100 Subject: [PATCH 41/61] Comments --- .../pixels/worker/common/BasePartitionStreamWorker.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 892760231..0b2c11b0e 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -71,6 +71,10 @@ public BasePartitionStreamWorker(WorkerContext context) this.workerMetrics = context.getWorkerMetrics(); this.workerCoordinateService = new WorkerCoordinateService( StreamWorkerCommon.getCoordinatorIp(), StreamWorkerCommon.getCoordinatorPort()); + // In cloud functions, configuration files "pixels.properties" are not present, and so the pre-packaged + // configuration file "pixels-common/src/main/resources/pixels.properties" will be used during runtime. 
+ // Therefore, you need to modify the coordinator host and port in the pre-packaged configuration file on localhost + // where you rebuild the Docker image. } @Override From c516ecbf9c396f1f10136d5cf715743ef9c8da44 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Sat, 2 Nov 2024 20:09:39 +0100 Subject: [PATCH 42/61] Interconnection between workers and stream workers, by modifying storage infos in PixelsPlanner --- .../pixels/planner/PixelsPlanner.java | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 4099f1620..5be7884a4 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -62,6 +62,7 @@ public class PixelsPlanner private static final Logger logger = LogManager.getLogger(PixelsPlanner.class); private static final StorageInfo InputStorageInfo; private static final StorageInfo IntermediateStorageInfo; + private static final StorageInfo IntermediateStreamStorageInfo; // Streaming only used between partition worker -> partitioned join worker private static final String IntermediateFolder; private static final int IntraWorkerParallelism; private static final ExchangeMethod EnabledExchangeMethod; @@ -91,10 +92,10 @@ public class PixelsPlanner ConfigFactory.Instance().getProperty("executor.input.storage.scheme")); InputStorageInfo = StorageInfoBuilder.BuildFromConfig(inputStorageScheme); - Storage.Scheme interStorageScheme = EnabledExchangeMethod == ExchangeMethod.batch ? 
- Storage.Scheme.from(ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")) : - Storage.Scheme.valueOf("httpstream"); + Storage.Scheme interStorageScheme = Storage.Scheme.from( + ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")); IntermediateStorageInfo = StorageInfoBuilder.BuildFromConfig(interStorageScheme); + IntermediateStreamStorageInfo = StorageInfoBuilder.BuildFromConfig(Storage.Scheme.valueOf("httpstream")); String interStorageFolder = ConfigFactory.Instance().getProperty("executor.intermediate.folder"); if (!interStorageFolder.endsWith("/")) { @@ -285,7 +286,7 @@ else if (originTable.getTableType() == Table.TableType.JOINED) joinInput.setPartialAggregationInfo(partialAggregationInfo); String fileName = "partial_aggr_" + outputId++; MultiOutputInfo outputInfo = joinInput.getOutput(); - outputInfo.setStorageInfo(IntermediateStorageInfo); + outputInfo.setStorageInfo(IntermediateStorageInfo); // IntermediateStreamStorageInfo? outputInfo.setPath(intermediateBase); outputInfo.setFileNames(ImmutableList.of(fileName)); aggrInputFilesBuilder.add(intermediateBase + fileName); @@ -527,12 +528,14 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : IntermediateStorageInfo, + leftIsBase ? InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean rightIsBase = rightTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo rightTableInfo = new PartitionedTableInfo( rightTable.getTableName(), rightIsBase, rightTable.getColumnNames(), - rightIsBase ? InputStorageInfo : IntermediateStorageInfo, + rightIsBase ? 
InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), rightPartitionedFiles, IntraWorkerParallelism, rightKeyColumnIds); int numPartition = PlanOptimizer.Instance().getJoinNumPartition( @@ -961,7 +964,8 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : IntermediateStorageInfo, + leftIsBase ? InputStorageInfo : + (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), // ??? leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean[] rightPartitionProjection = getPartitionProjection(rightTable, join.getRightProjection()); @@ -1278,13 +1282,19 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStreamStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { return new PartitionedTableInfo(table.getTableName(), false, - newColumnsToRead, IntermediateStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); + // XXX: This only applies to joined tables, when the current join reads table from a post partition of a previous join. + // If the table type is AGGREGATED, we should use IntermediateStorageInfo. 
} } @@ -1320,11 +1330,13 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, + postPartition && EnabledExchangeMethod == ExchangeMethod.stream ? IntermediateStreamStorageInfo : IntermediateStorageInfo, + true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From e8d7a554a80ce4b2ad27f167dae7a9c3e5280edf Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 16:51:17 +0100 Subject: [PATCH 43/61] VhiveInvoker should not call blocking GRPC in its constructor --- .../pixels/invoker/vhive/VhiveInvoker.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java b/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java index cc2428be1..246dc16ff 100644 --- a/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java +++ b/pixels-turbo/pixels-invoker-vhive/src/main/java/io/pixelsdb/pixels/invoker/vhive/VhiveInvoker.java @@ -33,21 +33,23 @@ public abstract class VhiveInvoker implements Invoker { private static final Logger logger = LogManager.getLogger(VhiveInvoker.class); private final String functionName; - private final int memoryMB; + private int memoryMB; protected VhiveInvoker(String functionName) { this.functionName = functionName; - int memoryMB = 0; - try - { - TurboProto.GetMemoryResponse response = 
Vhive.Instance().getAsyncClient().getMemory().get(); - memoryMB = (int) response.getMemoryMB(); - } catch (Exception e) - { - logger.error("failed to get memory: " + e); - } - this.memoryMB = memoryMB; + new Thread(() -> { + int memoryMB = 0; + try + { + TurboProto.GetMemoryResponse response = Vhive.Instance().getAsyncClient().getMemory().get(); + memoryMB = (int) response.getMemoryMB(); + } catch (Exception e) + { + logger.error("failed to get memory: " + e); + } + this.memoryMB = memoryMB; + }).start(); } @Override From 5842401b7709eed39faacf2a7f74d73b427c2820 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:16:30 +0100 Subject: [PATCH 44/61] Fix smallPartitionWorkerNum and largePartitionWorkerNum in PartitionedJoinStreamOperator --- .../PartitionedJoinStreamOperator.java | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index e4145ca63..ad979f423 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -59,11 +59,33 @@ public CompletableFuture[]> execute() { // First, bootstrap the join workers. 
joinOutputs = new CompletableFuture[joinInputs.size()]; + int smallPartitionWorkerNum; + if (!smallPartitionInputs.isEmpty()) + { + smallPartitionWorkerNum = smallPartitionInputs.size(); + } + else if (smallChild != null) { + smallPartitionWorkerNum = smallChild.getJoinInputs().size(); + } + else { + throw new IllegalStateException("smallPartitionInputs and smallChild are both null"); + } + int largePartitionWorkerNum; + if (!largePartitionInputs.isEmpty()) + { + largePartitionWorkerNum = largePartitionInputs.size(); + } + else if (largeChild != null) { + largePartitionWorkerNum = largeChild.getJoinInputs().size(); + } + else { + throw new IllegalStateException("largePartitionInputs and largeChild are both null"); + } for (int i = 0; i < joinInputs.size(); ++i) { JoinInput joinInput = joinInputs.get(i); - joinInput.setSmallPartitionWorkerNum(smallPartitionInputs.size()); // XXX: could be 0 - joinInput.setLargePartitionWorkerNum(largePartitionInputs.size()); // XXX: Can do this in PixelsPlanner + joinInput.setSmallPartitionWorkerNum(smallPartitionWorkerNum); + joinInput.setLargePartitionWorkerNum(largePartitionWorkerNum); // XXX: Can do this in PixelsPlanner ((PartitionedJoinInput)joinInput).getJoinInfo().setPostPartitionId(i); if (joinAlgo == JoinAlgorithm.PARTITIONED) { From 2d85258dc6e32d710fcc7dd47f9217a121e9f9ca Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:20:46 +0100 Subject: [PATCH 45/61] Bug fix, support multiple partition workers --- .../pixels/core/PixelsReaderStreamImpl.java | 6 +- .../BasePartitionedJoinStreamWorker.java | 324 +++++++----------- 2 files changed, 134 insertions(+), 196 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 9cac57272..5078434a0 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ 
b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -216,12 +216,12 @@ else if (partitioned) } } - // schema packet: only 1 packet expected, so close the connection immediately + // schema reader: only 1 packet expected, so close the connection immediately // partitioned mode: close the connection if all partitions received // else (non-partitioned mode, data packet): close connection if empty packet received - boolean needCloseParentChannel = partitionId == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || + boolean needCloseParentChannel = numPartitions == PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER || (partitioned && numPartitionsReceived.get() == numPartitions) || - (Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && + (numPartitions == -1 && Objects.equals(req.headers().get(CONNECTION), CLOSE.toString()) && req.content().readableBytes() == 0); sendResponseAndClose(ctx, req, HttpResponseStatus.OK, needCloseParentChannel); } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index bb0623065..7fc83cbaa 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -96,7 +96,11 @@ public JoinOutput process(PartitionedJoinInput event) List leftPartitioned = event.getSmallTable().getInputFiles(); requireNonNull(leftPartitioned, "leftPartitioned is null"); checkArgument(leftPartitioned.size() > 0, "leftPartitioned is empty"); - int leftParallelism = event.getSmallTable().getParallelism(); + int leftParallelism = 1; // event.getSmallTable().getParallelism(); + // todo: Intra-worker parallelism support in streaming mode + // 
Currently, we only support an intra-worker parallelism of 1 (no parallelism) in streaming mode. + // Need to allow each join worker to use multiple ports to read input in parallel, so as to + // build the hash table in parallel, thus achieving intra-worker parallelism. checkArgument(leftParallelism > 0, "leftParallelism is not positive"); String[] leftColumnsToRead = event.getSmallTable().getColumnsToRead(); int[] leftKeyColumnIds = event.getSmallTable().getKeyColumnIds(); @@ -106,7 +110,7 @@ public JoinOutput process(PartitionedJoinInput event) List rightPartitioned = event.getLargeTable().getInputFiles(); requireNonNull(rightPartitioned, "rightPartitioned is null"); checkArgument(rightPartitioned.size() > 0, "rightPartitioned is empty"); - int rightParallelism = event.getLargeTable().getParallelism(); + int rightParallelism = 1; // event.getLargeTable().getParallelism(); checkArgument(rightParallelism > 0, "rightParallelism is not positive"); String[] rightColumnsToRead = event.getLargeTable().getColumnsToRead(); int[] rightKeyColumnIds = event.getLargeTable().getKeyColumnIds(); @@ -202,8 +206,6 @@ public JoinOutput process(PartitionedJoinInput event) for (int i = 0; i < leftPartitioned.size(); i += leftSplitSize) { List parts = new LinkedList<>(); - // XXX: Can allow 1 join worker to use multiple ports to read input in parallel, so as to - // build the hash table in parallel. for (int j = i; j < i + leftSplitSize && j < leftPartitioned.size(); ++j) { parts.add(leftPartitioned.get(j)); @@ -306,6 +308,7 @@ public JoinOutput process(PartitionedJoinInput event) .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); + StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( @@ -428,73 +431,51 @@ protected static void buildHashTable(long transId, Joiner joiner, List l WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!leftParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = leftParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + for (int hashValue : hashValues) { - String leftPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try - { - pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) - { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, leftCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - // XXX: perhaps do not need to re-initialize the record reader for each hash value. 
- if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do - { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) - { - joiner.populateLeftTable(rowBatch); - } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - leftPartitioned + "' and build the hash table", e); - } - finally + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, leftCols, pixelsReader, + hashValue, numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + // XXX: perhaps do not need to re-initialize the record reader for each hash value. 
+ if (recordReader == null) continue; + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - if (pixelsReader != null) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - logger.debug("closing pixels reader"); - pixelsReader.close(); + joiner.populateLeftTable(rowBatch); } - } + } while (!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!leftParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and build the hash table", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } workerMetrics.addReadBytes(readBytes); @@ -527,79 +508,57 @@ protected static int joinWithRightTable( WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!rightParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = rightParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + for (int hashValue : hashValues) { - String rightPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, + hashValue, 
numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do + List joinedBatches = joiner.join(rowBatch); + for (VectorizedRowBatch joined : joinedBatches) { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) + if (!joined.isEmpty()) { - List joinedBatches = joiner.join(rowBatch); - for (VectorizedRowBatch joined : joinedBatches) - { - if (!joined.isEmpty()) - { - joinResult.add(joined); - joinedRows += joined.size; - } - } + joinResult.add(joined); + joinedRows += joined.size; } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - rightPartitioned + "' and do the join", e); - } - finally - { - if (pixelsReader != null) - { - logger.debug("closing pixels reader"); - pixelsReader.close(); + } } - } + } while 
(!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!rightParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and do the join", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } workerMetrics.addReadBytes(readBytes); @@ -637,84 +596,63 @@ protected static int joinWithRightTableAndPartition( WorkerMetrics.Timer computeCostTimer = new WorkerMetrics.Timer(); long readBytes = 0L; int numReadRequests = 0; - while (!rightParts.isEmpty()) + + readCostTimer.start(); + PixelsReader pixelsReader = null; + try { - for (Iterator it = rightParts.iterator(); it.hasNext(); ) + pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); + readCostTimer.stop(); + checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); + // XXX: check that the hashValue in row group headers match the hashValue assigned to this worker + for (int hashValue : hashValues) { - String rightPartitioned = it.next(); - readCostTimer.start(); - PixelsReader pixelsReader = null; - try + PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, + hashValue, numPartition); + VectorizedRowBatch rowBatch; + PixelsRecordReader recordReader = pixelsReader.read(option); + if (recordReader == null) continue; + checkArgument(recordReader.isValid(), "failed to get record reader"); + + computeCostTimer.start(); + do { - pixelsReader = 
StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); - readCostTimer.stop(); - checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); - for (int hashValue : hashValues) + rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); + if (rowBatch.size > 0) { - PixelsReaderOption option = StreamWorkerCommon.getReaderOption(transId, rightCols, pixelsReader, - hashValue, numPartition); - VectorizedRowBatch rowBatch; - PixelsRecordReader recordReader = pixelsReader.read(option); - if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); - - computeCostTimer.start(); - do + List joinedBatches = joiner.join(rowBatch); + for (VectorizedRowBatch joined : joinedBatches) { - rowBatch = recordReader.readBatch(StreamWorkerCommon.rowBatchSize); - if (rowBatch.size > 0) + if (!joined.isEmpty()) { - List joinedBatches = joiner.join(rowBatch); - for (VectorizedRowBatch joined : joinedBatches) + Map parts = partitioner.partition(joined); + for (Map.Entry entry : parts.entrySet()) { - if (!joined.isEmpty()) - { - Map parts = partitioner.partition(joined); - for (Map.Entry entry : parts.entrySet()) - { - partitionResult.get(entry.getKey()).add(entry.getValue()); - } - joinedRows += joined.size; - } + partitionResult.get(entry.getKey()).add(entry.getValue()); } + joinedRows += joined.size; } - } while (!rowBatch.endOfFile); - computeCostTimer.stop(); - computeCostTimer.minus(recordReader.getReadTimeNanos()); - readCostTimer.add(recordReader.getReadTimeNanos()); - readBytes += recordReader.getCompletedBytes(); - numReadRequests += recordReader.getNumReadRequests(); - } - it.remove(); - } - catch (Throwable e) - { - if (e instanceof IOException) - { - continue; - } - throw new WorkerException("failed to scan the partitioned file '" + - rightPartitioned + "' and do the join", e); - } - finally - { - if (pixelsReader != null) - { - logger.debug("closing pixels reader"); - 
pixelsReader.close(); + } } - } + } while (!rowBatch.endOfFile); + computeCostTimer.stop(); + computeCostTimer.minus(recordReader.getReadTimeNanos()); + readCostTimer.add(recordReader.getReadTimeNanos()); + readBytes += recordReader.getCompletedBytes(); + numReadRequests += recordReader.getNumReadRequests(); } - if (!rightParts.isEmpty()) + } + catch (Throwable e) + { + if (!(e instanceof IOException)) + throw new WorkerException("failed to scan the partitioned file and do the join", e); + } + finally + { + if (pixelsReader != null) { - try - { - TimeUnit.MILLISECONDS.sleep(100); - } - catch (InterruptedException e) - { - throw new WorkerException("interrupted while waiting for the partitioned files"); - } + logger.debug("closing pixels reader"); + pixelsReader.close(); } } From e1b6ccae5731e7af1c94071a347b21972dce7042 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Mon, 4 Nov 2024 22:21:24 +0100 Subject: [PATCH 46/61] Minor fix --- .../src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 5be7884a4..98d35a458 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1001,7 +1001,7 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) new PartitionedJoinBatchOperator(joinedTable.getTableName(), rightPartitionInputs, null, joinInputs, joinAlgo) : new PartitionedJoinStreamOperator(joinedTable.getTableName(), - null, rightPartitionInputs, joinInputs, joinAlgo); + rightPartitionInputs, null, joinInputs, joinAlgo); joinOperator.setLargeChild(childOperator); } } From 1f0a15981fa86adfa65799ee857f545cf677d2bd Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:02:50 +0100 Subject: [PATCH 47/61] Bug fix in pixels stream reader 
--- .../pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 10 +--------- .../core/reader/PixelsRecordReaderStreamImpl.java | 8 ++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 5078434a0..a369a2d67 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -438,15 +438,7 @@ public int getRowGroupNum() @Override public boolean isPartitioned() { - try - { - streamHeaderLatch.await(); - } - catch (InterruptedException e) - { - logger.error("Interrupted while waiting for stream header", e); - } - return this.streamHeader.hasPartitioned() && this.streamHeader.getPartitioned(); + return partitioned; } /** diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java index 0588cbd47..3eee68314 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java @@ -37,6 +37,8 @@ import java.util.*; import java.util.concurrent.BlockingQueue; +import static com.google.common.base.Preconditions.checkArgument; + /** * PixelsRecordReaderStreamImpl is the variant of {@link PixelsRecordReaderImpl} for streaming mode. *

@@ -153,6 +155,12 @@ public PixelsRecordReaderStreamImpl(boolean partitioned, */ public void lateInitialization(PixelsStreamProto.StreamHeader streamHeader) throws IOException { + if (this.streamHeader != null) + { + checkArgument(this.streamHeader == streamHeader, + "streamHeader used for lateInitialization() is not the same as the one in the RecordReader."); + return; + } this.streamHeader = streamHeader; checkBeforeRead(); } From 0deb55e220c7c40b5621f03a6318af171704798b Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:04:07 +0100 Subject: [PATCH 48/61] Bug fix pixels stream reader --- .../worker/common/BasePartitionedJoinStreamWorker.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 7fc83cbaa..0912793d0 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -447,7 +447,9 @@ protected static void buildHashTable(long transId, Joiner joiner, List l PixelsRecordReader recordReader = pixelsReader.read(option); // XXX: perhaps do not need to re-initialize the record reader for each hash value. if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); + // We no longer check the validity of the record reader here, because the record reader + // might not have been initialized yet due to the absence of the stream header. 
+ // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do @@ -522,7 +524,7 @@ protected static int joinWithRightTable( hashValue, numPartition); VectorizedRowBatch rowBatch; PixelsRecordReader recordReader = pixelsReader.read(option); - checkArgument(recordReader.isValid(), "failed to get record reader"); + // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do @@ -612,7 +614,7 @@ protected static int joinWithRightTableAndPartition( VectorizedRowBatch rowBatch; PixelsRecordReader recordReader = pixelsReader.read(option); if (recordReader == null) continue; - checkArgument(recordReader.isValid(), "failed to get record reader"); + // checkArgument(recordReader.isValid(), "failed to get record reader"); computeCostTimer.start(); do From 1a1aadbbd2f0f007c81c219b1620693f14d091a9 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 5 Nov 2024 17:13:35 +0100 Subject: [PATCH 49/61] Comments and logs --- .../io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 2 +- .../worker/common/BasePartitionedJoinStreamWorker.java | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index a369a2d67..1bd7ab81e 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -192,7 +192,7 @@ else if (partitioned) } // We only need to put the byteBuf into the blocking queue to pass it to the recordReader, if the - // client is a data writer rather than a schema writer. In the latter case, + // packet is a data packet rather than a schema packet. Because in the latter case, // the schema packet has been processed when parsing the stream header above. 
if (partitionId != PixelsWriterStreamImpl.PARTITION_ID_SCHEMA_WRITER) { diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 0912793d0..06973e4c2 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -308,6 +308,9 @@ public JoinOutput process(PartitionedJoinInput event) .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") // .map(URI::create) .collect(Collectors.toList()); + // In partitioned mode, the schema is sent in an over-replicated manner: + // every previous-stage worker (rather than one of them) sends a schema packet + // before sending its intermediate data, to prevent errors from possibly out-of-order packet arrivals. 
StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, @@ -476,7 +479,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on port 18688"); pixelsReader.close(); } } @@ -559,7 +562,7 @@ protected static int joinWithRightTable( { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on port 18686"); pixelsReader.close(); } } @@ -653,7 +656,7 @@ protected static int joinWithRightTableAndPartition( { if (pixelsReader != null) { - logger.debug("closing pixels reader"); + logger.debug("closing pixels reader on 18686"); pixelsReader.close(); } } From 7ad765d71a1691e77417b2b0724686326ec0a936 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:18:58 +0100 Subject: [PATCH 50/61] Bug fix in stream reader and writer: process empty partition results --- .../pixelsdb/pixels/core/PixelsReaderStreamImpl.java | 7 +++++-- .../pixelsdb/pixels/core/PixelsWriterStreamImpl.java | 11 ++++++++--- .../core/reader/PixelsRecordReaderStreamImpl.java | 10 ++++++++++ .../io/pixelsdb/pixels/core/utils/BlockingMap.java | 2 +- .../worker/common/BasePartitionStreamWorker.java | 5 ++++- .../common/BasePartitionedJoinStreamWorker.java | 11 ++++++++--- 6 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java index 1bd7ab81e..98f41029f 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsReaderStreamImpl.java @@ -179,8 +179,10 @@ public void 
channelRead0(ChannelHandlerContext ctx, HttpObject msg) else if (partitioned) { // In partitioned mode, every packet brings a streamHeader to prevent errors from possibly - // out-of-order packet arrivals, so we need to parse it, but do not need the return value - // (except for the first incoming packet processed above). + // out-of-order packet arrivals, so we need to parse it, but do not need the return value + // (except for the first incoming packet processed above). + // XXX: Now we have each worker pass the schema in a separate packet, so this is no longer + // necessary. We can remove this block of code in PixelsWriterStreamImpl. parseStreamHeader(byteBuf); } } @@ -516,6 +518,7 @@ public PixelsProto.Footer getFooter() public void close() throws IOException { + logger.debug("Closing PixelsReaderStreamImpl"); new Thread(() -> { // Conditions for closing: // 1. streamHeaderLatch.await() to ensure that the stream header has been received diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java index 127aeb1f0..53359f237 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java @@ -512,6 +512,7 @@ public void addRowBatch(VectorizedRowBatch rowBatch, int hashValue) throws IOExc currHashValue = hashValue; hashValueIsSet = true; curRowGroupDataLength = 0; + if (rowBatch == null) return; curRowGroupNumOfRows += rowBatch.size; writeColumnVectors(rowBatch.cols, rowBatch.size); } @@ -550,8 +551,9 @@ public void close() { try { - if (curRowGroupNumOfRows != 0) + if (partitioned || curRowGroupNumOfRows != 0) { + // In partitioned mode, even an empty row group has to be sent to the server. 
writeRowGroup(); } // If the outgoing stream is empty (addRowBatch() and thus writeRowGroup() never called), we artificially @@ -609,7 +611,9 @@ else if (isFirstRowGroup) private void writeRowGroup() throws IOException { - if (isFirstRowGroup || partitioned) + // XXX: Now that we have each worker pass the schema in a separate packet in partitioned mode, it is no longer + // necessary to add a stream header to every packet. We can modify this block of code. + if (isFirstRowGroup || partitioned) // if (isFirstRowGroup) { writeStreamHeader(); isFirstRowGroup = false; @@ -769,7 +773,8 @@ private void writeRowGroup() throws IOException uri = URI.create(fileNameToUri(fileName)); } String reqUri = partitioned ? uris.get(currHashValue).toString() : uri.toString(); - logger.debug("Sending row group with length: " + byteBuf.writerIndex() + " to endpoint: " + reqUri); + logger.debug("Sending row group to endpoint: " + reqUri + ", length: " + byteBuf.writerIndex() + + ", partitionId: " + partitionId); Request req = httpClient.preparePost(reqUri) .setBody(byteBuf.nioBuffer()) .addHeader("X-Partition-Id", String.valueOf(partitionId)) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java index 3eee68314..4d1dab6f4 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderStreamImpl.java @@ -434,6 +434,7 @@ public int prepareBatch(int batchSize) */ private VectorizedRowBatch createEmptyEOFRowBatch(int size) { + logger.debug("In createEmptyEOFRowBatch(), size = " + size); TypeDescription resultSchema = TypeDescription.createSchema(new ArrayList<>()); VectorizedRowBatch resultRowBatch = resultSchema.createRowBatch(0); resultRowBatch.projectionSize = 0; @@ -502,6 +503,14 @@ public VectorizedRowBatch readBatch(int 
batchSize, boolean reuse) } int rgRowCount = (int) curRowGroupStreamFooter.getNumberOfRows(); + if (rgRowCount == 0) + { + // Empty row group, mark the current row group as unreadable. + curRowGroupByteBuf.readerIndex(curRowGroupByteBuf.readerIndex() + curRowGroupByteBuf.readableBytes()); + curRGIdx++; + return resultSchema.createRowBatch(0, resultColumnsEncoded); + } + int curBatchSize; ColumnVector[] columnVectors = resultRowBatch.cols; @@ -710,6 +719,7 @@ private void acquireNewRowGroup(boolean reuse) throws IOException else // incoming byteBuf unreadable, must be end of stream { + logger.debug("In acquireNewRowGroup(), end of file"); // checkValid = false; // Issue #105: to reject continuous read. if (reuse && resultRowBatch != null) // XXX: Before we implement necessary checks, the close() below might be called before our readBatch() diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java index c1f0107fc..1505e937d 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/utils/BlockingMap.java @@ -51,7 +51,7 @@ public void put(K key, V value) public V get(K key) throws InterruptedException { - V ret = getQueue(key).poll(60, TimeUnit.SECONDS); + V ret = getQueue(key).poll(300, TimeUnit.SECONDS); if (ret == null) { throw new RuntimeException("BlockingMap.get() timed out"); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 0b2c11b0e..7a30cd545 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ 
-206,8 +206,11 @@ public PartitionOutput process(PartitionInput event) { pixelsWriter.addRowBatch(batch, hash); } - hashValues.add(hash); } + else { + pixelsWriter.addRowBatch(null, hash); + } + hashValues.add(hash); } partitionOutput.addOutput(outputPath); partitionOutput.setHashValues(hashValues); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 06973e4c2..825751af0 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -243,8 +243,10 @@ public JoinOutput process(PartitionedJoinInput event) } // scan the right table and do the join. - if (joiner.getSmallTableSize() > 0) - { + // We no longer check this condition in streaming mode, because even if the joiner is empty, + // we have to read from the right table to enforce the streaming protocol. 
+// if (joiner.getSmallTableSize() > 0) +// { int rightSplitSize = rightPartitioned.size() / rightParallelism; if (rightPartitioned.size() % rightParallelism > 0) { @@ -290,7 +292,7 @@ public JoinOutput process(PartitionedJoinInput event) { throw new WorkerException("error occurred threads, please check the stacktrace before this log record"); } - } +// } String outputPath = outputFolder + outputInfo.getFileNames().get(0); try @@ -327,6 +329,9 @@ public JoinOutput process(PartitionedJoinInput event) pixelsWriter.addRowBatch(batch, hash); } } + else { + pixelsWriter.addRowBatch(null, hash); + } } } else From b009faab2931cbe8d84ad7b8e0322bb1d4544624 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:21:45 +0100 Subject: [PATCH 51/61] Bug fix in PartitionedJoinOperator --- .../pixels/planner/plan/physical/PartitionedJoinOperator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java index a76a65773..619b065a9 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinOperator.java @@ -192,6 +192,7 @@ public void initPlanCoordinator(PlanCoordinator planCoordinator, int parentStage partitionInput.getTableInfo().setInputSplits(ImmutableList.of(inputSplit)); tasks.add(new Task(taskId++, JSON.toJSONString(partitionInput))); } + partitionInput.getTableInfo().setInputSplits(inputSplits); } StageCoordinator partitionStageCoordinator = new StageCoordinator(smallPartitionStageId, tasks); planCoordinator.addStageCoordinator(partitionStageCoordinator, partitionStageDependency); @@ -215,6 +216,7 @@ public void initPlanCoordinator(PlanCoordinator planCoordinator, int parentStage 
partitionInput.getTableInfo().setInputSplits(ImmutableList.of(inputSplit)); tasks.add(new Task(taskId++, JSON.toJSONString(partitionInput))); } + partitionInput.getTableInfo().setInputSplits(inputSplits); // restore the input splits after modifying partitionInput as a temporary variable } StageCoordinator partitionStageCoordinator = new StageCoordinator(largePartitionStageId, tasks); planCoordinator.addStageCoordinator(partitionStageCoordinator, partitionStageDependency); From d0aa7d00e6ed53a3dbffcdd3778c38c5ca679a43 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 00:23:39 +0100 Subject: [PATCH 52/61] Revert "Interconnection between workers and stream workers, by modifying storage infos in PixelsPlanner" This reverts commit 09bc92c95313242b87c17586cf452ae2ed2089cf. --- .../pixels/planner/PixelsPlanner.java | 38 ++++++------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index 98d35a458..a8ecc150d 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -62,7 +62,6 @@ public class PixelsPlanner private static final Logger logger = LogManager.getLogger(PixelsPlanner.class); private static final StorageInfo InputStorageInfo; private static final StorageInfo IntermediateStorageInfo; - private static final StorageInfo IntermediateStreamStorageInfo; // Streaming only used between partition worker -> partitioned join worker private static final String IntermediateFolder; private static final int IntraWorkerParallelism; private static final ExchangeMethod EnabledExchangeMethod; @@ -92,10 +91,10 @@ public class PixelsPlanner ConfigFactory.Instance().getProperty("executor.input.storage.scheme")); InputStorageInfo = StorageInfoBuilder.BuildFromConfig(inputStorageScheme); - 
Storage.Scheme interStorageScheme = Storage.Scheme.from( - ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")); + Storage.Scheme interStorageScheme = EnabledExchangeMethod == ExchangeMethod.batch ? + Storage.Scheme.from(ConfigFactory.Instance().getProperty("executor.intermediate.storage.scheme")) : + Storage.Scheme.valueOf("httpstream"); IntermediateStorageInfo = StorageInfoBuilder.BuildFromConfig(interStorageScheme); - IntermediateStreamStorageInfo = StorageInfoBuilder.BuildFromConfig(Storage.Scheme.valueOf("httpstream")); String interStorageFolder = ConfigFactory.Instance().getProperty("executor.intermediate.folder"); if (!interStorageFolder.endsWith("/")) { @@ -286,7 +285,7 @@ else if (originTable.getTableType() == Table.TableType.JOINED) joinInput.setPartialAggregationInfo(partialAggregationInfo); String fileName = "partial_aggr_" + outputId++; MultiOutputInfo outputInfo = joinInput.getOutput(); - outputInfo.setStorageInfo(IntermediateStorageInfo); // IntermediateStreamStorageInfo? + outputInfo.setStorageInfo(IntermediateStorageInfo); outputInfo.setPath(intermediateBase); outputInfo.setFileNames(ImmutableList.of(fileName)); aggrInputFilesBuilder.add(intermediateBase + fileName); @@ -528,14 +527,12 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), + leftIsBase ? InputStorageInfo : IntermediateStorageInfo, leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean rightIsBase = rightTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo rightTableInfo = new PartitionedTableInfo( rightTable.getTableName(), rightIsBase, rightTable.getColumnNames(), - rightIsBase ? 
InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), + rightIsBase ? InputStorageInfo : IntermediateStorageInfo, rightPartitionedFiles, IntraWorkerParallelism, rightKeyColumnIds); int numPartition = PlanOptimizer.Instance().getJoinNumPartition( @@ -964,8 +961,7 @@ else if (joinAlgo == JoinAlgorithm.PARTITIONED) boolean leftIsBase = leftTable.getTableType() == Table.TableType.BASE; PartitionedTableInfo leftTableInfo = new PartitionedTableInfo( leftTable.getTableName(), leftIsBase, leftTable.getColumnNames(), - leftIsBase ? InputStorageInfo : - (EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo), // ??? + leftIsBase ? InputStorageInfo : IntermediateStorageInfo, leftPartitionedFiles, IntraWorkerParallelism, leftKeyColumnIds); boolean[] rightPartitionProjection = getPartitionProjection(rightTable, join.getRightProjection()); @@ -1282,19 +1278,13 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, - EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStreamStorageInfo, - rightPartitionedFiles.build(), + newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { return new PartitionedTableInfo(table.getTableName(), false, - newColumnsToRead, - EnabledExchangeMethod == ExchangeMethod.batch ? IntermediateStorageInfo : IntermediateStreamStorageInfo, - rightPartitionedFiles.build(), + newColumnsToRead, IntermediateStorageInfo, rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); - // XXX: This only applies to joined tables, when the current join reads table from a post partition of a previous join. - // If the table type is AGGREAGATED, we should use IntermediateStorageInfo. 
} } @@ -1330,13 +1320,11 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, - postPartition && EnabledExchangeMethod == ExchangeMethod.stream ? IntermediateStreamStorageInfo : IntermediateStorageInfo, - true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From 3b03cdca0e4b7f455980d8bd735fb9759e7ba579 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 20:20:10 +0100 Subject: [PATCH 53/61] Add partitionId in PartitionInput, to fix a hardcode in BasePartitionStreamWorker --- .../physical/PartitionedJoinStreamOperator.java | 13 ++++++++++++- .../plan/physical/input/PartitionInput.java | 14 ++++++++++++++ .../worker/common/BasePartitionStreamWorker.java | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index ad979f423..8f1f7c9d3 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -63,6 +63,11 @@ public CompletableFuture[]> execute() if (!smallPartitionInputs.isEmpty()) { smallPartitionWorkerNum = smallPartitionInputs.size(); + for (int i = 0; i < smallPartitionInputs.size(); ++i) + { + PartitionInput partitionInput = smallPartitionInputs.get(i); + 
partitionInput.setPartitionId(i); + } } else if (smallChild != null) { smallPartitionWorkerNum = smallChild.getJoinInputs().size(); @@ -74,6 +79,11 @@ else if (smallChild != null) { if (!largePartitionInputs.isEmpty()) { largePartitionWorkerNum = largePartitionInputs.size(); + for (int i = 0; i < largePartitionInputs.size(); ++i) + { + PartitionInput partitionInput = largePartitionInputs.get(i); + partitionInput.setPartitionId(i); + } } else if (largeChild != null) { largePartitionWorkerNum = largeChild.getJoinInputs().size(); @@ -179,7 +189,8 @@ else if (largeChild != null) logger.debug("invoke large partition of " + this.getName()); } - // todo: Finally, wait for the readiness of the partition operators + // todo: Finally, wait for the readiness of the partition workers + // (need to modify the partition workers to pull tasks from the worker coordinator server). return joinOutputs; }); diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java index 313013c5c..8301ee1ce 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/input/PartitionInput.java @@ -56,6 +56,10 @@ public class PartitionInput extends Input * when using streaming. */ private boolean isSmallTable; + /** + * The id of this partition in the current stage. + */ + private int partitionId; /** * Default constructor for Jackson. 
@@ -124,4 +128,14 @@ public void setSmallTable(boolean isSmallTable) { this.isSmallTable = isSmallTable; } + + public int getPartitionId() + { + return partitionId; + } + + public void setPartitionId(int partitionId) + { + this.partitionId = partitionId; + } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 7a30cd545..9c6e7e805 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -193,7 +193,7 @@ public PartitionOutput process(PartitionInput event) StreamWorkerCommon.passSchemaToNextLevel(writerSchema.get(), outputStorageInfo, outputEndpoints); PixelsWriter pixelsWriter = StreamWorkerCommon.getWriter(writerSchema.get(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, - true, 0, // todo: hardcoded for only 1 partition worker scenario; need to pass the actual value + true, event.getPartitionId(), Arrays.stream(keyColumnIds).boxed().collect(Collectors.toList()), outputEndpoints, false); Set hashValues = new HashSet<>(numPartition); From 8e8a2bcdc75a5526a7069d3c6b618889aefc2d5d Mon Sep 17 00:00:00 2001 From: jasha64 Date: Thu, 7 Nov 2024 22:15:02 +0100 Subject: [PATCH 54/61] Adapt PixelsPlanner to support streaming mode --- .../io/pixelsdb/pixels/planner/PixelsPlanner.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index a8ecc150d..bb71c7538 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ 
b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1278,7 +1278,9 @@ private PartitionedTableInfo getPartitionedTableInfo( if (table.getTableType() == Table.TableType.BASE) { return new PartitionedTableInfo(table.getTableName(), true, - newColumnsToRead, InputStorageInfo, rightPartitionedFiles.build(), + newColumnsToRead, + EnabledExchangeMethod == ExchangeMethod.batch ? InputStorageInfo : IntermediateStorageInfo, + rightPartitionedFiles.build(), IntraWorkerParallelism, newKeyColumnIds); } else { @@ -1324,7 +1326,9 @@ private List getPartitionInputs(Table inputTable, List getPartitionedJoinInputs( String path = getIntermediateFolderForTrans(transId) + joinedTable.getSchemaName() + "/" + joinedTable.getTableName() + "/"; - MultiOutputInfo output = new MultiOutputInfo(path, IntermediateStorageInfo, true, outputFileNames.build()); + MultiOutputInfo output = new MultiOutputInfo(path, + postPartition && EnabledExchangeMethod != ExchangeMethod.batch ? IntermediateStorageInfo : InputStorageInfo, + true, outputFileNames.build()); boolean[] leftProjection = leftPartitionProjection == null ? 
joinedTable.getJoin().getLeftProjection() : rewriteProjectionForPartitionedJoin(joinedTable.getJoin().getLeftProjection(), leftPartitionProjection); From 35dd99ed5beecb1123d575ce425ae794e4c5d954 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Tue, 12 Nov 2024 23:57:31 +0100 Subject: [PATCH 55/61] Optimization in partitioned join stream worker --- .../BasePartitionedJoinStreamWorker.java | 69 ++++++++++--------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index 825751af0..ff2f59b2d 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -165,7 +165,14 @@ public JoinOutput process(PartitionedJoinInput event) AtomicReference leftSchema = new AtomicReference<>(); AtomicReference rightSchema = new AtomicReference<>(); - // `registerWorker()` might awake the dependent workers, so it should be called just before + // Bootstrap the readers at once which is up all the time during the worker's lifetime, + // to ensure immediate reception of intermediate data and avoid retries on the writer side. + PixelsReader leftPixelsReader = StreamWorkerCommon.getReader( leftInputStorageInfo.getScheme(), + "http://localhost:18688/", true, event.getSmallPartitionWorkerNum()); + PixelsReader rightPixelsReader = StreamWorkerCommon.getReader(rightInputStorageInfo.getScheme(), + "http://localhost:18686/", true, event.getLargePartitionWorkerNum()); + + // `registerWorker()` might awake the dependent workers, so it should be called just before / after // the current worker listens on its HTTP port and is ready to receive streaming packets. 
CFWorkerInfo workerInfo = new CFWorkerInfo( InetAddress.getLocalHost().getHostAddress(), -1, @@ -178,14 +185,9 @@ public JoinOutput process(PartitionedJoinInput event) logger.debug("getSchemaFromPaths, left input: " + leftPartitioned + ", right input: " + rightPartitioned); - StreamWorkerCommon.getSchemaFromPaths(threadPool, - StreamWorkerCommon.getStorage(leftInputStorageInfo.getScheme()), - StreamWorkerCommon.getStorage(rightInputStorageInfo.getScheme()), - leftSchema, rightSchema, - Collections.singletonList("http://localhost:18688/"), - Collections.singletonList("http://localhost:18686/")); - // XXX: Better to ensure the subsequent data reader is up immediately after the schema is ready, - // to avoid retries on the writer side. + // XXX: StreamWorkerCommon.getSchemaFromPaths() can be removed + leftSchema.set ( leftPixelsReader.getFileSchema()); + rightSchema.set(rightPixelsReader.getFileSchema()); /* * Issue #450: * For the left and the right partial partitioned files, the file schema is equal to the columns to read in normal cases. @@ -196,6 +198,20 @@ public JoinOutput process(PartitionedJoinInput event) leftColAlias, leftProjection, leftKeyColumnIds, StreamWorkerCommon.getResultSchema(rightSchema.get(), rightColumnsToRead), rightColAlias, rightProjection, rightKeyColumnIds); + List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) + .stream() + .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) + .collect(ImmutableList.toImmutableList()); + List outputEndpoints = downStreamWorkers.stream() + .map(CFWorkerInfo::getIp) + .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") + // .map(URI::create) + .collect(Collectors.toList()); + if (partitionOutput) + { + StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); + } + // build the hash table for the left table. 
List leftFutures = new ArrayList<>(leftPartitioned.size()); int leftSplitSize = leftPartitioned.size() / leftParallelism; @@ -214,7 +230,7 @@ public JoinOutput process(PartitionedJoinInput event) try { buildHashTable(transId, joiner, parts, leftColumnsToRead, leftInputStorageInfo.getScheme(), - hashValues, event.getSmallPartitionWorkerNum(), workerMetrics); + hashValues, event.getSmallPartitionWorkerNum(), workerMetrics, leftPixelsReader); } catch (Throwable e) { @@ -267,10 +283,10 @@ public JoinOutput process(PartitionedJoinInput event) joinWithRightTableAndPartition( transId, joiner, parts, rightColumnsToRead, rightInputStorageInfo.getScheme(), hashValues, - event.getLargePartitionWorkerNum(), outputPartitionInfo, result, workerMetrics) : + event.getLargePartitionWorkerNum(), outputPartitionInfo, result, workerMetrics, rightPixelsReader) : joinWithRightTable(transId, joiner, parts, rightColumnsToRead, rightInputStorageInfo.getScheme(), hashValues, - event.getLargePartitionWorkerNum(), result.get(0), workerMetrics); + event.getLargePartitionWorkerNum(), result.get(0), workerMetrics, rightPixelsReader); } catch (Throwable e) { @@ -299,21 +315,13 @@ public JoinOutput process(PartitionedJoinInput event) { WorkerMetrics.Timer writeCostTimer = new WorkerMetrics.Timer().start(); PixelsWriter pixelsWriter; + // XXX: The post partition code below is adapted to the streaming protocol. + // Consider modifying the reader and writer code instead (good practice of layering) if (partitionOutput) { - List downStreamWorkers = workerCoordinateService.getDownstreamWorkers(worker.getWorkerId()) - .stream() - .sorted(Comparator.comparing(worker -> worker.getHashValues().get(0))) - .collect(ImmutableList.toImmutableList()); - List outputEndpoints = downStreamWorkers.stream() - .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
"18688" : "18686") + "/") - // .map(URI::create) - .collect(Collectors.toList()); // In partitioned mode, the schema is sent in an over-replicated manner: // every previous-stage worker (rather than one of them) sends a schema packet // before sending its intermediate data, to prevent errors from possibly out-of-order packet arrivals. - StreamWorkerCommon.passSchemaToNextLevel(joiner.getJoinedSchema(), outputStorageInfo, outputEndpoints); pixelsWriter = StreamWorkerCommon.getWriter(joiner.getJoinedSchema(), StreamWorkerCommon.getStorage(outputStorageInfo.getScheme()), outputPath, encoding, true, event.getJoinInfo().getPostPartitionId(), Arrays.stream( @@ -431,7 +439,7 @@ public JoinOutput process(PartitionedJoinInput event) */ protected static void buildHashTable(long transId, Joiner joiner, List leftParts, String[] leftCols, Storage.Scheme leftScheme, List hashValues, int numPartition, - WorkerMetrics workerMetrics) throws IOException + WorkerMetrics workerMetrics, PixelsReader leftPixelsReader) throws IOException { // In streaming mode, numPartition is the total number of partition workers, i.e. the number of incoming packets. 
logger.debug("building hash table for the left table, partition paths: " + leftParts); @@ -441,10 +449,9 @@ protected static void buildHashTable(long transId, Joiner joiner, List l int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = leftPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(leftScheme, "http://localhost:18688/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); for (int hashValue : hashValues) @@ -511,7 +518,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l protected static int joinWithRightTable( long transId, Joiner joiner, List rightParts, String[] rightCols, Storage.Scheme rightScheme, List hashValues, int numPartition, ConcurrentLinkedQueue joinResult, - WorkerMetrics workerMetrics) throws IOException + WorkerMetrics workerMetrics, PixelsReader rightPixelsReader) throws IOException { int joinedRows = 0; WorkerMetrics.Timer readCostTimer = new WorkerMetrics.Timer(); @@ -520,10 +527,9 @@ protected static int joinWithRightTable( int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = rightPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); for (int hashValue : hashValues) @@ -545,7 +551,7 @@ protected static int joinWithRightTable( { if (!joined.isEmpty()) { - joinResult.add(joined); + joinResult.add(joined); // XXX: Can modify this into PixelsWriter.addRowBatch(), to further exploit the parallelism. 
joinedRows += joined.size; } } @@ -596,7 +602,7 @@ protected static int joinWithRightTable( protected static int joinWithRightTableAndPartition( long transId, Joiner joiner, List rightParts, String[] rightCols, Storage.Scheme rightScheme, List hashValues, int numPartition, PartitionInfo postPartitionInfo, - List> partitionResult, WorkerMetrics workerMetrics) throws IOException + List> partitionResult, WorkerMetrics workerMetrics, PixelsReader rightPixelsReader) throws IOException { requireNonNull(postPartitionInfo, "outputPartitionInfo is null"); Partitioner partitioner = new Partitioner(postPartitionInfo.getNumPartition(), @@ -608,10 +614,9 @@ protected static int joinWithRightTableAndPartition( int numReadRequests = 0; readCostTimer.start(); - PixelsReader pixelsReader = null; + PixelsReader pixelsReader = rightPixelsReader; try { - pixelsReader = StreamWorkerCommon.getReader(rightScheme, "http://localhost:18686/", true, numPartition); readCostTimer.stop(); checkArgument(pixelsReader.isPartitioned(), "pixels file is not partitioned"); // XXX: check that the hashValue in row group headers match the hashValue assigned to this worker From 4eb8fd92f5e38d058c7d74c88c0da5eacbd58ae6 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 00:07:32 +0100 Subject: [PATCH 56/61] Fix hardcode of streaming port numbers --- .../worker/common/BasePartitionStreamWorker.java | 3 ++- .../common/BasePartitionedJoinStreamWorker.java | 14 ++++++++------ .../pixels/worker/common/StreamWorkerCommon.java | 3 +++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 9c6e7e805..1a3763fc1 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ 
b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -186,7 +186,8 @@ public PartitionOutput process(PartitionInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.isSmallTable() ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + + (event.isSmallTable() ? StreamWorkerCommon.STREAM_PORT_SMALL_TABLE : StreamWorkerCommon.STREAM_PORT_LARGE_TABLE)) // .map(URI::create) .collect(Collectors.toList()); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index ff2f59b2d..a6fa28a6f 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -168,9 +168,9 @@ public JoinOutput process(PartitionedJoinInput event) // Bootstrap the readers at once which is up all the time during the worker's lifetime, // to ensure immediate reception of intermediate data and avoid retries on the writer side. 
PixelsReader leftPixelsReader = StreamWorkerCommon.getReader( leftInputStorageInfo.getScheme(), - "http://localhost:18688/", true, event.getSmallPartitionWorkerNum()); + "http://localhost:" + StreamWorkerCommon.STREAM_PORT_SMALL_TABLE, true, event.getSmallPartitionWorkerNum()); PixelsReader rightPixelsReader = StreamWorkerCommon.getReader(rightInputStorageInfo.getScheme(), - "http://localhost:18686/", true, event.getLargePartitionWorkerNum()); + "http://localhost:" + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE, true, event.getLargePartitionWorkerNum()); // `registerWorker()` might awake the dependent workers, so it should be called just before / after // the current worker listens on its HTTP port and is ready to receive streaming packets. @@ -204,7 +204,9 @@ public JoinOutput process(PartitionedJoinInput event) .collect(ImmutableList.toImmutableList()); List outputEndpoints = downStreamWorkers.stream() .map(CFWorkerInfo::getIp) - .map(ip -> "http://" + ip + ":" + (event.getJoinInfo().getPostPartitionIsSmallTable() ? "18688" : "18686") + "/") + .map(ip -> "http://" + ip + ":" + + (event.getJoinInfo().getPostPartitionIsSmallTable() ? 
+ StreamWorkerCommon.STREAM_PORT_SMALL_TABLE : StreamWorkerCommon.STREAM_PORT_LARGE_TABLE)) // .map(URI::create) .collect(Collectors.toList()); if (partitionOutput) @@ -491,7 +493,7 @@ protected static void buildHashTable(long transId, Joiner joiner, List l { if (pixelsReader != null) { - logger.debug("closing pixels reader on port 18688"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_SMALL_TABLE); pixelsReader.close(); } } @@ -573,7 +575,7 @@ protected static int joinWithRightTable( { if (pixelsReader != null) { - logger.debug("closing pixels reader on port 18686"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE); pixelsReader.close(); } } @@ -666,7 +668,7 @@ protected static int joinWithRightTableAndPartition( { if (pixelsReader != null) { - logger.debug("closing pixels reader on 18686"); + logger.debug("closing pixels reader on port " + StreamWorkerCommon.STREAM_PORT_LARGE_TABLE); pixelsReader.close(); } } diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java index dd6ce3c10..e0d9d4bbf 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/StreamWorkerCommon.java @@ -46,6 +46,9 @@ public class StreamWorkerCommon extends WorkerCommon private static final Logger logger = LogManager.getLogger(StreamWorkerCommon.class); private static final Storage http = null; // placeholder. todo: modularize into a pixels-storage-stream module. 
+ public static final int STREAM_PORT_SMALL_TABLE = 18688; + public static final int STREAM_PORT_LARGE_TABLE = 18686; + public static void initStorage(StorageInfo storageInfo, Boolean isOutput) throws IOException { if (storageInfo.getScheme() == Storage.Scheme.httpstream) From bb0ae95124499707d15bab39d231bb65234bf072 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 00:20:47 +0100 Subject: [PATCH 57/61] Format --- .../plan/physical/PartitionedJoinStreamOperator.java | 12 ++++++++---- .../worker/common/BasePartitionStreamWorker.java | 3 ++- .../common/BasePartitionedJoinStreamWorker.java | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java index 8f1f7c9d3..5dd2f8a23 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/plan/physical/PartitionedJoinStreamOperator.java @@ -69,10 +69,12 @@ public CompletableFuture[]> execute() partitionInput.setPartitionId(i); } } - else if (smallChild != null) { + else if (smallChild != null) + { smallPartitionWorkerNum = smallChild.getJoinInputs().size(); } - else { + else + { throw new IllegalStateException("smallPartitionInputs and smallChild are both null"); } int largePartitionWorkerNum; @@ -85,10 +87,12 @@ else if (smallChild != null) { partitionInput.setPartitionId(i); } } - else if (largeChild != null) { + else if (largeChild != null) + { largePartitionWorkerNum = largeChild.getJoinInputs().size(); } - else { + else + { throw new IllegalStateException("largePartitionInputs and largeChild are both null"); } for (int i = 0; i < joinInputs.size(); ++i) diff --git 
a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java index 1a3763fc1..c5990bab2 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionStreamWorker.java @@ -208,7 +208,8 @@ public PartitionOutput process(PartitionInput event) pixelsWriter.addRowBatch(batch, hash); } } - else { + else + { pixelsWriter.addRowBatch(null, hash); } hashValues.add(hash); diff --git a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java index a6fa28a6f..0ece6d760 100644 --- a/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java +++ b/pixels-turbo/pixels-worker-common/src/main/java/io/pixelsdb/pixels/worker/common/BasePartitionedJoinStreamWorker.java @@ -339,7 +339,8 @@ public JoinOutput process(PartitionedJoinInput event) pixelsWriter.addRowBatch(batch, hash); } } - else { + else + { pixelsWriter.addRowBatch(null, hash); } } From 56c0a75dccab22fe4ee7b348dd6cba474143c0e7 Mon Sep 17 00:00:00 2001 From: jasha64 Date: Wed, 13 Nov 2024 22:19:26 +0100 Subject: [PATCH 58/61] Modify Pixels stream writer to retry connection at 100ms interval --- .../java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java index 53359f237..91cdcf2f5 100644 --- 
a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterStreamImpl.java @@ -791,8 +791,8 @@ private void writeRowGroup() throws IOException try { outstandingHTTPRequestSemaphore.acquire(); - int maxAttempts = 30000; - long backoffMillis = 10; + int maxAttempts = 3000; + long backoffMillis = 100; int attempt = 0; boolean success = false; From 1fafdf51b9d9cfe54097096f63bb4aa1970ace77 Mon Sep 17 00:00:00 2001 From: huasiy Date: Thu, 7 Nov 2024 08:12:50 +0800 Subject: [PATCH 59/61] implement stream storage --- .../pixels/common/physical/StreamPath.java | 37 +++ pixels-storage/pixels-storage-stream/pom.xml | 86 ++++++ .../pixels/storage/stream/Stream.java | 163 ++++++++++++ .../storage/stream/io/StreamInputStream.java | 195 ++++++++++++++ .../storage/stream/io/StreamOutputStream.java | 245 ++++++++++++++++++ .../pixels/storage/stream/TestStream.java | 43 +++ pom.xml | 1 + 7 files changed, 770 insertions(+) create mode 100644 pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/StreamPath.java create mode 100644 pixels-storage/pixels-storage-stream/pom.xml create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/Stream.java create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java create mode 100644 pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/StreamPath.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/StreamPath.java new file mode 100644 index 000000000..16966484a --- /dev/null +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/StreamPath.java @@ -0,0 +1,37 
@@ +package io.pixelsdb.pixels.common.physical; + +import static java.util.Objects.requireNonNull; + +public class StreamPath +{ + private String host; + private int port; + public boolean valid = false; + + public StreamPath(String path) + { + requireNonNull(path); + if (path.contains(":///")) + { + path = path.substring(path.indexOf(":///") + 4); + } + int colon = path.indexOf(':'); + if (colon > 0) + { + host = path.substring(0, colon); + port = Integer.parseInt(path.substring(colon + 1)); + this.valid = true; + } + } + + public String getHostName() + { + return host; + } + + public int getPort() + { + return port; + } + +} diff --git a/pixels-storage/pixels-storage-stream/pom.xml b/pixels-storage/pixels-storage-stream/pom.xml new file mode 100644 index 000000000..c1c6cd54f --- /dev/null +++ b/pixels-storage/pixels-storage-stream/pom.xml @@ -0,0 +1,86 @@ + + + 4.0.0 + + io.pixelsdb + pixels + 0.2.0-SNAPSHOT + ../../pom.xml + + + pixels-storage-stream + + + 8 + 8 + UTF-8 + + + + + io.pixelsdb + pixels-common + true + + + + net.java.dev.jna + jna + + + + + org.asynchttpclient + async-http-client + true + + + io.netty + netty-all + true + + + + org.apache.hadoop + hadoop-client + true + test + + + com.google.guava + guava + true + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + ${maven.plugin.deploy.version} + + + local.mvn.repo::default::file://${project.parent.basedir}/mvn + + + + + + org.apache.maven.plugins + maven-source-plugin + ${maven.plugin.source.version} + + + attach-sources + + jar + + + + + + + \ No newline at end of file diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/Stream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/Stream.java new file mode 100644 index 000000000..dc2a6b543 --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/Stream.java @@ -0,0 +1,163 @@ +/* + * Copyright 2024 PixelsDB. 
+ * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.storage.stream; + +import io.pixelsdb.pixels.common.physical.Status; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StreamPath; +import io.pixelsdb.pixels.storage.stream.io.StreamInputStream; +import io.pixelsdb.pixels.storage.stream.io.StreamOutputStream; +import sun.reflect.generics.reflectiveObjects.NotImplementedException; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.List; + + +public final class Stream implements Storage +{ + private static final String SchemePrefix = Scheme.httpstream.name() + "://"; + + public Stream() { } + + @Override + public Scheme getScheme() { return Scheme.httpstream; } + + @Override + public String ensureSchemePrefix(String path) throws IOException + { + if (path.startsWith(SchemePrefix)) + { + return path; + } + if (path.contains("://")) + { + throw new IOException("Path '" + path + + "' already has a different scheme prefix than '" + SchemePrefix + "'."); + } + return SchemePrefix + path; + } + + /** + * This method is used for read content from stream. 
+ * @param path + * @return + */ + @Override + public DataInputStream open(String path) throws IOException + { + StreamPath streamPath = new StreamPath(path); + if (!streamPath.valid) + { + throw new IOException("Path '" + path + "' is not valid."); + } + + StreamInputStream inputStream; + try + { + inputStream = new StreamInputStream(streamPath.getHostName(), streamPath.getPort()); + } catch (Exception e) + { + throw new IOException("Failed to open streamInputStream, " + e.toString()); + } + return new DataInputStream(inputStream); + } + + @Override + public List listStatus(String... path) + { + throw new NotImplementedException(); + } + + @Override + public List listPaths(String... path) + { + throw new NotImplementedException(); + } + + @Override + public Status getStatus(String path) + { + throw new NotImplementedException(); + } + + @Override + public long getFileId(String path) + { + throw new NotImplementedException(); + } + + @Override + public boolean mkdirs(String path) + { + throw new NotImplementedException(); + } + + /** + * This method is used for write content to stream. 
+ */ + @Override + public DataOutputStream create(String path, boolean overwrite, int bufferSize) throws IOException + { + StreamPath streamPath = new StreamPath(path); + if (!streamPath.valid) + { + throw new IOException("Path '" + path + "' is not valid."); + } + return new DataOutputStream(new StreamOutputStream(streamPath.getHostName(), streamPath.getPort(), bufferSize)); + } + + @Override + public boolean delete(String path, boolean recursive) + { + throw new NotImplementedException(); + } + + @Override + public boolean supportDirectCopy() { return false; } + + @Override + public boolean directCopy(String src, String dest) + { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { } + + @Override + public boolean exists(String path) + { + throw new NotImplementedException(); + } + + @Override + public boolean isFile(String path) + { + return false; + } + + @Override + public boolean isDirectory(String path) + { + return false; + } +} \ No newline at end of file diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java new file mode 100644 index 000000000..9d3eda059 --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java @@ -0,0 +1,195 @@ +package io.pixelsdb.pixels.storage.stream.io; + +import io.netty.buffer.ByteBuf; +import io.netty.channel.ChannelFuture; +import io.netty.channel.ChannelFutureListener; +import io.netty.channel.ChannelHandlerContext; +import io.netty.handler.codec.http.*; +import io.pixelsdb.pixels.common.utils.HttpServer; +import io.pixelsdb.pixels.common.utils.HttpServerHandler; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import sun.rmi.runtime.Log; + +import javax.net.ssl.SSLException; +import 
java.io.IOException; +import java.io.InputStream; +import java.security.cert.CertificateException; +import java.util.concurrent.*; + +public class StreamInputStream extends InputStream +{ + private static final Logger logger = LogManager.getLogger(StreamInputStream.class); + + /** + * indicates whether the stream is still open / valid + */ + private boolean open; + + /** + * The schema of http stream. + * Default value is http. + */ + private final String schema = "http"; + + /** + * The host of http stream. + */ + private String host; + + /** + * The port of http stream. + */ + private int port; + + /** + * The uri of http stream. + */ + private String uri; + + /** + * The temporary buffer used for storing the chunks. + */ + private final BlockingQueue contentQueue; + + /** + * The capacity of buffer. + */ + private final int bufferCapacity = 1000000000; + + /** + * The http server for receiving input stream. + */ + private final HttpServer httpServer; + + /** + * The thread to run http server. + */ + private final ExecutorService executorService; + + /** + * The future of http server. 
+ */ + private final CompletableFuture httpServerFuture; + + public StreamInputStream(String host, int port) throws CertificateException, SSLException { + this.open = true; + this.contentQueue = new LinkedBlockingDeque<>(); + this.host = host; + this.port = port; + this.uri = this.schema + "://" + host + ":" + port; + this.httpServer = new HttpServer(new StreamHttpServerHandler(this)); + this.executorService = Executors.newFixedThreadPool(1); + this.httpServerFuture = CompletableFuture.runAsync(() -> { + try + { + this.httpServer.serve(this.port); + logger.info("http server closed"); + } catch (InterruptedException e) + { + logger.error("http server interrupted", e); + } + }, this.executorService); + } + + @Override + public int read() throws IOException + { + assertOpen(); + if (this.contentQueue.isEmpty()) + { + return -1; + } + return this.contentQueue.peek().readableBytes(); + } + + @Override + public int read(byte[] b) throws IOException + { + return read(b, 0, b.length); + } + + @Override + public int read(byte[] buf, int off, int len) throws IOException + { + this.assertOpen(); + if (this.contentQueue.isEmpty()) + { + return -1; + } + + ByteBuf content = this.contentQueue.peek(); + int offset = content.readerIndex(); + len = Math.min(len, content.readableBytes()); + content.readBytes(buf, offset, len); + if (!content.isReadable()) + { + this.contentQueue.poll(); + } + + return len; + } + + private void assertOpen() + { + if (!this.open) + { + throw new IllegalStateException("Closed"); + } + } + + public static class StreamHttpServerHandler extends HttpServerHandler + { + private static final Logger logger = LogManager.getLogger(StreamHttpServerHandler.class); + private StreamInputStream inputStream; + + public StreamHttpServerHandler(StreamInputStream inputStream) + { + this.inputStream = inputStream; + } + + @Override + public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) + { + if (!(msg instanceof HttpRequest)) + { + return; + } + 
FullHttpRequest req = (FullHttpRequest) msg; + if (req.method() != HttpMethod.POST) + { + req.headers().set(HttpHeaderNames.CONNECTION, HttpHeaderValues.CLOSE); + sendResponse(ctx, req, HttpResponseStatus.BAD_REQUEST); + } + + if (!req.headers().get(HttpHeaderNames.CONTENT_TYPE).equals("application/x-protobuf")) + { + return; + } + ByteBuf content = req.content(); + content.retain(); + this.inputStream.contentQueue.add(content); + sendResponse(ctx, req, HttpResponseStatus.OK); + } + + private void sendResponse(ChannelHandlerContext ctx, FullHttpRequest req, HttpResponseStatus status) + { + FullHttpResponse response = new DefaultFullHttpResponse(req.protocolVersion(), status); + response.headers() + .set(HttpHeaderNames.CONTENT_TYPE, "text/plain") + .set(HttpHeaderNames.CONTENT_LENGTH, response.content().readableBytes()); + + if (req.headers().get(HttpHeaderNames.CONNECTION).equals(HttpHeaderValues.CLOSE.toString())) + { + response.headers().set(HttpHeaderNames.CONNECTION, HttpHeaderValues.CLOSE); + response.setStatus(status); + ChannelFuture f = ctx.writeAndFlush(response); + f.addListener(ChannelFutureListener.CLOSE); + } else + { + response.headers().set(HttpHeaderNames.CONNECTION, HttpHeaderValues.KEEP_ALIVE); + ctx.writeAndFlush(response); + } + } + } +} diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java new file mode 100644 index 000000000..5f5fa26ab --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java @@ -0,0 +1,245 @@ +package io.pixelsdb.pixels.storage.stream.io; + +import io.netty.handler.codec.http.HttpHeaderNames; +import io.netty.handler.codec.http.HttpHeaderValues; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.asynchttpclient.*; + +import 
java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; + +public class StreamOutputStream extends OutputStream +{ + private static final Logger logger = LogManager.getLogger(StreamInputStream.class); + + /** + * indicates whether the stream is still open / valid + */ + private boolean open; + + /** + * The schema of http stream. + * Default value is http. + */ + private final String schema = "http"; + + /** + * The host of http stream. + */ + private String host; + + /** + * The port of http stream. + */ + private int port; + + /** + * The uri of http stream. + */ + private String uri; + + /** + * The maximum retry count. + */ + private static final int MAX_RETRIES = 10; + + /** + * The delay between two tries. + */ + private static final long RETRY_DELAY_MS = 1000; + + /** + * The temporary buffer used for storing the chunks. + */ + private final byte[] buffer; + + /** + * The position in the buffer. + */ + private int bufferPosition; + + /** + * The capacity of buffer. + */ + private int bufferCapacity; + + /** + * The http client. 
+ */ + private final AsyncHttpClient httpClient; + + public StreamOutputStream(String host, int port, int bufferCapacity) + { + this.open = true; + this.host = host; + this.port = port; + this.uri = this.schema + "://" + host + ":" + port; + this.bufferCapacity = bufferCapacity; + this.buffer = new byte[bufferCapacity]; + this.bufferPosition = 0; + this.httpClient = Dsl.asyncHttpClient(); + } + + /** + * Write an array to the S3 output stream + * + * @param b + * @throws IOException + */ + @Override + public void write(byte[] b) throws IOException + { + write(b, 0, b.length); + } + + @Override + public void write(final byte[] buf, final int off, final int len) throws IOException + { + this.assertOpen(); + int offsetInBuf = off, remainToRead = len; + int remainInBuffer; + while (remainToRead > (remainInBuffer = this.buffer.length - bufferPosition)) + { + System.arraycopy(buf, offsetInBuf, this.buffer, this.bufferPosition, remainInBuffer); + this.bufferPosition += remainInBuffer; + flushBufferAndRewind(); + offsetInBuf += remainInBuffer; + remainToRead -= remainInBuffer; + } + System.arraycopy(buf, offsetInBuf, this.buffer, this.bufferPosition, remainToRead); + this.bufferPosition += remainToRead; + } + + @Override + public void write(int b) throws IOException + { + this.assertOpen(); + if (this.bufferPosition >= this.buffer.length) + { + flushBufferAndRewind(); + } + this.buffer[this.bufferPosition++] = (byte) b; + } + + @Override + public synchronized void flush() + { + assertOpen(); + } + + protected void flushBufferAndRewind() throws IOException + { + logger.debug("Sending {} bytes to stream", this.bufferPosition); + Request req = httpClient.preparePost(this.uri) + .setBody(ByteBuffer.wrap(this.buffer, 0, this.bufferPosition)) + .addHeader(HttpHeaderNames.CONTENT_TYPE, "application/x-protobuf") + .addHeader(HttpHeaderNames.CONTENT_LENGTH, this.bufferPosition) + .addHeader(HttpHeaderNames.CONNECTION, "keep-aliva") + .build(); + int retry = 0; + while (true) + { + 
StreamHttpClientHandler handler = new StreamHttpClientHandler(); + try + { + httpClient.executeRequest(req, handler) + .toCompletableFuture() + .get(); + break; + } catch (Exception e) + { + retry++; + if (retry > MAX_RETRIES || !(e.getCause() instanceof java.net.ConnectException)) + { + logger.error("retry count {}, exception cause {}, excepetion {}", retry, e.getCause(), e.getMessage()); + throw new IOException("Connect to stream failed"); + } else + { + try + { + Thread.sleep(RETRY_DELAY_MS); + } catch (InterruptedException e1) + { + throw new IOException(e1); + } + } + } + } + this.bufferPosition = 0; + } + + @Override + public void close() throws IOException + { + if (this.open) + { + this.open = false; + if (this.bufferPosition > 0) + { + flushBufferAndRewind(); + } + closeStreamReader(); + this.httpClient.close(); + } + } + + /** + * Tell stream reader that this stream closes. + */ + private void closeStreamReader() + { + Request req = httpClient.preparePost(this.uri) + .addHeader(HttpHeaderNames.CONTENT_TYPE, "application/x-protobuf") + .addHeader(HttpHeaderNames.CONTENT_LENGTH, 0) + .addHeader(HttpHeaderNames.CONNECTION, HttpHeaderValues.CLOSE) + .build(); + int retry = 0; + while (true) + { + StreamHttpClientHandler handler = new StreamHttpClientHandler(); + try + { + httpClient.executeRequest(req, handler) + .toCompletableFuture() + .get(); + break; + } catch (Exception e) + { + retry++; + if (retry > this.MAX_RETRIES || !(e.getCause() instanceof java.net.ConnectException)) + { + logger.error("failed to close stream reader"); + } + } + } + } + + private void assertOpen() + { + if (!this.open) + { + throw new IllegalStateException("Closed"); + } + } + + public static class StreamHttpClientHandler extends AsyncCompletionHandler + { + @Override + public Response onCompleted(Response response) throws Exception + { + if (response.getStatusCode() != 200) + { + throw new IOException("Failed to send package to server, status code: " + response.getStatusCode()); 
+ } + return response; + } + + @Override + public void onThrowable(Throwable t) + { + logger.error("stream http client handler, {}", t.getMessage()); + } + } +} diff --git a/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java b/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java new file mode 100644 index 000000000..a0efca190 --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java @@ -0,0 +1,43 @@ +/* + * Copyright 2024 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.storage.stream; + +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; +import org.apache.hadoop.io.IOUtils; +import org.junit.Test; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class TestStream +{ + @Test + public void test() throws IOException + { + Storage stream = StorageFactory.Instance().getStorage(Storage.Scheme.httpstream); + InputStream fileInput = Files.newInputStream(Paths.get("/tmp/test1")); + OutputStream outputStream = stream.create("stream:///localhost:29920", false, 4096); + InputStream inputStream = stream.open("stream:///localhost:29920"); + OutputStream fileOutput = Files.newOutputStream(Paths.get("/tmp/test2"));IOUtils.copyBytes(fileInput, outputStream, 4096, true); + IOUtils.copyBytes(inputStream, fileOutput, 4096, true); + } +} diff --git a/pom.xml b/pom.xml index 0db4b7e3a..56a45e2b9 100644 --- a/pom.xml +++ b/pom.xml @@ -52,6 +52,7 @@ pixels-storage/pixels-storage-redis pixels-storage/pixels-storage-localfs pixels-storage/pixels-storage-mock + pixels-storage/pixels-storage-stream pixels-turbo/pixels-worker-common pixels-turbo/pixels-worker-lambda pixels-turbo/pixels-invoker-lambda From 1b88ede5e158f4f6d8bcb28aa4b6038a44ac8f4b Mon Sep 17 00:00:00 2001 From: huasiy Date: Thu, 7 Nov 2024 09:48:26 +0800 Subject: [PATCH 60/61] fix no data bug in stream storage --- .../pixels/common/utils/HttpServer.java | 29 +++++-- .../common/utils/HttpServerHandler.java | 5 ++ .../storage/stream/io/StreamInputStream.java | 85 +++++++++++++++---- 3 files changed, 97 insertions(+), 22 deletions(-) diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServer.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServer.java index 216f12540..3c3357a1c 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServer.java +++ 
b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServer.java @@ -40,16 +40,20 @@ public final class HttpServer { final HttpServerInitializer initializer; + private EventLoopGroup bossGroup; + private EventLoopGroup workerGroup; + private Channel channel; public HttpServer(HttpServerHandler handler) throws CertificateException, SSLException { this.initializer = new HttpServerInitializer(HttpServerUtil.buildSslContext(), handler); + handler.setServerCloser(this::close); } public void serve(int PORT) throws InterruptedException { - EventLoopGroup bossGroup = new NioEventLoopGroup(1); - EventLoopGroup workerGroup = new NioEventLoopGroup(); + bossGroup = new NioEventLoopGroup(1); + workerGroup = new NioEventLoopGroup(); try { ServerBootstrap b = new ServerBootstrap(); @@ -59,13 +63,28 @@ public void serve(int PORT) throws InterruptedException .handler(new LoggingHandler(LogLevel.DEBUG)) .childHandler(this.initializer); - Channel ch = b.bind(PORT).sync().channel(); - - ch.closeFuture().sync(); + channel = b.bind(PORT).sync().channel(); + channel.closeFuture().sync(); } finally { bossGroup.shutdownGracefully(); workerGroup.shutdownGracefully(); } } + + public void close() + { + if (channel != null) + { + channel.close(); + } + if (bossGroup != null) + { + bossGroup.shutdownGracefully(); + } + if (workerGroup != null) + { + workerGroup.shutdownGracefully(); + } + } } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServerHandler.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServerHandler.java index 4536b5d09..fbcbebc8a 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServerHandler.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/HttpServerHandler.java @@ -43,6 +43,7 @@ @ChannelHandler.Sharable public class HttpServerHandler extends SimpleChannelInboundHandler { + protected Runnable serverCloser; @Override public void 
channelReadComplete(ChannelHandlerContext ctx) @@ -100,4 +101,8 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) ChannelFuture f = ctx.writeAndFlush(response); f.addListener(ChannelFutureListener.CLOSE); } + + public void setServerCloser(Runnable serverCloser) { + this.serverCloser = serverCloser; + } } diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java index 9d3eda059..ba9aa777d 100644 --- a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java @@ -1,15 +1,12 @@ package io.pixelsdb.pixels.storage.stream.io; import io.netty.buffer.ByteBuf; -import io.netty.channel.ChannelFuture; -import io.netty.channel.ChannelFutureListener; import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.http.*; import io.pixelsdb.pixels.common.utils.HttpServer; import io.pixelsdb.pixels.common.utils.HttpServerHandler; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import sun.rmi.runtime.Log; import javax.net.ssl.SSLException; import java.io.IOException; @@ -57,6 +54,16 @@ public class StreamInputStream extends InputStream */ private final int bufferCapacity = 1000000000; + /** + * The maximum tries to get data. + */ + private final int MAX_TRIES = 10; + + /** + * The milliseconds to sleep. + */ + private final int DELAY_MS = 2000; + /** * The http server for receiving input stream. 
*/ @@ -84,7 +91,6 @@ public StreamInputStream(String host, int port) throws CertificateException, SSL try { this.httpServer.serve(this.port); - logger.info("http server closed"); } catch (InterruptedException e) { logger.error("http server interrupted", e); @@ -96,11 +102,22 @@ public StreamInputStream(String host, int port) throws CertificateException, SSL public int read() throws IOException { assertOpen(); - if (this.contentQueue.isEmpty()) + if (!assertData()) { return -1; } - return this.contentQueue.peek().readableBytes(); + + ByteBuf content = this.contentQueue.peek(); + int b = -1; + if (content != null) + { + b = content.readByte(); + if (!content.isReadable()) + { + this.contentQueue.poll(); + } + } + return b; } @Override @@ -112,22 +129,56 @@ public int read(byte[] b) throws IOException @Override public int read(byte[] buf, int off, int len) throws IOException { - this.assertOpen(); - if (this.contentQueue.isEmpty()) + assertOpen(); + if (!assertData()) { return -1; } - ByteBuf content = this.contentQueue.peek(); - int offset = content.readerIndex(); - len = Math.min(len, content.readableBytes()); - content.readBytes(buf, offset, len); - if (!content.isReadable()) + int readBytes = 0; + while (readBytes < len && !this.contentQueue.isEmpty()) + { + ByteBuf content = this.contentQueue.peek(); + int offset = content.readerIndex(); + int readLen = Math.min(len-readBytes, content.readableBytes()); + content.readBytes(buf, offset, readLen); + if (!content.isReadable()) + { + this.contentQueue.poll(); + } + readBytes += readLen; + } + + return readBytes; + } + + @Override + public void close() throws IOException + { + if (this.open) { - this.contentQueue.poll(); + this.open = false; + this.httpServerFuture.complete(null); + this.httpServer.close(); } + } - return len; + private boolean assertData() throws IOException + { + int tries = 0; + while (tries < this.MAX_TRIES && this.contentQueue.isEmpty() && !this.httpServerFuture.isDone()) + { + try + { + tries++; 
+ Thread.sleep(this.DELAY_MS); + } catch (InterruptedException e) + { + throw new IOException(e); + } + } + + return this.contentQueue.isEmpty(); } private void assertOpen() @@ -183,8 +234,8 @@ private void sendResponse(ChannelHandlerContext ctx, FullHttpRequest req, HttpRe { response.headers().set(HttpHeaderNames.CONNECTION, HttpHeaderValues.CLOSE); response.setStatus(status); - ChannelFuture f = ctx.writeAndFlush(response); - f.addListener(ChannelFutureListener.CLOSE); + ctx.writeAndFlush(response); + this.serverCloser.run(); } else { response.headers().set(HttpHeaderNames.CONNECTION, HttpHeaderValues.KEEP_ALIVE); From 36b71a80ae2d512dd4e6e824b4119adfeb3f270c Mon Sep 17 00:00:00 2001 From: huasiy Date: Tue, 12 Nov 2024 11:43:08 +0800 Subject: [PATCH 61/61] implement physical reader and writer --- .../pixels/common/utils/Constants.java | 1 + .../storage/stream/PhysicalStreamReader.java | 138 ++++++++++++++++++ .../storage/stream/PhysicalStreamWriter.java | 115 +++++++++++++++ .../pixels/storage/stream/StreamProvider.java | 67 +++++++++ .../storage/stream/io/StreamInputStream.java | 23 ++- .../storage/stream/io/StreamOutputStream.java | 16 +- .../pixels/storage/stream/TestStream.java | 103 ++++++++++++- 7 files changed, 447 insertions(+), 16 deletions(-) create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamReader.java create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamWriter.java create mode 100644 pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/StreamProvider.java diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/Constants.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/Constants.java index a7ce4b42b..4f6decf3f 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/Constants.java +++ 
b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/Constants.java @@ -32,6 +32,7 @@ public final class Constants public static final int S3_BUFFER_SIZE = 8 * 1024 * 1024; public static final int REDIS_BUFFER_SIZE = 8 * 1024 * 1024; public static final int GCS_BUFFER_SIZE = 8 * 1024 * 1024; + public static final int STREAM_BUFFER_SIZE = 8 * 1024 * 1024; public static final int MIN_REPEAT = 3; public static final int MAX_SCOPE = 512; diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamReader.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamReader.java new file mode 100644 index 000000000..cdc32a174 --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamReader.java @@ -0,0 +1,138 @@ +package io.pixelsdb.pixels.storage.stream; + +import io.pixelsdb.pixels.common.physical.PhysicalReader; +import io.pixelsdb.pixels.common.physical.Storage; + +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; + +import static java.util.Objects.requireNonNull; + +public class PhysicalStreamReader implements PhysicalReader +{ + private final Storage stream; + private final String path; + private final DataInputStream dataInputStream; + + public PhysicalStreamReader(Storage storage, String path) throws IOException + { + if (storage instanceof Stream) + { + this.stream = (Stream) storage; + } + else + { + throw new IOException("Storage is not LocalFS."); + } + this.path = path; + this.dataInputStream = storage.open(path); + } + + @Override + public long getFileLength() throws IOException + { + throw new UnsupportedOperationException("Can't get file length in PhysicalStreamReader"); + } + + @Override + public void seek(long desired) throws 
IOException + { + throw new UnsupportedOperationException("Can't get file length in PhysicalStreamReader"); + } + + @Override + public ByteBuffer readFully(int length) throws IOException + { + byte[] buffer = new byte[length]; + dataInputStream.readFully(buffer); + return ByteBuffer.wrap(buffer); + } + + @Override + public void readFully(byte[] buffer) throws IOException + { + dataInputStream.readFully(buffer); + } + + @Override + public void readFully(byte[] buffer, int offset, int length) throws IOException + { + dataInputStream.readFully(buffer, offset, length); + } + + @Override + public long readLong(ByteOrder byteOrder) throws IOException + { + if (requireNonNull(byteOrder).equals(ByteOrder.BIG_ENDIAN)) + { + return dataInputStream.readLong(); + } + else + { + return Long.reverseBytes(dataInputStream.readLong()); + } + } + + @Override + public int readInt(ByteOrder byteOrder) throws IOException + { + if (requireNonNull(byteOrder).equals(ByteOrder.BIG_ENDIAN)) + { + return dataInputStream.readInt(); + } + else + { + return Integer.reverseBytes(dataInputStream.readInt()); + } + } + + @Override + public boolean supportsAsync() { return false; } + + @Override + public CompletableFuture readAsync(long offset, int len) throws IOException + { + throw new UnsupportedOperationException("Can't get file length in PhysicalStreamReader"); + } + + @Override + public void close() throws IOException + { + this.dataInputStream.close(); + } + + @Override + public String getPath() { return path; } + + /** + * Get the port in path. 
+ * + * @return + */ + @Override + public String getName() + { + if (path == null) + { + return null; + } + int slash = path.lastIndexOf(":"); + return path.substring(slash + 1); + } + + @Override + public long getBlockId() throws IOException + { + throw new IOException("Can't get block id in PhysicalStreamReader"); + } + + @Override + public Storage.Scheme getStorageScheme() { return stream.getScheme(); } + + @Override + public int getNumReadRequests() { return 0; } +} diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamWriter.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamWriter.java new file mode 100644 index 000000000..e15637808 --- /dev/null +++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/PhysicalStreamWriter.java @@ -0,0 +1,115 @@ +/* + * Copyright 2024 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */
+package io.pixelsdb.pixels.storage.stream;
+
+import io.pixelsdb.pixels.common.physical.PhysicalWriter;
+import io.pixelsdb.pixels.common.physical.Storage;
+import io.pixelsdb.pixels.common.utils.Constants;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Physical writer that pushes content through the DataOutputStream created by a stream storage.
+ *
+ * @author huasiy
+ * @create 2024-11-08
+ */
+public class PhysicalStreamWriter implements PhysicalWriter
+{
+    private Stream stream;
+    private String path;
+    private long position;
+    private DataOutputStream dataOutputStream;
+
+    public PhysicalStreamWriter(Storage stream, String path) throws IOException
+    {
+        if (!(stream instanceof Stream))
+        {
+            throw new IOException("Storage is not stream");
+        }
+        this.stream = (Stream) stream;
+        this.path = path;
+        this.dataOutputStream = stream.create(path, false, Constants.STREAM_BUFFER_SIZE);
+    }
+
+    /**
+     * Report where the next write will start; nothing is reserved or written.
+     *
+     * @param length length of the content about to be written (not used)
+     * @return the number of bytes appended so far, i.e. the offset of the next write.
+     */
+    @Override
+    public long prepare(int length) throws IOException
+    {
+        return position;
+    }
+
+    /**
+     * Append the bytes that were put into an array-backed buffer.
+     *
+     * @param buffer content buffer, still in filling mode; it is flipped by this method
+     * @return start offset of content in the file.
+     */
+    @Override
+    public long append(ByteBuffer buffer) throws IOException
+    {
+        // flip() turns the filled buffer into a readable one: position..limit is the content
+        buffer.flip();
+        int start = buffer.arrayOffset() + buffer.position();
+        return append(buffer.array(), start, buffer.remaining());
+    }
+
+    /**
+     * Write a slice of a byte array to the stream and advance the write position.
+     *
+     * @param buffer array holding the content
+     * @param offset index of the first content byte in the array
+     * @param length number of content bytes
+     * @return start offset of content in the file.
+     */
+    @Override
+    public long append(byte[] buffer, int offset, int length) throws IOException
+    {
+        final long startOffset = this.position;
+        this.dataOutputStream.write(buffer, offset, length);
+        this.position = startOffset + length;
+        return startOffset;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        this.dataOutputStream.close();
+    }
+
+    @Override
+    public void flush() throws IOException
+    {
+        this.dataOutputStream.flush();
+    }
+
+    @Override
+    public String getPath() { return path; }
+
+    @Override
+    public int getBufferSize() { return Constants.STREAM_BUFFER_SIZE; }
+}
diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/StreamProvider.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/StreamProvider.java
new file mode 100644
index 000000000..4f0e92054
--- /dev/null
+++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/StreamProvider.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2024 PixelsDB.
+ *
+ * This file is part of Pixels.
+ *
+ * Pixels is free software: you can redistribute it and/or modify
+ * it under the terms of the Affero GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * Pixels is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Affero GNU General Public License for more details.
+ *
+ * You should have received a copy of the Affero GNU General Public
+ * License along with Pixels. If not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+package io.pixelsdb.pixels.storage.stream;
+
+import io.pixelsdb.pixels.common.physical.*;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.io.IOException;
+
+/**
+ * Storage provider for the {@code httpstream} scheme: creates the Stream storage and its physical reader / writer.
+ * @author huasiy
+ * @create 2024-11-05
+ */
+public class StreamProvider implements StorageProvider
+{
+    @Override
+    public Storage createStorage(@Nonnull Storage.Scheme scheme) throws IOException
+    {
+        if (!this.compatibleWith(scheme))
+        {
+            throw new IOException("incompatible storage scheme: " + scheme);
+        }
+        return new Stream();
+    }
+
+    @Override
+    public PhysicalReader createReader(@Nonnull Storage storage, @Nonnull String path, @Nullable PhysicalReaderOption option) throws IOException
+    {
+        if (!this.compatibleWith(storage.getScheme()))
+        {
+            throw new IOException("incompatible storage scheme: " + storage.getScheme());
+        }
+        return new PhysicalStreamReader(storage, path); // option is not consulted for stream readers
+    }
+
+    @Override
+    public PhysicalWriter createWriter(@Nonnull Storage storage, @Nonnull String path, @Nonnull PhysicalWriterOption option) throws IOException
+    {
+        if (!this.compatibleWith(storage.getScheme()))
+        {
+            throw new IOException("incompatible storage scheme: " + storage.getScheme());
+        }
+        return new PhysicalStreamWriter(storage, path); // option is not consulted for stream writers
+    }
+
+    @Override
+    public boolean compatibleWith(@Nonnull Storage.Scheme scheme) { return scheme.equals(Storage.Scheme.httpstream); }
+}
diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java
index ba9aa777d..f2d8de611 100644
--- a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java
+++ b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamInputStream.java
@@ -111,7 +111,7 @@ public int read()
throws IOException int b = -1; if (content != null) { - b = content.readByte(); + b = content.readUnsignedByte(); if (!content.isReadable()) { this.contentQueue.poll(); @@ -126,6 +126,14 @@ public int read(byte[] b) throws IOException return read(b, 0, b.length); } + /** + * Attempt to read data with a maximum length of len into the position off of buf. + * @param buf + * @param off + * @param len + * @return Actual number of bytes read + * @throws IOException + */ @Override public int read(byte[] buf, int off, int len) throws IOException { @@ -139,11 +147,11 @@ public int read(byte[] buf, int off, int len) throws IOException while (readBytes < len && !this.contentQueue.isEmpty()) { ByteBuf content = this.contentQueue.peek(); - int offset = content.readerIndex(); int readLen = Math.min(len-readBytes, content.readableBytes()); - content.readBytes(buf, offset, readLen); + content.readBytes(buf, readBytes, readLen); if (!content.isReadable()) { + content.release(); this.contentQueue.poll(); } readBytes += readLen; @@ -178,7 +186,7 @@ private boolean assertData() throws IOException } } - return this.contentQueue.isEmpty(); + return !this.contentQueue.isEmpty(); } private void assertOpen() @@ -218,8 +226,11 @@ public void channelRead0(ChannelHandlerContext ctx, HttpObject msg) return; } ByteBuf content = req.content(); - content.retain(); - this.inputStream.contentQueue.add(content); + if (content.isReadable()) + { + content.retain(); + this.inputStream.contentQueue.add(content); + } sendResponse(ctx, req, HttpResponseStatus.OK); } diff --git a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java index 5f5fa26ab..68637a3e5 100644 --- a/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java +++ 
b/pixels-storage/pixels-storage-stream/src/main/java/io/pixelsdb/pixels/storage/stream/io/StreamOutputStream.java
@@ -127,6 +127,13 @@ public void write(int b) throws IOException
     public synchronized void flush()
     {
         assertOpen();
+        try
+        {
+            flushBufferAndRewind();
+        } catch (IOException e)
+        {
+            logger.error(e);
+        }
     }
 
     protected void flushBufferAndRewind() throws IOException
@@ -144,9 +151,8 @@ protected void flushBufferAndRewind() throws IOException
             StreamHttpClientHandler handler = new StreamHttpClientHandler();
             try
             {
-                httpClient.executeRequest(req, handler)
-                        .toCompletableFuture()
-                        .get();
+                httpClient.executeRequest(req, handler).get();
+                this.bufferPosition = 0;
                 break;
             } catch (Exception e)
             {
@@ -201,9 +207,7 @@ private void closeStreamReader()
             StreamHttpClientHandler handler = new StreamHttpClientHandler();
             try
             {
-                httpClient.executeRequest(req, handler)
-                        .toCompletableFuture()
-                        .get();
+                httpClient.executeRequest(req, handler).get();
                 break;
             } catch (Exception e)
             {
diff --git a/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java b/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java
index a0efca190..1ba2cf245 100644
--- a/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java
+++ b/pixels-storage/pixels-storage-stream/src/test/java/io/pixelsdb/pixels/storage/stream/TestStream.java
@@ -19,25 +19,124 @@
  */
 package io.pixelsdb.pixels.storage.stream;
 
-import io.pixelsdb.pixels.common.physical.Storage;
-import io.pixelsdb.pixels.common.physical.StorageFactory;
+import io.pixelsdb.pixels.common.physical.*;
 import org.apache.hadoop.io.IOUtils;
 import org.junit.Test;
 
 import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 
+import static com.google.common.base.Preconditions.checkArgument;
+
 public class TestStream
 {
+    // Failure raised inside the reader / writer thread; read by the test thread after join().
+    private volatile Throwable readerException = null;
+    private volatile Throwable writerException = null;
+
     @Test
-    public void test() throws IOException
+    public void testStorage() throws IOException
     {
         Storage stream = StorageFactory.Instance().getStorage(Storage.Scheme.httpstream);
         InputStream fileInput = Files.newInputStream(Paths.get("/tmp/test1"));
         OutputStream outputStream = stream.create("stream:///localhost:29920", false, 4096);
         InputStream inputStream = stream.open("stream:///localhost:29920");
-        OutputStream fileOutput = Files.newOutputStream(Paths.get("/tmp/test2"));IOUtils.copyBytes(fileInput, outputStream, 4096, true);
+        OutputStream fileOutput = Files.newOutputStream(Paths.get("/tmp/test2"));
+        IOUtils.copyBytes(fileInput, outputStream, 4096, true);
         IOUtils.copyBytes(inputStream, fileOutput, 4096, true);
     }
+
+    /**
+     * Occasionally, the physicalReader fails to read the desired length of the string, causing the test to fail,
+     * with a probability of less than 1/20.
+     * @throws IOException if the reader or the writer thread failed; the recorded failure is attached as the cause
+     */
+    @Test
+    public void testPhysicalReaderAndWriter() throws IOException
+    {
+        Storage stream = StorageFactory.Instance().getStorage(Storage.Scheme.httpstream);
+        Thread readerThread = new Thread(() -> {
+            try
+            {
+                try (PhysicalReader fsReader = PhysicalReaderUtil.newPhysicalReader(stream, "stream:///localhost:29920"))
+                {
+                    int num1 = fsReader.readInt(ByteOrder.BIG_ENDIAN);
+                    assert(num1 == 13);
+                    num1 = fsReader.readInt(ByteOrder.BIG_ENDIAN);
+                    assert(num1 == 169);
+
+                    long num2 = fsReader.readLong(ByteOrder.BIG_ENDIAN);
+                    assert(num2 == 28561);
+                    num2 = fsReader.readLong(ByteOrder.BIG_ENDIAN);
+                    assert(num2 == 815730721);
+
+                    ByteBuffer buffer;
+                    for (int len = 1; len < 1000000000; len=len*2)
+                    {
+                        buffer = fsReader.readFully(len);
+                        for (int i = 0; i < len; i++)
+                        {
+                            byte tmp = buffer.get();
+                            if (tmp != 'a')
+                            {
+                                System.out.println(len);
+                                throw new IOException("failed: " + len);
+                            }
+                        }
+                    }
+                }
+            } catch (Throwable e)
+            {
+                // Throwable rather than IOException: a failed assert raises AssertionError and must fail the test too
+                readerException = e;
+                throw new RuntimeException(e);
+            }
+        });
+        Thread writerThread = new Thread(() -> {
+            try
+            {
+                try (PhysicalWriter fsWriter = PhysicalWriterUtil.newPhysicalWriter(stream, "stream:///localhost:29920", null))
+                {
+                    ByteBuffer buffer = ByteBuffer.allocate(24);
+                    buffer.putInt(13);
+                    buffer.putInt(169);
+                    buffer.putLong(28561);
+                    buffer.putLong(815730721);
+                    fsWriter.append(buffer);
+                    fsWriter.flush();
+                    for (int len = 1; len < 1000000000; len=len*2)
+                    {
+                        buffer = ByteBuffer.allocate(len);
+                        for (int i = 0; i < len; i++)
+                        {
+                            buffer.put((byte) 'a');
+                        }
+                        fsWriter.append(buffer);
+                    }
+                }
+            } catch (Throwable e)
+            {
+                writerException = e;
+                throw new RuntimeException(e);
+            }
+        });
+        readerThread.start();
+        writerThread.start();
+        try
+        {
+            readerThread.join();
+            writerThread.join();
+            Throwable failure = this.readerException != null ? this.readerException : this.writerException;
+            if (failure != null)
+            {
+                throw new IOException("reader or writer thread failed", failure);
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        }
+    }
 }