Skip to content

Commit db0cc66

Browse files
authored
planner: track number of migrations that have taken place (#358)
* planner: add client/server method to get the number of migrations and use in tests * gh: bump minor code version * planner-cli: return the number of migrations * planner: add very simple client/server test
1 parent 217c6f5 commit db0cc66

18 files changed

+93
-18
lines changed

.env

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FAABRIC_VERSION=0.10.0
2-
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.10.0
1+
FAABRIC_VERSION=0.11.0
2+
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.11.0
33
COMPOSE_PROJECT_NAME=faabric-dev
44
CONAN_CACHE_MOUNT_SOURCE=./conan-cache/

.github/workflows/tests.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
if: github.event.pull_request.draft == false
2424
runs-on: ubuntu-latest
2525
container:
26-
image: faasm.azurecr.io/faabric:0.10.0
26+
image: faasm.azurecr.io/faabric:0.11.0
2727
credentials:
2828
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
2929
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}
@@ -36,7 +36,7 @@ jobs:
3636
if: github.event.pull_request.draft == false
3737
runs-on: ubuntu-latest
3838
container:
39-
image: faasm.azurecr.io/faabric:0.10.0
39+
image: faasm.azurecr.io/faabric:0.11.0
4040
credentials:
4141
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
4242
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}
@@ -50,7 +50,7 @@ jobs:
5050
if: github.event.pull_request.draft == false
5151
runs-on: ubuntu-latest
5252
container:
53-
image: faasm.azurecr.io/faabric:0.10.0
53+
image: faasm.azurecr.io/faabric:0.11.0
5454
credentials:
5555
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
5656
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}
@@ -73,7 +73,7 @@ jobs:
7373
REDIS_QUEUE_HOST: redis
7474
REDIS_STATE_HOST: redis
7575
container:
76-
image: faasm.azurecr.io/faabric:0.10.0
76+
image: faasm.azurecr.io/faabric:0.11.0
7777
credentials:
7878
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
7979
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}
@@ -113,7 +113,7 @@ jobs:
113113
REDIS_QUEUE_HOST: redis
114114
REDIS_STATE_HOST: redis
115115
container:
116-
image: faasm.azurecr.io/faabric:0.10.0
116+
image: faasm.azurecr.io/faabric:0.11.0
117117
credentials:
118118
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
119119
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}
@@ -167,7 +167,7 @@ jobs:
167167
REDIS_QUEUE_HOST: redis
168168
REDIS_STATE_HOST: redis
169169
container:
170-
image: faasm.azurecr.io/faabric:0.10.0
170+
image: faasm.azurecr.io/faabric:0.11.0
171171
credentials:
172172
username: ${{ secrets.ACR_SERVICE_PRINCIPAL_ID }}
173173
password: ${{ secrets.ACR_SERVICE_PRINCIPAL_PASSWORD }}

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.10.0
1+
0.11.0

include/faabric/planner/Planner.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ class Planner
8181

8282
faabric::batch_scheduler::InFlightReqs getInFlightReqs();
8383

84+
// Helper method to get the number of migrations that have happened since
85+
// the planner was last reset
86+
int getNumMigrations();
87+
8488
// Main entrypoint to request the execution of batches
8589
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> callBatch(
8690
std::shared_ptr<BatchExecuteRequest> req);

include/faabric/planner/PlannerApi.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ enum PlannerCalls
1515
GetMessageResult = 9,
1616
GetBatchResults = 10,
1717
GetSchedulingDecision = 11,
18-
CallBatch = 12,
19-
PreloadSchedulingDecision = 13,
18+
GetNumMigrations = 12,
19+
CallBatch = 13,
20+
PreloadSchedulingDecision = 14,
2021
};
2122
}

include/faabric/planner/PlannerClient.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class PlannerClient final : public faabric::transport::MessageEndpointClient
5757

5858
void ping();
5959

60+
// Clear the client-local cache
6061
void clearCache();
6162

6263
// ------
@@ -92,6 +93,8 @@ class PlannerClient final : public faabric::transport::MessageEndpointClient
9293
faabric::batch_scheduler::SchedulingDecision getSchedulingDecision(
9394
std::shared_ptr<faabric::BatchExecuteRequest> req);
9495

96+
int getNumMigrations();
97+
9598
void preloadSchedulingDecision(
9699
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> preloadDec);
97100

include/faabric/planner/PlannerServer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ class PlannerServer final : public faabric::transport::MessageEndpointServer
4040
std::unique_ptr<google::protobuf::Message> recvGetSchedulingDecision(
4141
std::span<const uint8_t> buffer);
4242

43+
std::unique_ptr<google::protobuf::Message> recvGetNumMigrations(
44+
std::span<const uint8_t> buffer);
45+
4346
std::unique_ptr<google::protobuf::Message> recvPreloadSchedulingDecision(
4447
std::span<const uint8_t> buffer);
4548

include/faabric/planner/PlannerState.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,8 @@ struct PlannerState
3333
// planner's scheduling
3434
std::map<int, std::shared_ptr<batch_scheduler::SchedulingDecision>>
3535
preloadedSchedulingDecisions;
36+
37+
// Helper coutner of the total number of migrations
38+
std::atomic<int> numMigrations = 0;
3639
};
3740
}

src/endpoint/FaabricEndpointHandler.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,19 @@ void FaabricEndpointHandler::onRequest(
3838
// TODO: for the moment we keep the endpoint handler, but we are not meant
3939
// to receive any requests here. Eventually we will delete it
4040
if (requestStr.empty()) {
41-
SPDLOG_ERROR("Planner handler received empty request");
41+
SPDLOG_WARN("Worker HTTP handler received empty request!");
4242
response.result(beast::http::status::bad_request);
4343
response.body() = std::string("Empty request");
4444
return ctx.sendFunction(std::move(response));
4545
}
4646

47-
SPDLOG_ERROR("Worker HTTP handler received non-empty request (body: {})",
48-
request.body());
49-
throw std::runtime_error("Worker HTTP handler received non-empty request");
47+
// We don't expect to receive any requests here, but we sometimes do (e.g.
48+
// probing if exposed port is vulnerable to certain CVEs when deployed in
49+
// AKS)
50+
SPDLOG_WARN("Worker HTTP handler received non-empty request (body: {})",
51+
request.body());
52+
response.result(beast::http::status::bad_request);
53+
response.body() = std::string("Unexpected request!");
54+
return ctx.sendFunction(std::move(response));
5055
}
5156
}

src/planner/Planner.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ void Planner::flushSchedulingState()
151151
state.inFlightReqs.clear();
152152
state.appResults.clear();
153153
state.appResultWaiters.clear();
154+
state.numMigrations = 0;
154155
}
155156

156157
std::vector<std::shared_ptr<Host>> Planner::getAvailableHosts()
@@ -517,6 +518,11 @@ faabric::batch_scheduler::InFlightReqs Planner::getInFlightReqs()
517518
return inFlightReqsCopy;
518519
}
519520

521+
int Planner::getNumMigrations()
522+
{
523+
return state.numMigrations.load(std::memory_order_acquire);
524+
}
525+
520526
static faabric::batch_scheduler::HostMap convertToBatchSchedHostMap(
521527
std::map<std::string, std::shared_ptr<Host>> hostMapIn)
522528
{
@@ -662,6 +668,7 @@ Planner::callBatch(std::shared_ptr<BatchExecuteRequest> req)
662668
oldDec->print("info");
663669
SPDLOG_INFO("New decision:");
664670
decision->print("info");
671+
state.numMigrations += 1;
665672

666673
// We want to let all hosts involved in the migration (not only
667674
// those in the new decision) that we are gonna migrate. For the

src/planner/PlannerClient.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,16 @@ PlannerClient::getSchedulingDecision(
381381
return decision;
382382
}
383383

384+
int PlannerClient::getNumMigrations()
385+
{
386+
EmptyRequest request;
387+
NumMigrationsResponse response;
388+
389+
syncSend(PlannerCalls::GetNumMigrations, &request, &response);
390+
391+
return response.nummigrations();
392+
}
393+
384394
void PlannerClient::preloadSchedulingDecision(
385395
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> preloadDec)
386396
{

src/planner/PlannerEndpointHandler.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ void PlannerEndpointHandler::onRequest(
185185
}
186186
}
187187

188+
// Also include the total number of migrations to-date
189+
int numMigrations =
190+
faabric::planner::getPlanner().getNumMigrations();
191+
inFlightAppsResponse.set_nummigrations(numMigrations);
192+
188193
response.result(beast::http::status::ok);
189194
response.body() =
190195
faabric::util::messageToJson(inFlightAppsResponse);

src/planner/PlannerServer.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ std::unique_ptr<google::protobuf::Message> PlannerServer::doSyncRecv(
6161
case PlannerCalls::GetSchedulingDecision: {
6262
return recvGetSchedulingDecision(message.udata());
6363
}
64+
case PlannerCalls::GetNumMigrations: {
65+
return recvGetNumMigrations(message.udata());
66+
}
6467
case PlannerCalls::PreloadSchedulingDecision: {
6568
return recvPreloadSchedulingDecision(message.udata());
6669
}
@@ -195,6 +198,15 @@ PlannerServer::recvGetSchedulingDecision(std::span<const uint8_t> buffer)
195198
return std::make_unique<faabric::PointToPointMappings>(mappings);
196199
}
197200

201+
std::unique_ptr<google::protobuf::Message> PlannerServer::recvGetNumMigrations(
202+
std::span<const uint8_t> buffer)
203+
{
204+
NumMigrationsResponse response;
205+
response.set_nummigrations(planner.getNumMigrations());
206+
207+
return std::make_unique<NumMigrationsResponse>(response);
208+
}
209+
198210
std::unique_ptr<google::protobuf::Message>
199211
PlannerServer::recvPreloadSchedulingDecision(std::span<const uint8_t> buffer)
200212
{

src/planner/planner.proto

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ message HttpMessage {
4242
GET_CONFIG = 6;
4343
GET_EXEC_GRAPH = 7;
4444
GET_IN_FLIGHT_APPS = 8;
45-
EXECUTE_BATCH = 9;
46-
EXECUTE_BATCH_STATUS = 10;
47-
PRELOAD_SCHEDULING_DECISION = 11;
45+
EXECUTE_BATCH = 10;
46+
EXECUTE_BATCH_STATUS = 11;
47+
PRELOAD_SCHEDULING_DECISION = 12;
4848
}
4949

5050
Type type = 1 [json_name = "http_type"];
@@ -65,6 +65,11 @@ message GetInFlightAppsResponse {
6565
}
6666

6767
repeated InFlightApp apps = 1;
68+
int32 numMigrations = 2;
69+
}
70+
71+
message NumMigrationsResponse {
72+
int32 numMigrations = 1;
6873
}
6974

7075
// ---------------------------------------------

tests/dist/dist_test_fixtures.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class DistTestsFixture
2121
public:
2222
DistTestsFixture()
2323
{
24+
// Reset the planner
25+
resetPlanner();
26+
2427
// Make sure the host list is up to date
2528
sch.addHostToGlobalSet(getMasterIP());
2629
sch.addHostToGlobalSet(getWorkerIP());

tests/dist/mpi/test_mpi_functions.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ TEST_CASE_METHOD(MpiDistTestsFixture, "Test MPI function migration", "[mpi]")
189189

190190
// Wait for messages to be finished
191191
checkAllocationAndResult(req, hostsAfterMigration);
192+
193+
// Check that, indeed, one migration happened
194+
REQUIRE(plannerCli.getNumMigrations() == 1);
192195
}
193196

194197
TEST_CASE_METHOD(MpiDistTestsFixture, "Test MPI gather", "[mpi]")

tests/test/planner/test_planner_client_server.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,16 @@ TEST_CASE_METHOD(PlannerClientServerExecTestFixture,
259259
}
260260
}
261261

262+
TEST_CASE_METHOD(PlannerClientServerExecTestFixture,
263+
"Test getting the number of migrations",
264+
"[planner]")
265+
{
266+
// We should be able to get the number of migrations from the planner
267+
REQUIRE(plannerCli.getNumMigrations() == 0);
268+
269+
// To see a non-zero result, we need a distributed test
270+
}
271+
262272
TEST_CASE_METHOD(PlannerClientServerExecTestFixture,
263273
"Test preloading a scheduling decision from the client",
264274
"[planner]")

tests/test/planner/test_planner_endpoint.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ TEST_CASE_METHOD(PlannerEndpointExecTestFixture,
630630
HttpMessage inFlightMsg;
631631
inFlightMsg.set_type(HttpMessage_Type_GET_IN_FLIGHT_APPS);
632632
GetInFlightAppsResponse expectedResponse;
633+
expectedResponse.set_nummigrations(0);
633634
msgJsonStr = faabric::util::messageToJson(inFlightMsg);
634635
expectedResponseBody = faabric::util::messageToJson(expectedResponse);
635636
expectedReturnCode = beast::http::status::ok;

0 commit comments

Comments
 (0)