Skip to content

Commit ddba013

Browse files
authored
planner: introduce SPOT VMs policy (#433)
1 parent 35af765 commit ddba013

26 files changed

+1618
-56
lines changed

.env

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FAABRIC_VERSION=0.18.0
2-
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.18.0
1+
FAABRIC_VERSION=0.19.0
2+
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.19.0
33
COMPOSE_PROJECT_NAME=faabric-dev
44
CONAN_CACHE_MOUNT_SOURCE=./conan-cache/

.github/workflows/tests.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
if: github.event.pull_request.draft == false
2121
runs-on: ubuntu-latest
2222
container:
23-
image: faasm.azurecr.io/faabric:0.18.0
23+
image: faasm.azurecr.io/faabric:0.19.0
2424
env:
2525
DEPLOYMENT_TYPE: gha-ci
2626
steps:
@@ -34,7 +34,7 @@ jobs:
3434
if: github.event.pull_request.draft == false
3535
runs-on: ubuntu-latest
3636
container:
37-
image: faasm.azurecr.io/faabric:0.18.0
37+
image: faasm.azurecr.io/faabric:0.19.0
3838
steps:
3939
- name: "Check out code"
4040
uses: actions/checkout@v4
@@ -45,7 +45,7 @@ jobs:
4545
if: github.event.pull_request.draft == false
4646
runs-on: ubuntu-latest
4747
container:
48-
image: faasm.azurecr.io/faabric:0.18.0
48+
image: faasm.azurecr.io/faabric:0.19.0
4949
steps:
5050
- name: "Check out code"
5151
uses: actions/checkout@v4
@@ -65,7 +65,7 @@ jobs:
6565
REDIS_QUEUE_HOST: redis
6666
REDIS_STATE_HOST: redis
6767
container:
68-
image: faasm.azurecr.io/faabric:0.18.0
68+
image: faasm.azurecr.io/faabric:0.19.0
6969
options: --privileged
7070
services:
7171
redis:
@@ -104,7 +104,7 @@ jobs:
104104
REDIS_QUEUE_HOST: redis
105105
REDIS_STATE_HOST: redis
106106
container:
107-
image: faasm.azurecr.io/faabric:0.18.0
107+
image: faasm.azurecr.io/faabric:0.19.0
108108
options: --privileged
109109
services:
110110
redis:
@@ -156,7 +156,7 @@ jobs:
156156
REDIS_QUEUE_HOST: redis
157157
REDIS_STATE_HOST: redis
158158
container:
159-
image: faasm.azurecr.io/faabric:0.18.0
159+
image: faasm.azurecr.io/faabric:0.19.0
160160
services:
161161
redis:
162162
image: redis

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.18.0
1+
0.19.0

include/faabric/batch-scheduler/BatchScheduler.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
#define NOT_ENOUGH_SLOTS_DECISION \
1313
faabric::batch_scheduler::SchedulingDecision(NOT_ENOUGH_SLOTS, \
1414
NOT_ENOUGH_SLOTS)
15+
#define MUST_FREEZE -97
16+
#define MUST_FREEZE_DECISION \
17+
faabric::batch_scheduler::SchedulingDecision(MUST_FREEZE, MUST_FREEZE)
18+
19+
#define MUST_EVICT_IP "E.VI.CT.ME"
1520

1621
namespace faabric::batch_scheduler {
1722

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#pragma once
2+
3+
#include <faabric/batch-scheduler/BatchScheduler.h>
4+
#include <faabric/batch-scheduler/SchedulingDecision.h>
5+
#include <faabric/util/batch.h>
6+
7+
namespace faabric::batch_scheduler {
8+
9+
// This batch scheduler behaves in the same way as BinPack for NEW and
10+
// SCALE_CHANGE requests, but for DIST_CHANGE it considers if any of the
11+
// hosts in the Host Map have been tainted with the eviction mark. In that
12+
// case it first tries to migrate them to other running hosts and, if not
13+
// enough hosts are available, freezes the messages.
14+
class SpotScheduler final : public BatchScheduler
15+
{
16+
public:
17+
std::shared_ptr<SchedulingDecision> makeSchedulingDecision(
18+
HostMap& hostMap,
19+
const InFlightReqs& inFlightReqs,
20+
std::shared_ptr<faabric::BatchExecuteRequest> req) override;
21+
22+
private:
23+
bool isFirstDecisionBetter(
24+
std::shared_ptr<SchedulingDecision> decisionA,
25+
std::shared_ptr<SchedulingDecision> decisionB) override;
26+
27+
std::vector<Host> getSortedHosts(
28+
HostMap& hostMap,
29+
const InFlightReqs& inFlightReqs,
30+
std::shared_ptr<faabric::BatchExecuteRequest> req,
31+
const DecisionType& decisionType) override;
32+
};
33+
}

include/faabric/planner/Planner.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ class Planner
3333

3434
void printConfig() const;
3535

36+
std::string getPolicy();
37+
3638
void setPolicy(const std::string& newPolicy);
3739

3840
// ----------
@@ -87,10 +89,21 @@ class Planner
8789
// the planner was last reset
8890
int getNumMigrations();
8991

92+
// Helper method to get the set of host IPs that will be evicted next
93+
std::set<std::string> getNextEvictedHostIps();
94+
95+
std::map<int32_t, std::shared_ptr<BatchExecuteRequest>> getEvictedReqs();
96+
9097
// Main entrypoint to request the execution of batches
9198
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> callBatch(
9299
std::shared_ptr<BatchExecuteRequest> req);
93100

101+
// ----------
102+
// API exclusive to SPOT policy mode
103+
// ----------
104+
105+
void setNextEvictedVm(const std::set<std::string>& vmIp);
106+
94107
private:
95108
// There's a singleton instance of the planner running, but it must allow
96109
// concurrent requests

include/faabric/planner/PlannerState.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ namespace faabric::planner {
1212
*/
1313
struct PlannerState
1414
{
15+
// Policy to operate the planner in. Mostly determines the batch scheduler
16+
// behaviour, but also the planner's in some cases
17+
std::string policy;
18+
1519
// Accounting of the hosts that are registered in the system and responsive
1620
// We deliberately use the host's IP as unique key, but assign a unique host
1721
// id for redundancy
@@ -36,5 +40,19 @@ struct PlannerState
3640

3741
// Helper counter of the total number of migrations
3842
std::atomic<int> numMigrations = 0;
43+
44+
// -----
45+
// Data structures used only under the SPOT policy
46+
// -----
47+
48+
// Map containing the BERs that have been evicted due to a SPOT VM eviction.
49+
// All messages in the VM have been checkpointed, are in the snapshot
50+
// registry in the planner, and are ready to be scheduled when capacity
51+
// appears
52+
std::map<int, std::shared_ptr<BatchExecuteRequest>> evictedRequests;
53+
54+
// This variable simulates the values we would get from a cloud provider's
55+
// API indicating the (set of) VM to be evicted next
56+
std::set<std::string> nextEvictedHostIps;
3957
};
4058
}

include/faabric/util/func.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,18 @@
66
#include <vector>
77

88
#define MIGRATED_FUNCTION_RETURN_VALUE -99
9+
#define FROZEN_FUNCTION_RETURN_VALUE -98
910

1011
namespace faabric::util {
1112

13+
class FunctionFrozenException : public faabric::util::FaabricException
14+
{
15+
public:
16+
explicit FunctionFrozenException(std::string message)
17+
: FaabricException(std::move(message))
18+
{}
19+
};
20+
1221
class FunctionMigratedException : public faabric::util::FaabricException
1322
{
1423
public:

src/batch-scheduler/BatchScheduler.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <faabric/batch-scheduler/BatchScheduler.h>
22
#include <faabric/batch-scheduler/BinPackScheduler.h>
33
#include <faabric/batch-scheduler/CompactScheduler.h>
4+
#include <faabric/batch-scheduler/SpotScheduler.h>
45
#include <faabric/util/config.h>
56
#include <faabric/util/logging.h>
67

@@ -23,6 +24,8 @@ std::shared_ptr<BatchScheduler> getBatchScheduler()
2324
batchScheduler = std::make_shared<BinPackScheduler>();
2425
} else if (mode == "compact") {
2526
batchScheduler = std::make_shared<CompactScheduler>();
27+
} else if (mode == "spot") {
28+
batchScheduler = std::make_shared<SpotScheduler>();
2629
} else {
2730
SPDLOG_ERROR("Unrecognised batch scheduler mode: {}", mode);
2831
throw std::runtime_error("Unrecognised batch scheduler mode");

src/batch-scheduler/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ faabric_lib(batch_scheduler
77
BatchScheduler.cpp
88
BinPackScheduler.cpp
99
CompactScheduler.cpp
10+
SpotScheduler.cpp
1011
)
1112

1213
target_link_libraries(batch_scheduler PRIVATE

src/batch-scheduler/CompactScheduler.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ bool CompactScheduler::isFirstDecisionBetter(
9898
throw std::runtime_error("Method not supported for COMPACT scheduler");
9999
}
100100

101-
HostMap deepCopyHostMap(const HostMap& hostMap)
101+
static HostMap deepCopyHostMap(const HostMap& hostMap)
102102
{
103103
HostMap newHostMap;
104104

@@ -173,9 +173,9 @@ bool CompactScheduler::isFirstDecisionBetter(
173173

174174
// Filter-out from the host map all nodes that are executing requests from a
175175
// different user
176-
void filterHosts(HostMap& hostMap,
177-
const InFlightReqs& inFlightReqs,
178-
std::shared_ptr<faabric::BatchExecuteRequest> req)
176+
static void filterHosts(HostMap& hostMap,
177+
const InFlightReqs& inFlightReqs,
178+
std::shared_ptr<faabric::BatchExecuteRequest> req)
179179
{
180180
// We temporarily use the request subtype field to attach a user id for our
181181
// multi-tenant simulations

0 commit comments

Comments
 (0)