Skip to content

Commit

Permalink
Discourage concurrent creation of VMs on a single host
Browse files Browse the repository at this point in the history
In the allocator, we add a penalty for assigning a VM to a host that
is busy provisioning other VMs.
  • Loading branch information
bsatzger committed May 10, 2024
1 parent 8e3510f commit e2b9870
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
12 changes: 10 additions & 2 deletions scheduling/allocator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def self.candidate_hosts(request)
.join(:total_ipv4, routed_to_host_id: Sequel[:vm_host][:id])
.join(:used_ipv4, routed_to_host_id: Sequel[:vm_host][:id])
.left_join(:gpus, vm_host_id: Sequel[:vm_host][:id])
.select(Sequel[:vm_host][:id].as(:vm_host_id), :total_cores, :used_cores, :total_hugepages_1g, :used_hugepages_1g, :location, :num_storage_devices, :available_storage_gib, :total_storage_gib, :storage_devices, :total_ipv4, :used_ipv4, Sequel.function(:coalesce, :num_gpus, 0).as(:num_gpus), Sequel.function(:coalesce, :available_gpus, 0).as(:available_gpus), :available_iommu_groups)
.left_join(:vm_provisioning, vm_host_id: Sequel[:vm_host][:id])
.select(Sequel[:vm_host][:id].as(:vm_host_id), :total_cores, :used_cores, :total_hugepages_1g, :used_hugepages_1g, :location, :num_storage_devices, :available_storage_gib, :total_storage_gib, :storage_devices, :total_ipv4, :used_ipv4, Sequel.function(:coalesce, :num_gpus, 0).as(:num_gpus), Sequel.function(:coalesce, :available_gpus, 0).as(:available_gpus), :available_iommu_groups, Sequel.function(:coalesce, :vm_provisioning_count, 0).as(:vm_provisioning_count))
.where(arch: request.arch_filter)
.where { (total_hugepages_1g - used_hugepages_1g >= request.mem_gib) }
.where { (total_cores - used_cores >= request.cores) }
Expand All @@ -69,6 +70,10 @@ def self.candidate_hosts(request)
.select_append { sum(Sequel.case({{vm_id: nil} => 1}, 0)).as(available_gpus) }
.select_append { array_remove(array_agg(Sequel.case({{vm_id: nil} => :iommu_group}, nil)), nil).as(available_iommu_groups) }
.where(device_class: ["0300", "0302"]))
.with(:vm_provisioning, DB[:vm]
.select_group(:vm_host_id)
.select_append { count.function.*.as(vm_provisioning_count) }
.where(display_state: "creating"))

ds = ds.where { used_ipv4 < total_ipv4 } if request.ip4_enabled
ds = ds.where { available_gpus > 0 } if request.gpu_enabled
Expand Down Expand Up @@ -132,13 +137,16 @@ def calculate_score
# imbalance score, in range [0, 1]
imbalance_score = util.max - util.min

# penalty of 0.2 for each VM currently being provisioned on the host
vm_provisioning_penalty = @candidate_host[:vm_provisioning_count] * 0.2

# penalty of 5 if host has a GPU but VM doesn't require a GPU
gpu_penalty = (@request.gpu_enabled || @candidate_host[:num_gpus] == 0) ? 0 : 5

# penalty of 10 if location preference is not honored
location_preference_penalty = (@request.location_preference.empty? || @request.location_preference.include?(@candidate_host[:location])) ? 0 : 10

utilization_score + imbalance_score + gpu_penalty + location_preference_penalty
utilization_score + imbalance_score + vm_provisioning_penalty + gpu_penalty + location_preference_penalty
end
end

Expand Down
39 changes: 37 additions & 2 deletions spec/scheduling/allocator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,34 @@
num_gpus: 0,
available_gpus: 0,
available_iommu_groups: nil,
used_ipv4: 1}])
used_ipv4: 1,
vm_provisioning_count: 0}])
end

# Checks that candidate_hosts reports the number of VMs being provisioned on a host
# through the vm_provisioning_count column.
it "retrieves provisioning count" do
# One accepting host with free cores and hugepages, an address range and a storage device.
vmh = VmHost.create(allocation_state: "accepting", arch: "x64", location: "loc1", total_cores: 7, used_cores: 3, total_hugepages_1g: 10, used_hugepages_1g: 2) { _1.id = Sshable.create_with_id.id }
Address.create_with_id(cidr: "1.1.1.0/30", routed_to_host_id: vmh.id)
sd1 = StorageDevice.create_with_id(vm_host_id: vmh.id, name: "stor1", available_storage_gib: 123, total_storage_gib: 345)
# Two VMs placed on that host. No display_state is passed here, while the allocator
# only counts VMs whose display_state is "creating"; the expected count of 2 therefore
# presumably relies on new Vm rows defaulting to "creating" — TODO confirm against the schema.
Vm.create_with_id(vm_host_id: vmh.id, family: "standard", cores: 1, name: "dummy-vm", arch: "x64", location: "loc1", ip4_enabled: false, created_at: Time.now, unix_user: "", public_key: "", boot_image: "")
Vm.create_with_id(vm_host_id: vmh.id, family: "standard", cores: 1, name: "dummy-vm", arch: "x64", location: "loc1", ip4_enabled: false, created_at: Time.now, unix_user: "", public_key: "", boot_image: "")

# Exactly one candidate row is expected, carrying vm_provisioning_count: 2
# (one per VM created above).
expect(Al::Allocation.candidate_hosts(req))
.to eq([{location: vmh.location,
num_storage_devices: 1,
storage_devices: [{"available_storage_gib" => sd1.available_storage_gib, "id" => sd1.id, "total_storage_gib" => sd1.total_storage_gib}],
total_cores: vmh.total_cores,
total_hugepages_1g: vmh.total_hugepages_1g,
total_storage_gib: sd1.total_storage_gib,
available_storage_gib: sd1.available_storage_gib,
used_cores: vmh.used_cores,
used_hugepages_1g: vmh.used_hugepages_1g,
vm_host_id: vmh.id,
total_ipv4: 4,
num_gpus: 0,
available_gpus: 0,
available_iommu_groups: nil,
used_ipv4: 1,
vm_provisioning_count: 2}])
end

it "applies host filter" do
Expand Down Expand Up @@ -226,7 +253,8 @@
used_hugepages_1g: 9,
num_gpus: 0,
available_gpus: 0,
vm_host_id: "the_id"}
vm_host_id: "the_id",
vm_provisioning_count: 0}
}

it "initializes individual resource allocations" do
Expand Down Expand Up @@ -296,6 +324,13 @@
expect(score_imbalance).to be > score_balance
end

# Checks that a host with a VM provisioning in progress gets a positive score.
it "penalizes concurrent provisioning" do
# Stub the resource allocations so they are built from the request's target host utilization.
# NOTE(review): presumably this makes the utilization and imbalance terms contribute nothing,
# leaving the provisioning penalty as the only positive term — confirm against TestResourceAllocation.
expect(Al::VmHostAllocation).to receive(:new).twice.and_return(TestResourceAllocation.new(req.target_host_utilization, true))
expect(Al::StorageAllocation).to receive(:new).and_return(TestResourceAllocation.new(req.target_host_utilization, true))
# One VM currently being provisioned on the candidate host.
vmhds[:vm_provisioning_count] = 1
# Only asserts that the score is positive; the exact penalty value is not pinned here.
expect(Al::Allocation.new(vmhds, req, 0).score).to be > 0
end

it "respects location preferences" do
expect(Al::VmHostAllocation).to receive(:new).twice.and_return(TestResourceAllocation.new(0, true))
expect(Al::StorageAllocation).to receive(:new).and_return(TestResourceAllocation.new(0, true))
Expand Down

0 comments on commit e2b9870

Please sign in to comment.