ubicloud · bsatzger · May 10, 2024 · May 9, 2024
diff --git a/scheduling/allocator.rb b/scheduling/allocator.rb
@@ -43,7 +43,8 @@ def self.candidate_hosts(request)
         .join(:total_ipv4, routed_to_host_id: Sequel[:vm_host][:id])
         .join(:used_ipv4, routed_to_host_id: Sequel[:vm_host][:id])
         .left_join(:gpus, vm_host_id: Sequel[:vm_host][:id])
-        .select(Sequel[:vm_host][:id].as(:vm_host_id), :total_cores, :used_cores, :total_hugepages_1g, :used_hugepages_1g, :location, :num_storage_devices, :available_storage_gib, :total_storage_gib, :storage_devices, :total_ipv4, :used_ipv4, Sequel.function(:coalesce, :num_gpus, 0).as(:num_gpus), Sequel.function(:coalesce, :available_gpus, 0).as(:available_gpus), :available_iommu_groups)
+        .left_join(:vm_provisioning, vm_host_id: Sequel[:vm_host][:id])
+        .select(Sequel[:vm_host][:id].as(:vm_host_id), :total_cores, :used_cores, :total_hugepages_1g, :used_hugepages_1g, :location, :num_storage_devices, :available_storage_gib, :total_storage_gib, :storage_devices, :total_ipv4, :used_ipv4, Sequel.function(:coalesce, :num_gpus, 0).as(:num_gpus), Sequel.function(:coalesce, :available_gpus, 0).as(:available_gpus), :available_iommu_groups, Sequel.function(:coalesce, :vm_provisioning_count, 0).as(:vm_provisioning_count))
         .where(arch: request.arch_filter)
         .where { (total_hugepages_1g - used_hugepages_1g >= request.mem_gib) }
         .where { (total_cores - used_cores >= request.cores) }
@@ -69,6 +70,10 @@ def self.candidate_hosts(request)
           .select_append { sum(Sequel.case({{vm_id: nil} => 1}, 0)).as(available_gpus) }
           .select_append { array_remove(array_agg(Sequel.case({{vm_id: nil} => :iommu_group}, nil)), nil).as(available_iommu_groups) }
           .where(device_class: ["0300", "0302"]))
+        .with(:vm_provisioning, DB[:vm]
+          .select_group(:vm_host_id)
+          .select_append { count.function.*.as(vm_provisioning_count) }
+          .where(display_state: "creating"))
 
       ds = ds.where { used_ipv4 < total_ipv4 } if request.ip4_enabled
       ds = ds.where { available_gpus > 0 } if request.gpu_enabled
@@ -132,13 +137,16 @@ def calculate_score
       # imbalance score, in range [0, 1]
       imbalance_score = util.max - util.min
 
+      # penalty of 0.2 for each VM currently being provisioned on the host
+      vm_provisioning_penalty = @candidate_host[:vm_provisioning_count] * 0.2
+
       # penalty of 5 if host has a GPU but VM doesn't require a GPU
       gpu_penalty = (@request.gpu_enabled || @candidate_host[:num_gpus] == 0) ? 0 : 5
 
       # penalty of 10 if location preference is not honored
       location_preference_penalty = (@request.location_preference.empty? || @request.location_preference.include?(@candidate_host[:location])) ? 0 : 10
 
-      utilization_score + imbalance_score + gpu_penalty + location_preference_penalty
+      utilization_score + imbalance_score + vm_provisioning_penalty + gpu_penalty + location_preference_penalty
     end
   end
 

diff --git a/spec/scheduling/allocator_spec.rb b/spec/scheduling/allocator_spec.rb
@@ -111,7 +111,34 @@
                  num_gpus: 0,
                  available_gpus: 0,
                  available_iommu_groups: nil,
-                 used_ipv4: 1}])
+                 used_ipv4: 1,
+                 vm_provisioning_count: 0}])
+    end
+
+    it "retrieves provisioning count" do # two VMs on one host should surface as vm_provisioning_count: 2 in the candidate row
+      vmh = VmHost.create(allocation_state: "accepting", arch: "x64", location: "loc1", total_cores: 7, used_cores: 3, total_hugepages_1g: 10, used_hugepages_1g: 2) { _1.id = Sshable.create_with_id.id }
+      Address.create_with_id(cidr: "1.1.1.0/30", routed_to_host_id: vmh.id)
+      sd1 = StorageDevice.create_with_id(vm_host_id: vmh.id, name: "stor1", available_storage_gib: 123, total_storage_gib: 345)
+      Vm.create_with_id(vm_host_id: vmh.id, family: "standard", cores: 1, name: "dummy-vm", arch: "x64", location: "loc1", ip4_enabled: false, created_at: Time.now, unix_user: "", public_key: "", boot_image: "") # NOTE(review): display_state is not set; presumably it defaults to "creating", the state the allocator counts - confirm
+      Vm.create_with_id(vm_host_id: vmh.id, family: "standard", cores: 1, name: "dummy-vm", arch: "x64", location: "loc1", ip4_enabled: false, created_at: Time.now, unix_user: "", public_key: "", boot_image: "") # second VM on the same host; the same display_state assumption applies
+
+      expect(Al::Allocation.candidate_hosts(req))
+        .to eq([{location: vmh.location,
+                 num_storage_devices: 1,
+                 storage_devices: [{"available_storage_gib" => sd1.available_storage_gib, "id" => sd1.id, "total_storage_gib" => sd1.total_storage_gib}],
+                 total_cores: vmh.total_cores,
+                 total_hugepages_1g: vmh.total_hugepages_1g,
+                 total_storage_gib: sd1.total_storage_gib,
+                 available_storage_gib: sd1.available_storage_gib,
+                 used_cores: vmh.used_cores,
+                 used_hugepages_1g: vmh.used_hugepages_1g,
+                 vm_host_id: vmh.id,
+                 total_ipv4: 4,
+                 num_gpus: 0,
+                 available_gpus: 0,
+                 available_iommu_groups: nil,
+                 used_ipv4: 1,
+                 vm_provisioning_count: 2}]) # one per VM created above
     end
 
     it "applies host filter" do
@@ -226,7 +253,8 @@
        used_hugepages_1g: 9,
        num_gpus: 0,
        available_gpus: 0,
-       vm_host_id: "the_id"}
+       vm_host_id: "the_id",
+       vm_provisioning_count: 0}
     }
 
     it "initializes individual resource allocations" do
@@ -296,6 +324,13 @@
       expect(score_imbalance).to be > score_balance
     end
 
+    it "penalizes concurrent provisioning" do # a host that reports an ongoing provisioning must score above 0
+      expect(Al::VmHostAllocation).to receive(:new).twice.and_return(TestResourceAllocation.new(req.target_host_utilization, true)) # stub built from req.target_host_utilization - NOTE(review): presumably this keeps the utilization part of the score at 0; confirm
+      expect(Al::StorageAllocation).to receive(:new).and_return(TestResourceAllocation.new(req.target_host_utilization, true))
+      vmhds[:vm_provisioning_count] = 1 # pretend one VM is currently being provisioned on this host
+      expect(Al::Allocation.new(vmhds, req, 0).score).to be > 0 # only the sign is asserted, not the exact penalty value
+    end
+
     it "respects location preferences" do
       expect(Al::VmHostAllocation).to receive(:new).twice.and_return(TestResourceAllocation.new(0, true))
       expect(Al::StorageAllocation).to receive(:new).and_return(TestResourceAllocation.new(0, true))