diff --git a/Sources/tart/OCI/Manifest.swift b/Sources/tart/OCI/Manifest.swift index 955a016e..53769b91 100644 --- a/Sources/tart/OCI/Manifest.swift +++ b/Sources/tart/OCI/Manifest.swift @@ -78,7 +78,7 @@ struct OCIManifestConfig: Codable, Equatable { var digest: String } -struct OCIManifestLayer: Codable, Equatable { +struct OCIManifestLayer: Codable, Equatable, Hashable { var mediaType: String var size: Int var digest: String @@ -113,6 +113,14 @@ struct OCIManifestLayer: Codable, Equatable { func uncompressedContentDigest() -> String? { annotations?[uncompressedContentDigestAnnotation] } + + static func == (lhs: Self, rhs: Self) -> Bool { + return lhs.digest == rhs.digest + } + + func hash(into hasher: inout Hasher) { + hasher.combine(digest) + } } struct Descriptor: Equatable { diff --git a/Sources/tart/VMDirectory+OCI.swift b/Sources/tart/VMDirectory+OCI.swift index c09e0986..07c59225 100644 --- a/Sources/tart/VMDirectory+OCI.swift +++ b/Sources/tart/VMDirectory+OCI.swift @@ -79,6 +79,9 @@ extension VMDirectory { nvram.write(data) } try nvram.close() + + // Serialize the VM's manifest to enable better de-duplication on subsequent "tart pull" invocations + try manifest.toJSON().write(to: manifestURL) } func pushToRegistry(registry: Registry, references: [String], chunkSizeMb: Int, diskFormat: String) async throws -> RemoteName { diff --git a/Sources/tart/VMDirectory.swift b/Sources/tart/VMDirectory.swift index 650d8d2a..ccd7e791 100644 --- a/Sources/tart/VMDirectory.swift +++ b/Sources/tart/VMDirectory.swift @@ -23,6 +23,9 @@ struct VMDirectory: Prunable { var stateURL: URL { baseURL.appendingPathComponent("state.vzvmsave") } + var manifestURL: URL { + baseURL.appendingPathComponent("manifest.json") + } var explicitlyPulledMark: URL { baseURL.appendingPathComponent(".explicitly-pulled") diff --git a/Sources/tart/VMStorageOCI.swift b/Sources/tart/VMStorageOCI.swift index 0212698d..6a920f78 100644 --- a/Sources/tart/VMStorageOCI.swift +++ b/Sources/tart/VMStorageOCI.swift @@ 
-197,14 +197,8 @@ class VMStorageOCI: PrunableStorage { try await withTaskCancellationHandler(operation: { try await retry(maxAttempts: 5, backoff: .exponentialWithFullJitter(baseDelay: .seconds(5), maxDelay: .seconds(60))) { - var localLayerCache: LocalLayerCache? = nil - - if name.reference.type == .Tag, - let vmDir = try? open(name), - let digest = try? digest(name), - let (manifest, _) = try? await registry.pullManifest(reference: digest) { - localLayerCache = try LocalLayerCache(vmDir.diskURL, manifest) - } + // Choose the best base image, i.e. the one with the highest deduplication ratio + let localLayerCache = try await chooseLocalLayerCache(name, manifest, registry) try await tmpVMDir.pullFromRegistry(registry: registry, manifest: manifest, concurrency: concurrency, localLayerCache: localLayerCache) } recoverFromFailure: { error in @@ -249,6 +243,57 @@ class VMStorageOCI: PrunableStorage { try gc() } + + func chooseLocalLayerCache(_ name: RemoteName, _ manifest: OCIManifest, _ registry: Registry) async throws -> LocalLayerCache? { + // Establish a closure that will calculate how many bytes + // we'll de-duplicate if we re-use the given manifest + let target = Swift.Set(manifest.layers) + + let calculateDeduplicatedBytes = { (manifest: OCIManifest) -> Int in + target.intersection(manifest.layers).map({ $0.size }).reduce(0, +) + } + + // Load OCI VM images and their manifests (if present) + var candidates: [(name: String, vmDir: VMDirectory, manifest: OCIManifest, deduplicatedBytes: Int)] = [] + + for (name, vmDir, isSymlink) in try list() { + if isSymlink { + continue + } + + guard let manifestJSON = try? Data(contentsOf: vmDir.manifestURL) else { + continue + } + + guard let manifest = try? 
OCIManifest(fromJSON: manifestJSON) else { + continue + } + + candidates.append((name, vmDir, manifest, calculateDeduplicatedBytes(manifest))) + } + + // Previously we did not store the OCI VM image manifests, but still fetched the VM image manifest if + // what the user was trying to pull was a tagged image, and we already had that image in the OCI VM cache + // + // Keep supporting this behavior for backwards compatibility, but only communicate + // with the registry if we haven't already retrieved the manifest for that OCI VM image. + if name.reference.type == .Tag, + let vmDir = try? open(name), + let digest = try? digest(name), + try !candidates.contains(where: {try $0.manifest.digest() == digest}), + let (manifest, _) = try? await registry.pullManifest(reference: digest) { + candidates.append((name.description, vmDir, manifest, calculateDeduplicatedBytes(manifest))) + } + + // Now, find the best match based on how many bytes we'll de-duplicate + let choosen = candidates.max { left, right in + return left.deduplicatedBytes < right.deduplicatedBytes + } + + return try choosen.flatMap({ choosen in + try LocalLayerCache(choosen.vmDir.diskURL, choosen.manifest) + }) + } } extension URL {