wit_component/metadata.rs
1//! Definition for encoding of custom sections within core wasm modules of
2//! component-model related data.
3//!
4//! When creating a component from a source language the high-level process for
5//! doing this is that code will be generated into the source language by
6//! `wit-bindgen` or a similar tool which will be compiled down to core wasm.
7//! The core wasm file is then fed into `wit-component` and a component is
8//! created. This means that the componentization process is decoupled from the
9//! binding generation process and intentionally affords for linking together
10//! libraries into the main core wasm module that import different interfaces.
11//!
12//! The purpose of this module is to define an intermediate format to reside in
13//! a custom section in the core wasm output. This intermediate format is
14//! carried through the wasm linker through a custom section whose name starts
15//! with `component-type`. This custom section is created
16//! per-language-binding-generation and consumed by slurping up all the
17//! sections during the component creation process.
18//!
19//! Currently the encoding of this custom section is itself a component. The
20//! component has a single export which is a component type which represents the
21//! `world` that was bound during bindings generation. This single export is
22//! used to decode back into a `Resolve` with a WIT representation.
23//!
24//! Currently the component additionally has a custom section named
25//! `wit-component-encoding` (see `CUSTOM_SECTION_NAME`). This section is
26//! currently defined as 2 bytes:
27//!
28//! * The first byte is `CURRENT_VERSION` to help protect against future and
29//! past changes.
30//! * The second byte indicates the string encoding used for imports/exports as
31//! part of the bindings process. The mapping is defined by
32//! `encode_string_encoding`.
33//!
//! This means that the top-level `encode` function takes a `Resolve`, a
//! `WorldId`, and a `StringEncoding`. Note that the top-level `decode` function
//! is slightly different because it's taking all custom sections in a core
//! wasm binary, possibly from multiple invocations of bindgen, and unioning
//! them all together. This means that the output is a `Bindgen` which
//! represents the union of all previous bindings.
40//!
//! The dual of `encode` is the `decode_custom_section` function which decodes
//! the three arguments originally passed to `encode`.
43
44use crate::{DecodedWasm, StringEncoding};
45use anyhow::{bail, Context, Result};
46use indexmap::{IndexMap, IndexSet};
47use std::borrow::Cow;
48use wasm_encoder::{
49 ComponentBuilder, ComponentExportKind, ComponentType, ComponentTypeRef, CustomSection,
50};
51use wasm_metadata::Producers;
52use wasmparser::{BinaryReader, Encoding, Parser, Payload};
53use wit_parser::{Package, PackageName, Resolve, World, WorldId, WorldItem, WorldKey};
54
/// Format version written by `encode` as the first byte of the
/// `wit-component-encoding` custom section and required by
/// `decode_custom_section` when reading it back.
const CURRENT_VERSION: u8 = 0x04;
/// Name of the custom section, embedded inside the encoded component, that
/// carries the format version byte followed by the string-encoding byte.
const CUSTOM_SECTION_NAME: &str = "wit-component-encoding";
57
58/// The result of decoding binding information from a WebAssembly binary.
59///
60/// This structure is returned by [`decode`] and represents the interface of a
61/// WebAssembly binary.
62pub struct Bindgen {
63 /// Interface and type information for this binary.
64 pub resolve: Resolve,
65 /// The world that was bound.
66 pub world: WorldId,
67 /// Metadata about this specific module that was bound.
68 pub metadata: ModuleMetadata,
69 /// Producer information about tools used to produce this specific module.
70 pub producers: Option<Producers>,
71}
72
73impl Default for Bindgen {
74 fn default() -> Bindgen {
75 let mut resolve = Resolve::default();
76 let package = resolve.packages.alloc(Package {
77 name: PackageName {
78 namespace: "root".to_string(),
79 name: "root".to_string(),
80 version: None,
81 },
82 docs: Default::default(),
83 interfaces: Default::default(),
84 worlds: Default::default(),
85 });
86 let world = resolve.worlds.alloc(World {
87 name: "root".to_string(),
88 docs: Default::default(),
89 imports: Default::default(),
90 exports: Default::default(),
91 includes: Default::default(),
92 include_names: Default::default(),
93 package: Some(package),
94 stability: Default::default(),
95 });
96 resolve.packages[package]
97 .worlds
98 .insert("root".to_string(), world);
99 Bindgen {
100 resolve,
101 world,
102 metadata: ModuleMetadata::default(),
103 producers: None,
104 }
105 }
106}
107
/// Module-level metadata that's specific to one core WebAssembly module. This
/// is extracted with a [`Bindgen`].
///
/// Both maps are filled in by [`ModuleMetadata::new`], which records one
/// string encoding for every function imported or exported by the bound
/// world.
#[derive(Default)]
pub struct ModuleMetadata {
    /// Per-function options imported into the core wasm module, currently only
    /// related to string encoding.
    pub import_encodings: EncodingMap,

    /// Per-function options exported from the core wasm module, currently only
    /// related to string encoding.
    pub export_encodings: EncodingMap,
}
120
121/// Internal map that keeps track of encodings for various world imports and
122/// exports.
123///
124/// Stored in [`ModuleMetadata`].
125#[derive(Default)]
126pub struct EncodingMap {
127 /// A map of an "identifying string" for world items to what string
128 /// encoding the import or export is using.
129 ///
130 /// The keys of this map are created by `EncodingMap::key` and are
131 /// specifically chosen to be able to be looked up during both insertion and
132 /// fetching. Note that in particular this map does not use `*Id` types such
133 /// as `InterfaceId` from `wit_parser`. This is due to the fact that during
134 /// world merging new interfaces are created for named imports (e.g. `import
135 /// x: interface { ... }`) as inline interfaces are copied from one world to
136 /// another. Additionally during world merging different interfaces at the
137 /// same version may be deduplicated.
138 ///
139 /// For these reasons a string-based key is chosen to avoid juggling IDs
140 /// through the world merging process. Additionally versions are chopped off
141 /// for now to help with a problem such as:
142 ///
143 /// * The main module imports a:b/c@0.1.0
144 /// * An adapter imports a:b/c@0.1.1
145 /// * The final world uses a:b/c@0.1.1, but the main module has no
146 /// encoding listed for that exact item.
147 ///
148 /// By chopping off versions this is able to get everything registered
149 /// correctly even in the fact of merging interfaces and worlds.
150 encodings: IndexMap<String, StringEncoding>,
151}
152
153impl EncodingMap {
154 fn insert_all(
155 &mut self,
156 resolve: &Resolve,
157 set: &IndexMap<WorldKey, WorldItem>,
158 encoding: StringEncoding,
159 ) {
160 for (name, item) in set {
161 match item {
162 WorldItem::Function(func) => {
163 let key = self.key(resolve, name, &func.name);
164 self.encodings.insert(key, encoding);
165 }
166 WorldItem::Interface { id, .. } => {
167 for (func, _) in resolve.interfaces[*id].functions.iter() {
168 let key = self.key(resolve, name, func);
169 self.encodings.insert(key, encoding);
170 }
171 }
172 WorldItem::Type(_) => {}
173 }
174 }
175 }
176
177 /// Looks up the encoding of the function `func` which is scoped under `key`
178 /// in the world in question.
179 pub fn get(&self, resolve: &Resolve, key: &WorldKey, func: &str) -> Option<StringEncoding> {
180 let key = self.key(resolve, key, func);
181 self.encodings.get(&key).copied()
182 }
183
184 fn key(&self, resolve: &Resolve, key: &WorldKey, func: &str) -> String {
185 format!(
186 "{}/{func}",
187 match key {
188 WorldKey::Name(name) => name.to_string(),
189 WorldKey::Interface(id) => {
190 let iface = &resolve.interfaces[*id];
191 let pkg = &resolve.packages[iface.package.unwrap()];
192 format!(
193 "{}:{}/{}",
194 pkg.name.namespace,
195 pkg.name.name,
196 iface.name.as_ref().unwrap()
197 )
198 }
199 }
200 )
201 }
202
203 fn merge(&mut self, other: EncodingMap) -> Result<()> {
204 for (key, encoding) in other.encodings {
205 if let Some(prev) = self.encodings.insert(key.clone(), encoding) {
206 if prev != encoding {
207 bail!("conflicting string encodings specified for `{key}`");
208 }
209 }
210 }
211 Ok(())
212 }
213}
214
215/// This function will parse the core `wasm` binary given as input and return a
216/// [`Bindgen`] which extracts the custom sections describing component-level
217/// types from within the binary itself.
218///
219/// This is used to parse the output of `wit-bindgen`-generated modules and is
220/// one of the earliest phases in transitioning such a module to a component.
221/// The extraction here provides the metadata necessary to continue the process
222/// later on.
223///
224/// This will return an error if `wasm` is not a valid WebAssembly module.
225///
226/// If a `component-type` custom section was found then a new binary is
227/// optionally returned with the custom sections stripped out. If no
228/// `component-type` custom sections are found then `None` is returned.
229pub fn decode(wasm: &[u8]) -> Result<(Option<Vec<u8>>, Bindgen)> {
230 let mut ret = Bindgen::default();
231 let mut new_module = wasm_encoder::Module::new();
232
233 let mut found_custom = false;
234 for payload in wasmparser::Parser::new(0).parse_all(wasm) {
235 let payload = payload.context("decoding item in module")?;
236 match payload {
237 wasmparser::Payload::CustomSection(cs) if cs.name().starts_with("component-type") => {
238 let data = Bindgen::decode_custom_section(cs.data())
239 .with_context(|| format!("decoding custom section {}", cs.name()))?;
240 ret.merge(data)
241 .with_context(|| format!("updating metadata for section {}", cs.name()))?;
242 found_custom = true;
243 }
244 wasmparser::Payload::Version { encoding, .. } if encoding != Encoding::Module => {
245 bail!("decoding a component is not supported")
246 }
247 _ => {
248 if let Some((id, range)) = payload.as_section() {
249 new_module.section(&wasm_encoder::RawSection {
250 id,
251 data: &wasm[range],
252 });
253 }
254 }
255 }
256 }
257
258 if found_custom {
259 Ok((Some(new_module.finish()), ret))
260 } else {
261 Ok((None, ret))
262 }
263}
264
265/// Creates a `component-type*` custom section to be decoded by `decode` above.
266///
267/// This is primarily created by wit-bindgen-based guest generators to embed
268/// into the final core wasm binary. The core wasm binary is later fed
269/// through `wit-component` to produce the actual component where this returned
270/// section will be decoded.
271pub fn encode(
272 resolve: &Resolve,
273 world: WorldId,
274 string_encoding: StringEncoding,
275 extra_producers: Option<&Producers>,
276) -> Result<Vec<u8>> {
277 let ty = crate::encoding::encode_world(resolve, world)?;
278
279 let world = &resolve.worlds[world];
280 let mut outer_ty = ComponentType::new();
281 outer_ty.ty().component(&ty);
282 outer_ty.export(
283 &resolve.id_of_name(world.package.unwrap(), &world.name),
284 ComponentTypeRef::Component(0),
285 );
286
287 let mut builder = ComponentBuilder::default();
288
289 let string_encoding = encode_string_encoding(string_encoding);
290 builder.custom_section(&CustomSection {
291 name: CUSTOM_SECTION_NAME.into(),
292 data: Cow::Borrowed(&[CURRENT_VERSION, string_encoding]),
293 });
294
295 let ty = builder.type_component(&outer_ty);
296 builder.export(&world.name, ComponentExportKind::Type, ty, None);
297
298 let mut producers = crate::base_producers();
299 if let Some(p) = extra_producers {
300 producers.merge(&p);
301 }
302 builder.raw_custom_section(&producers.raw_custom_section());
303 Ok(builder.finish())
304}
305
306fn decode_custom_section(wasm: &[u8]) -> Result<(Resolve, WorldId, StringEncoding)> {
307 let (resolve, world) = wit_parser::decoding::decode_world(wasm)?;
308 let mut custom_section = None;
309
310 for payload in Parser::new(0).parse_all(wasm) {
311 match payload? {
312 Payload::CustomSection(s) if s.name() == CUSTOM_SECTION_NAME => {
313 custom_section = Some(s.data());
314 }
315 _ => {}
316 }
317 }
318 let string_encoding = match custom_section {
319 None => bail!("missing custom section of name `{CUSTOM_SECTION_NAME}`"),
320 Some([CURRENT_VERSION, byte]) => decode_string_encoding(*byte)?,
321 Some([]) => bail!("custom section `{CUSTOM_SECTION_NAME}` in unknown format"),
322 Some([version, ..]) => bail!(
323 "custom section `{CUSTOM_SECTION_NAME}` uses format {version} but only {CURRENT_VERSION} is supported"
324 ),
325 };
326 Ok((resolve, world, string_encoding))
327}
328
329fn encode_string_encoding(e: StringEncoding) -> u8 {
330 match e {
331 StringEncoding::UTF8 => 0x00,
332 StringEncoding::UTF16 => 0x01,
333 StringEncoding::CompactUTF16 => 0x02,
334 }
335}
336
337fn decode_string_encoding(byte: u8) -> Result<StringEncoding> {
338 match byte {
339 0x00 => Ok(StringEncoding::UTF8),
340 0x01 => Ok(StringEncoding::UTF16),
341 0x02 => Ok(StringEncoding::CompactUTF16),
342 byte => bail!("invalid string encoding {byte:#x}"),
343 }
344}
345
346impl Bindgen {
347 fn decode_custom_section(data: &[u8]) -> Result<Bindgen> {
348 let wasm;
349 let world;
350 let resolve;
351 let encoding;
352
353 let mut reader = BinaryReader::new(data, 0);
354 match reader.read_u8()? {
355 // Historical 0x03 format where the support here will be deleted in
356 // the future
357 0x03 => {
358 encoding = decode_string_encoding(reader.read_u8()?)?;
359 let world_name = reader.read_string()?;
360 wasm = &data[reader.original_position()..];
361
362 let (r, pkg) = match crate::decode(wasm)? {
363 DecodedWasm::WitPackage(resolve, pkgs) => (resolve, pkgs),
364 DecodedWasm::Component(..) => bail!("expected encoded wit package(s)"),
365 };
366 resolve = r;
367 world = resolve.select_world(pkg, Some(world_name.into()))?;
368 }
369
370 // Current format where `data` is a wasm component itself.
371 _ => {
372 wasm = data;
373 (resolve, world, encoding) = decode_custom_section(wasm)?;
374 }
375 }
376
377 Ok(Bindgen {
378 metadata: ModuleMetadata::new(&resolve, world, encoding),
379 producers: wasm_metadata::Producers::from_wasm(wasm)?,
380 resolve,
381 world,
382 })
383 }
384
385 /// Merges another `BindgenMetadata` into this one.
386 ///
387 /// This operation is intended to be akin to "merging worlds" when the
388 /// abstraction level for that is what we're working at here. For now the
389 /// merge operation only succeeds if the two metadata descriptions are
390 /// entirely disjoint.
391 ///
392 /// Note that at this time there's no support for changing string encodings
393 /// between metadata.
394 ///
395 /// This function returns the set of exports that the main world of
396 /// `other` added to the world in `self`.
397 pub fn merge(&mut self, other: Bindgen) -> Result<IndexSet<WorldKey>> {
398 let Bindgen {
399 resolve,
400 world,
401 metadata:
402 ModuleMetadata {
403 import_encodings,
404 export_encodings,
405 },
406 producers,
407 } = other;
408
409 let remap = self
410 .resolve
411 .merge(resolve)
412 .context("failed to merge WIT package sets together")?;
413 let world = remap.map_world(world, None)?;
414 let exports = self.resolve.worlds[world].exports.keys().cloned().collect();
415 self.resolve
416 .merge_worlds(world, self.world)
417 .context("failed to merge worlds from two documents")?;
418
419 self.metadata.import_encodings.merge(import_encodings)?;
420 self.metadata.export_encodings.merge(export_encodings)?;
421 if let Some(producers) = producers {
422 if let Some(mine) = &mut self.producers {
423 mine.merge(&producers);
424 } else {
425 self.producers = Some(producers);
426 }
427 }
428
429 Ok(exports)
430 }
431}
432
433impl ModuleMetadata {
434 /// Creates a new `ModuleMetadata` instance holding the given set of
435 /// interfaces which are expected to all use the `encoding` specified.
436 pub fn new(resolve: &Resolve, world: WorldId, encoding: StringEncoding) -> ModuleMetadata {
437 let mut ret = ModuleMetadata::default();
438
439 let world = &resolve.worlds[world];
440 ret.export_encodings
441 .insert_all(resolve, &world.exports, encoding);
442 ret.import_encodings
443 .insert_all(resolve, &world.imports, encoding);
444
445 ret
446 }
447}