1//! A JavaScript string value. JS strings are sequences of UTF-16 code units2//! with no validity requirement, so a value can contain unpaired surrogate3//! halves that Rust's `String` cannot represent. `JsString` keeps the common4//! valid case as UTF-8 and falls back to code units only when the value is5//! ill-formed, so the compiler computes on true program values instead of6//! replacement characters or escape hatches.7//!8//! Wire format: the babel bridge transports lone surrogates as9//! `__SURROGATE_XXXX__` markers (see `sanitizeJsonSurrogates` in bridge.ts),10//! because serde_json can neither parse nor emit a lone `\uXXXX` escape.11//! Serde for `JsString` decodes and re-emits that marker form, which keeps the12//! JS side of the bridge unchanged.1314use std::fmt;1516use serde::Deserialize;17use serde::Serialize;1819/// Invariant: `Repr::Utf8` holds every well-formed value and `Repr::Wtf16`20/// only ill-formed ones (at least one unpaired surrogate). The derived21/// `PartialEq`/`Hash` are only sound under this invariant: a well-formed22/// value smuggled into `Wtf16` would compare unequal to its `Utf8` twin. The23/// representation is private so the invariant holds by construction; match on24/// [`JsString::as_ref`] to branch on well-formedness.25#[derive(Debug, Clone, PartialEq, Eq, Hash)]26pub struct JsString(Repr);2728#[derive(Debug, Clone, PartialEq, Eq, Hash)]29enum Repr {30 /// A well-formed string (no unpaired surrogates), stored as UTF-8.31 Utf8(String),32 /// An ill-formed string, stored as UTF-16 code units.33 Wtf16(Vec<u16>),34}3536/// Borrowed view of a [`JsString`] for callers that need to branch on37/// well-formedness.38#[derive(Debug, Clone, Copy, PartialEq, Eq)]39pub enum JsStringRef<'a> {40 Utf8(&'a str),41 Wtf16(&'a [u16]),42}4344impl JsString {45 /// Build from UTF-16 code units, normalizing to UTF-8 when well-formed.46 pub fn from_code_units(units: Vec<u16>) -> Self {47 match String::from_utf16(&units) {48 Ok(s) => JsString(Repr::Utf8(s)),49 Err(_) => JsString(Repr::Wtf16(units)),50 }51 }5253 pub fn as_ref(&self) -> JsStringRef<'_> {54 match &self.0 {55 Repr::Utf8(s) => JsStringRef::Utf8(s),56 Repr::Wtf16(units) => JsStringRef::Wtf16(units),57 }58 }5960 /// The UTF-8 view, when the value is well-formed.61 pub fn as_str(&self) -> Option<&str> {62 match &self.0 {63 Repr::Utf8(s) => Some(s),64 Repr::Wtf16(_) => None,65 }66 }6768 pub fn code_units(&self) -> Vec<u16> {69 match &self.0 {70 Repr::Utf8(s) => s.encode_utf16().collect(),71 Repr::Wtf16(units) => units.clone(),72 }73 }7475 /// Length in UTF-16 code units (JS `String.prototype.length`).76 pub fn len_utf16(&self) -> usize {77 match &self.0 {78 Repr::Utf8(s) => s.encode_utf16().count(),79 Repr::Wtf16(units) => units.len(),80 }81 }8283 /// The value with unpaired surrogates replaced by U+FFFD, for consumers84 /// whose string type cannot represent ill-formed values.85 pub fn to_string_lossy(&self) -> String {86 match &self.0 {87 Repr::Utf8(s) => s.clone(),88 Repr::Wtf16(units) => String::from_utf16_lossy(units),89 }90 }9192 /// Decode the bridge wire form: a UTF-8 string in which lone surrogates93 /// appear as `__SURROGATE_XXXX__` markers (uppercase hex, mirroring what94 /// `sanitizeJsonSurrogates` emits and `restoreJsonSurrogates` accepts).95 ///96 /// All scanning is byte-wise: a marker is 18 ASCII bytes, so byte-slice97 /// comparisons cannot land on a UTF-8 char boundary the way `str` range98 /// indexing can when multibyte text follows the prefix.99 pub fn from_marker_string(s: &str) -> Self {100 const PREFIX: &[u8] = b"__SURROGATE_";101 const MARKER_LEN: usize = 18;102 if !s.contains("__SURROGATE_") {103 return JsString(Repr::Utf8(s.to_string()));104 }105 let bytes = s.as_bytes();106 let mut units: Vec<u16> = Vec::with_capacity(s.len());107 let mut pos = 0;108 let mut segment_start = 0;109 while let Some(found) = s[pos..].find("__SURROGATE_") {110 let idx = pos + found;111 let tail = &bytes[idx..];112 let well_formed = tail.len() >= MARKER_LEN113 && &tail[MARKER_LEN - 2..MARKER_LEN] == b"__"114 && tail[PREFIX.len()..PREFIX.len() + 4]115 .iter()116 .all(|b| b.is_ascii_hexdigit() && !b.is_ascii_lowercase());117 if well_formed {118 let hex = std::str::from_utf8(&tail[PREFIX.len()..PREFIX.len() + 4])119 .expect("ascii hex is valid utf8");120 let unit = u16::from_str_radix(hex, 16).expect("validated hex digits");121 units.extend(s[segment_start..idx].encode_utf16());122 units.push(unit);123 pos = idx + MARKER_LEN;124 segment_start = pos;125 } else {126 // Not a well-formed marker: keep the literal text and continue127 // scanning after the prefix.128 pos = idx + PREFIX.len();129 }130 }131 units.extend(s[segment_start..].encode_utf16());132 JsString::from_code_units(units)133 }134135 /// Encode to the bridge wire form (markers for unpaired surrogates).136 pub fn to_marker_string(&self) -> String {137 match &self.0 {138 Repr::Utf8(s) => s.clone(),139 Repr::Wtf16(units) => {140 let mut out = String::with_capacity(units.len() * 2);141 let mut iter = units.iter().copied().peekable();142 while let Some(unit) = iter.next() {143 match unit {144 0xD800..=0xDBFF => {145 if let Some(&next) = iter.peek() {146 if (0xDC00..=0xDFFF).contains(&next) {147 iter.next();148 let cp = 0x10000149 + ((unit as u32 - 0xD800) << 10)150 + (next as u32 - 0xDC00);151 out.push(char::from_u32(cp).expect("valid supplementary"));152 continue;153 }154 }155 out.push_str(&format!("__SURROGATE_{unit:04X}__"));156 }157 0xDC00..=0xDFFF => {158 out.push_str(&format!("__SURROGATE_{unit:04X}__"));159 }160 _ => {161 out.push(162 char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),163 );164 }165 }166 }167 out168 }169 }170 }171172 /// Render as JS-source-style escaped text, matching the form TS's debug173 /// printer produces via JSON.stringify: unpaired surrogates print as174 /// lowercase `\udXXX` escapes inside the otherwise UTF-8 text.175 pub fn to_escaped_string(&self) -> String {176 match &self.0 {177 Repr::Utf8(s) => s.clone(),178 Repr::Wtf16(units) => {179 let mut out = String::with_capacity(units.len() * 2);180 let mut iter = units.iter().copied().peekable();181 while let Some(unit) = iter.next() {182 match unit {183 0xD800..=0xDBFF => {184 if let Some(&next) = iter.peek() {185 if (0xDC00..=0xDFFF).contains(&next) {186 iter.next();187 let cp = 0x10000188 + ((unit as u32 - 0xD800) << 10)189 + (next as u32 - 0xDC00);190 out.push(char::from_u32(cp).expect("valid supplementary"));191 continue;192 }193 }194 out.push_str(&format!("\\u{unit:04x}"));195 }196 0xDC00..=0xDFFF => {197 out.push_str(&format!("\\u{unit:04x}"));198 }199 _ => {200 out.push(201 char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),202 );203 }204 }205 }206 out207 }208 }209 }210}211212impl From<String> for JsString {213 fn from(s: String) -> Self {214 // A Rust String is valid UTF-8 and so cannot contain an unpaired215 // surrogate; constructing Utf8 directly preserves the invariant.216 JsString(Repr::Utf8(s))217 }218}219220impl From<&str> for JsString {221 fn from(s: &str) -> Self {222 JsString(Repr::Utf8(s.to_string()))223 }224}225226impl PartialEq<str> for JsString {227 fn eq(&self, other: &str) -> bool {228 self.as_str() == Some(other)229 }230}231232impl PartialEq<&str> for JsString {233 fn eq(&self, other: &&str) -> bool {234 self.as_str() == Some(*other)235 }236}237238impl fmt::Display for JsString {239 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {240 f.write_str(&self.to_escaped_string())241 }242}243244impl Serialize for JsString {245 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {246 serializer.serialize_str(&self.to_marker_string())247 }248}249250impl<'de> Deserialize<'de> for JsString {251 fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {252 let s = String::deserialize(deserializer)?;253 Ok(JsString::from_marker_string(&s))254 }255}256257#[cfg(test)]258mod tests {259 use super::JsString;260 use super::JsStringRef;261262 #[test]263 fn as_ref_views_match_well_formedness() {264 assert!(matches!(265 JsString::from("plain").as_ref(),266 JsStringRef::Utf8("plain")267 ));268 assert!(matches!(269 JsString::from_code_units(vec![0xD83E]).as_ref(),270 JsStringRef::Wtf16(&[0xD83E])271 ));272 // Well-formed code units normalize to the Utf8 representation, so273 // equal logical strings are equal values regardless of how they274 // were constructed.275 assert_eq!(276 JsString::from_code_units("plain".encode_utf16().collect()),277 JsString::from("plain")278 );279 }280281 #[test]282 fn marker_round_trip_preserves_lone_surrogates() {283 let js = JsString::from_marker_string("__SURROGATE_D83E__");284 assert_eq!(js.code_units(), vec![0xD83E]);285 assert_eq!(js.to_marker_string(), "__SURROGATE_D83E__");286 assert_eq!(js.to_escaped_string(), "\\ud83e");287 }288289 #[test]290 fn paired_halves_render_as_the_supplementary_character() {291 let js = JsString::from_code_units(vec![0xD83E, 0xDD21]);292 assert_eq!(js.as_str(), Some("\u{1F921}"));293 }294295 #[test]296 fn plain_strings_stay_utf8_and_compare_with_str() {297 let js = JsString::from("use memo");298 assert!(js == "use memo");299 assert_eq!(js.to_marker_string(), "use memo");300 }301302 #[test]303 fn malformed_marker_text_is_kept_literally() {304 let js = JsString::from_marker_string("__SURROGATE_XYZ__");305 assert_eq!(js.as_str(), Some("__SURROGATE_XYZ__"));306 }307308 #[test]309 fn multibyte_text_after_marker_prefix_does_not_panic() {310 let input = "__SURROGATE_\u{20AC}\u{20AC}";311 let js = JsString::from_marker_string(input);312 assert_eq!(js.as_str(), Some(input));313314 let truncated = "__SURROGATE_D8";315 assert_eq!(316 JsString::from_marker_string(truncated).as_str(),317 Some(truncated)318 );319320 let mixed = "a\u{20AC}__SURROGATE_D83E__b\u{20AC}";321 let js = JsString::from_marker_string(mixed);322 let mut expected: Vec<u16> = "a\u{20AC}".encode_utf16().collect();323 expected.push(0xD83E);324 expected.extend("b\u{20AC}".encode_utf16());325 assert_eq!(js.code_units(), expected);326 }327328 #[test]329 fn lowercase_hex_markers_are_not_decoded() {330 // The bridge emits uppercase hex only; lowercase marker-shaped text is331 // user text and must survive verbatim.332 let input = "__SURROGATE_d83e__";333 assert_eq!(JsString::from_marker_string(input).as_str(), Some(input));334 }335}
Code quality findings 20
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
/// Borrowed view of a [`JsString`] for callers that need to branch on
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
Wtf16(&'a [u16]),
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
while let Some(found) = s[pos..].find("__SURROGATE_") {
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
let tail = &bytes[idx..];
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
&& &tail[MARKER_LEN - 2..MARKER_LEN] == b"__"
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
&& tail[PREFIX.len()..PREFIX.len() + 4]
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
let hex = std::str::from_utf8(&tail[PREFIX.len()..PREFIX.len() + 4])
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
.expect("ascii hex is valid utf8");
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
let unit = u16::from_str_radix(hex, 16).expect("validated hex digits");
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
units.extend(s[segment_start..idx].encode_utf16());
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning
correctness
unchecked-indexing
units.extend(s[segment_start..].encode_utf16());
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
out.push(char::from_u32(cp).expect("valid supplementary"));
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
out.push(char::from_u32(cp).expect("valid supplementary"));
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning
correctness
expect-usage
char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
Performance Info: Calling .to_string() (especially on &str) allocates a new String. If done repeatedly in loops, consider alternatives like working with &str or using crates like `itoa`/`ryu` for number-to-string conversion.
info
performance
to-string-in-loop
return JsString(Repr::Utf8(s.to_string()));
Performance Info: Frequent cloning, especially of Strings, Vecs, or other heap-allocated types inside loops, can be expensive. Consider using references/borrowing where possible.
info
performance
clone-in-loop
Repr::Utf8(s) => s.clone(),
Performance Info: Calling .push() repeatedly inside a loop without prior capacity reservation can lead to multiple reallocations. Consider using `Vec::with_capacity(n)` or `vec.reserve(n)` if the approximate number of elements is known.
info
performance
push-without-reserve
out.push(char::from_u32(cp).expect("valid supplementary"));
Performance Info: Frequent cloning, especially of Strings, Vecs, or other heap-allocated types inside loops, can be expensive. Consider using references/borrowing where possible.
info
performance
clone-in-loop
Repr::Utf8(s) => s.clone(),
Performance Info: Calling .push() repeatedly inside a loop without prior capacity reservation can lead to multiple reallocations. Consider using `Vec::with_capacity(n)` or `vec.reserve(n)` if the approximate number of elements is known.
info
performance
push-without-reserve
out.push(char::from_u32(cp).expect("valid supplementary"));