AACTools · OwenMcGirr · Apr 11, 2026 · Apr 11, 2026
diff --git a/src/__tests__/azure-mstts-namespace.test.ts b/src/__tests__/azure-mstts-namespace.test.ts
@@ -149,5 +149,40 @@ describe("Azure MSTTS Namespace Handling", () => {
       expect(result).toMatch(/<voice[^>]*>\s*<prosody[^>]*>/);
       expect(result).toMatch(/<\/prosody>\s*<\/voice>/);
     });
+
+    it("should produce a single <prosody> element (not double-nested) when options are provided", async () => {
+      // Regression test for: https://github.com/willwade/js-tts-wrapper/issues/40
+      // this.properties defaults (rate="medium", pitch="medium", volume=100) were always
+      // truthy, so one <prosody> was added for them and the options then added a second one.
+      const plainSSML = `<speak>Hello world</speak>`;
+      const options = { rate: "fast", pitch: "high", volume: 80 };
+
+      const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options);
+
+      const prosodyMatches = result.match(/<prosody/g);
+      expect(prosodyMatches?.length).toBe(1);
+    });
+
+    it("should not emit <prosody> when all values are at Azure defaults", async () => {
+      // No prosody element needed when everything is at the implicit default
+      const plainSSML = `<speak>Hello world</speak>`;
+      const options = { rate: "medium", pitch: "medium", volume: 100 };
+
+      const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options);
+
+      expect(result).not.toContain("<prosody");
+    });
+
+    it("should normalise 0-1 volume fraction to 0-100 percentage", async () => {
+      // Regression test for: https://github.com/willwade/js-tts-wrapper/issues/40
+      // Callers commonly pass volume as a 0-1 float; 0.8 should become volume="80%", not "0.8%".
+      const plainSSML = `<speak>Hello world</speak>`;
+      const options = { volume: 0.8 };
+
+      const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options);
+
+      expect(result).toContain('volume="80%"');
+      expect(result).not.toContain('volume="0.8%"');
+    });
   });
 });
diff --git a/src/engines/azure.ts b/src/engines/azure.ts
@@ -616,38 +616,42 @@ export class AzureTTSClient extends AbstractTTSClient {
       }
     }
 
-    // Add prosody if properties are set
-    if (this.properties.rate || this.properties.pitch || this.properties.volume) {
-      // Extract content between voice tags or speak tags
-      let content = "";
-      if (ssml.includes("<voice")) {
-        const match = ssml.match(/<voice[^>]*>(.*?)<\/voice>/s);
-        if (match) {
-          content = match[1];
-          const prosodyContent = this.constructProsodyTag(content);
-          ssml = ssml.replace(content, prosodyContent);
-        }
-      } else {
-        const match = ssml.match(/<speak[^>]*>(.*?)<\/speak>/s);
-        if (match) {
-          content = match[1];
-          const prosodyContent = this.constructProsodyTag(content);
-          ssml = ssml.replace(content, prosodyContent);
-        }
+    // Build prosody attributes by merging this.properties defaults with per-call options.
+    // Options take precedence. We only emit a <prosody> element when at least one
+    // attribute differs from Azure's implicit defaults (medium/medium/100%), to avoid
+    // wrapping content in a no-op element.
+    {
+      const DEFAULT_RATE = "medium";
+      const DEFAULT_PITCH = "medium";
+      const DEFAULT_VOLUME = 100;
+
+      const rate = options?.rate ?? (this.properties.rate as string | undefined);
+      const pitch = options?.pitch ?? (this.properties.pitch as string | undefined);
+      // volume: SpeakOptions types volume as 0-100. Guard against callers who pass a
+      // 0-1 fraction by normalising: any value in (0, 1] is treated as a fraction and
+      // scaled to 0-100. Caveat: a literal 1 therefore becomes 100, not 1%; 0 is kept.
+      let rawVolume: number | undefined =
+        options?.volume !== undefined
+          ? options.volume
+          : (this.properties.volume as number | undefined);
+      if (rawVolume !== undefined && rawVolume > 0 && rawVolume <= 1) {
+        rawVolume = Math.round(rawVolume * 100);
       }
-    }
+      const volume = rawVolume !== undefined ? rawVolume : DEFAULT_VOLUME;
+
+      const hasNonDefaultProsody =
+        (rate !== undefined && rate !== DEFAULT_RATE) ||
+        (pitch !== undefined && pitch !== DEFAULT_PITCH) ||
+        volume !== DEFAULT_VOLUME;
+
+      if (hasNonDefaultProsody) {
+        const attrs: string[] = [];
+        if (rate && rate !== DEFAULT_RATE) attrs.push(`rate="${rate}"`);
+        if (pitch && pitch !== DEFAULT_PITCH) attrs.push(`pitch="${pitch}"`);
+        if (volume !== DEFAULT_VOLUME) attrs.push(`volume="${volume}%"`);
 
-    // Also add prosody from options if provided
-    if (options?.rate || options?.pitch || options?.volume !== undefined) {
-      // Create prosody attributes
-      const attrs: string[] = [];
-      if (options.rate) attrs.push(`rate="${options.rate}"`);
-      if (options.pitch) attrs.push(`pitch="${options.pitch}"`);
-      if (options.volume !== undefined) attrs.push(`volume="${options.volume}%"`);
-
-      if (attrs.length > 0) {
-        // Extract content from inside <voice> if present, otherwise from <speak>.
-        // Prosody must be nested inside <voice>, not as a direct child of <speak>.
+        // <prosody> must be nested inside <voice>, not as a direct child of <speak>.
+        // Azure rejects: Node [speak] should not contain node [prosody] with type [Others].
         if (ssml.includes("<voice")) {
           const match = ssml.match(/<voice[^>]*>(.*?)<\/voice>/s);
           if (match) {