feat: Add RLE and Shannon-Fano compression algorithms

Microindole · Microindole · commit 0190f6eaffed · 2025-10-14T10:50:48.000+08:00
diff --git a/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java b/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java
@@ -0,0 +1,87 @@
+package com.thealgorithms.compression;
+
+/**
+ * An implementation of the Run-Length Encoding (RLE) algorithm.
+ *
+ * <p>Run-Length Encoding is a simple form of lossless data compression in which
+ * runs of data (sequences in which the same data value occurs in many
+ * consecutive data elements) are stored as a single data value and count,
+ * rather than as the original run.
+ *
+ * <p>This implementation provides methods for both compressing and decompressing
+ * a string. For example:
+ * <ul>
+ * <li>Compressing "AAAABBBCCDAA" results in "4A3B2C1D2A".</li>
+ * <li>Decompressing "4A3B2C1D2A" results in "AAAABBBCCDAA".</li>
+ * </ul>
+ *
+ * <p>Time Complexity: O(n) for compression, where n is the length of the input string;
+ * decompression is linear in the length of the encoded input plus the decoded output.
+ *
+ * <p>References:
+ * <ul>
+ * <li><a href="https://en.wikipedia.org/wiki/Run-length_encoding">Wikipedia: Run-length encoding</a></li>
+ * </ul>
+ */
+public final class RunLengthEncoding {
+
+    /**
+     * Private constructor to prevent instantiation of this utility class.
+     */
+    private RunLengthEncoding() {
+    }
+
+    /**
+     * Compresses a string using the Run-Length Encoding algorithm.
+     * Each run is written as its decimal length followed by the character; digits in the input are not escaped, so such text does not round-trip.
+     * @param text The string to be compressed. A null value is treated like an empty string.
+     * @return The compressed string. Returns an empty string if the input is null or empty.
+     */
+    public static String compress(String text) {
+        if (text == null || text.isEmpty()) {
+            return "";
+        }
+
+        StringBuilder compressed = new StringBuilder();
+        int count = 1;
+
+        for (int i = 0; i < text.length(); i++) {
+            // A run ends at the last character or where the next character differs; emit it then
+            if (i == text.length() - 1 || text.charAt(i) != text.charAt(i + 1)) {
+                compressed.append(count);
+                compressed.append(text.charAt(i));
+                count = 1; // Reset count for the new character
+            } else {
+                count++;
+            }
+        }
+        return compressed.toString();
+    }
+
+    /**
+     * Decompresses a string that was compressed using the Run-Length Encoding algorithm.
+     * A character that is not preceded by a count is dropped, and trailing digits with no following character are ignored.
+     * @param compressedText The compressed string. A null value is treated like an empty string.
+     * @return The original, uncompressed string, or an empty string if the input is null or empty.
+     */
+    public static String decompress(String compressedText) {
+        if (compressedText == null || compressedText.isEmpty()) {
+            return "";
+        }
+
+        StringBuilder decompressed = new StringBuilder();
+        int count = 0;
+
+        for (char ch : compressedText.toCharArray()) {
+            if (Character.isDigit(ch)) {
+                // Accumulate multi-digit counts one decimal digit at a time (e.g., "12A")
+                count = count * 10 + (ch - '0');
+            } else {
+                // Append the character 'count' times; Math.max clamps a negative count (possible only after int overflow) to zero
+                decompressed.append(String.valueOf(ch).repeat(Math.max(0, count)));
+                count = 0; // Reset count for the next sequence
+            }
+        }
+        return decompressed.toString();
+    }
+}
diff --git a/src/main/java/com/thealgorithms/compression/ShannonFano.java b/src/main/java/com/thealgorithms/compression/ShannonFano.java
@@ -0,0 +1,162 @@
+package com.thealgorithms.compression;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * An implementation of the Shannon-Fano algorithm for generating prefix codes.
+ *
+ * <p>Shannon-Fano coding is an entropy encoding technique for lossless data
+ * compression. It assigns variable-length codes to symbols based on their
+ * frequencies of occurrence. It is a precursor to Huffman coding and works by
+ * recursively partitioning a sorted list of symbols into two sub-lists with
+ * nearly equal total frequencies.
+ *
+ * <p>The algorithm works as follows:
+ * <ol>
+ * <li>Count the frequency of each symbol in the input data.</li>
+ * <li>Sort the symbols in descending order of their frequencies.</li>
+ * <li>Recursively divide the list of symbols into two parts with sums of
+ * frequencies as close as possible to each other.</li>
+ * <li>Assign a '0' bit to the codes in the first part and a '1' bit to the codes
+ * in the second part.</li>
+ * <li>Repeat the process for each part until a part contains only one symbol.</li>
+ * </ol>
+ *
+ * <p>Time Complexity: O(n^2) in this implementation due to the partitioning logic,
+ * or O(n log n) if a more optimized partitioning strategy is used.
+ * Sorting takes O(n log n), where n is the number of unique symbols.
+ *
+ * <p>References:
+ * <ul>
+ * <li><a href="https://en.wikipedia.org/wiki/Shannon%E2%80%93Fano_coding">Wikipedia: Shannon&ndash;Fano coding</a></li>
+ * </ul>
+ */
+public final class ShannonFano {
+
+    /**
+     * Private constructor to prevent instantiation of this utility class.
+     */
+    private ShannonFano() {
+    }
+
+    /**
+     * A private static nested class to represent a symbol, its frequency, and its assigned code.
+     * Implements Comparable to allow sorting in descending order of frequency.
+     */
+    private static class Symbol implements Comparable<Symbol> {
+        final char character;
+        final int frequency;
+        String code = "";
+
+        Symbol(char character, int frequency) {
+            this.character = character;
+            this.frequency = frequency;
+        }
+
+        @Override
+        public int compareTo(Symbol other) {
+            return Integer.compare(other.frequency, this.frequency); // Reversed operands: higher frequency sorts first
+        }
+    }
+
+    /**
+     * Generates Shannon-Fano codes for the symbols in a given text.
+     *
+     * @param text The input string for which to generate codes. A null value is treated like an empty string.
+     * @return A map where keys are characters and values are their corresponding Shannon-Fano codes; empty if the text is null or empty.
+     */
+    public static Map<Character, String> generateCodes(String text) {
+        if (text == null || text.isEmpty()) {
+            return Collections.emptyMap();
+        }
+
+        Map<Character, Integer> frequencyMap = new HashMap<>();
+        for (char c : text.toCharArray()) {
+            frequencyMap.put(c, frequencyMap.getOrDefault(c, 0) + 1);
+        }
+
+        List<Symbol> symbols = new ArrayList<>();
+        for (Map.Entry<Character, Integer> entry : frequencyMap.entrySet()) {
+            symbols.add(new Symbol(entry.getKey(), entry.getValue()));
+        }
+
+        Collections.sort(symbols); // Stable sort: symbols with equal frequency keep the frequency map's iteration order
+
+        // Special case: a single unique symbol still receives the one-bit code "0"
+        if (symbols.size() == 1) {
+            symbols.getFirst().code = "0";
+        } else {
+            buildCodeTree(symbols, 0, symbols.size() - 1, "");
+        }
+
+        return symbols.stream().collect(Collectors.toMap(s -> s.character, s -> s.code));
+    }
+
+    /**
+     * Recursively builds the Shannon-Fano code tree by partitioning the list of symbols.
+     * Uses index-based approach to avoid sublist creation issues.
+     *
+     * @param symbols The sorted list of symbols to be processed.
+     * @param start   The start index of the current partition.
+     * @param end     The end index of the current partition (inclusive).
+     * @param prefix  The current prefix code being built for the symbols in this partition.
+     */
+    private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
+        if (start > end) {
+            return;
+        }
+
+        if (start == end) {
+            // Base case: a lone symbol takes the prefix as its code, or "0" if the prefix is still empty.
+            symbols.get(start).code = prefix.isEmpty() ? "0" : prefix;
+            return;
+        }
+
+        // Find the split that best balances the frequency sums of the two halves
+        int splitIndex = findSplitIndex(symbols, start, end);
+
+        // Recursively process left and right partitions with updated prefixes
+        buildCodeTree(symbols, start, splitIndex, prefix + "0");
+        buildCodeTree(symbols, splitIndex + 1, end, prefix + "1");
+    }
+
+    /**
+     * Finds the index that splits the range into two parts with the most balanced frequency sums.
+     * This method tries every possible split point and returns the index that minimizes the
+     * absolute difference between the two partition sums (the earliest such index on a tie).
+     *
+     * @param symbols The sorted list of symbols.
+     * @param start   The start index of the range.
+     * @param end     The end index of the range (inclusive).
+     * @return The index of the last element in the first partition.
+     */
+    private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
+        // Calculate total frequency for the entire range
+        long totalFrequency = 0;
+        for (int i = start; i <= end; i++) {
+            totalFrequency += symbols.get(i).frequency;
+        }
+
+        long leftSum = 0;
+        long minDifference = Long.MAX_VALUE;
+        int splitIndex = start;
+
+        // Try every possible split point; the strict '<' below keeps the earliest index when differences tie
+        for (int i = start; i < end; i++) { // i < end keeps the right partition non-empty
+            leftSum += symbols.get(i).frequency;
+            long rightSum = totalFrequency - leftSum;
+            long difference = Math.abs(leftSum - rightSum);
+
+            if (difference < minDifference) {
+                minDifference = difference;
+                splitIndex = i;
+            }
+        }
+        return splitIndex;
+    }
+}
diff --git a/src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java b/src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java
@@ -0,0 +1,81 @@
+package com.thealgorithms.compression;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+class RunLengthEncodingTest {
+
+    @Test
+    void testCompressionSimple() {
+        // Several runs of different lengths, including the same character appearing in two separate runs
+        String input = "AAAABBBCCDAA";
+        String expected = "4A3B2C1D2A";
+        assertEquals(expected, RunLengthEncoding.compress(input));
+    }
+
+    @Test
+    void testCompressionWithNoRuns() {
+        // No repeated neighbours: every character is emitted with a count of 1
+        String input = "ABCDE";
+        String expected = "1A1B1C1D1E";
+        assertEquals(expected, RunLengthEncoding.compress(input));
+    }
+
+    @Test
+    void testCompressionEdgeCases() {
+        // Test with an empty string
+        assertEquals("", RunLengthEncoding.compress(""));
+
+        // Test with a single character
+        assertEquals("1A", RunLengthEncoding.compress("A"));
+
+        // Test a run of ten, which needs a two-digit count
+        assertEquals("10Z", RunLengthEncoding.compress("ZZZZZZZZZZ"));
+    }
+
+    @Test
+    void testDecompressionSimple() {
+        // Test decompression of a typical RLE string
+        String input = "4A3B2C1D2A";
+        String expected = "AAAABBBCCDAA";
+        assertEquals(expected, RunLengthEncoding.decompress(input));
+    }
+
+    @Test
+    void testDecompressionWithNoRuns() {
+        // Every count is 1, so the output is just the characters in order
+        String input = "1A1B1C1D1E";
+        String expected = "ABCDE";
+        assertEquals(expected, RunLengthEncoding.decompress(input));
+    }
+
+    @Test
+    void testDecompressionWithMultiDigitCount() {
+        // Test decompression where a run count is greater than 9
+        String input = "12A1B3C";
+        String expected = "AAAAAAAAAAAABCCC";
+        assertEquals(expected, RunLengthEncoding.decompress(input));
+    }
+
+    @Test
+    void testDecompressionEdgeCases() {
+        // Test with an empty string
+        assertEquals("", RunLengthEncoding.decompress(""));
+
+        // Test with a single character run
+        assertEquals("A", RunLengthEncoding.decompress("1A"));
+    }
+
+    @Test
+    void testSymmetry() {
+        // Round trip: compressing and then decompressing returns the original string (inputs contain no digits)
+        String original1 = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB";
+        String compressed = RunLengthEncoding.compress(original1);
+        String decompressed = RunLengthEncoding.decompress(compressed);
+        assertEquals(original1, decompressed);
+
+        String original2 = "A"; // Shortest non-empty input
+        assertEquals(original2, RunLengthEncoding.decompress(RunLengthEncoding.compress(original2)));
+    }
+}
diff --git a/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java b/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java