Skip to content

Commit 0190f6e

Browse files
committed
feat: Add RLE and Shannon-Fano compression algorithms
1 parent 8726d40 commit 0190f6e

File tree

4 files changed

+397
-0
lines changed

4 files changed

+397
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
package com.thealgorithms.compression;
2+
3+
/**
 * An implementation of the Run-Length Encoding (RLE) algorithm for strings.
 *
 * <p>Run-Length Encoding is a simple form of lossless data compression in which
 * runs of data (sequences in which the same value occurs in consecutive
 * positions) are stored as a count followed by a single copy of the value,
 * rather than as the original run. For example:
 * <ul>
 *   <li>Compressing {@code "AAAABBBCCDAA"} results in {@code "4A3B2C1D2A"}.</li>
 *   <li>Decompressing {@code "4A3B2C1D2A"} results in {@code "AAAABBBCCDAA"}.</li>
 * </ul>
 *
 * <p>Limitation of the format: in encoded text the ASCII digits {@code '0'}-{@code '9'}
 * are always read as part of a run count. Text that itself contains ASCII digits is
 * therefore still encoded by {@link #compress(String)}, but {@link #decompress(String)}
 * cannot restore it. Every other character, including non-ASCII Unicode digits,
 * round-trips.
 *
 * <p>Time Complexity: compression is O(n) in the length of the input; decompression is
 * linear in the length of the encoded input plus the length of the produced output.
 *
 * <p>References:
 * <ul>
 *   <li><a href="https://en.wikipedia.org/wiki/Run-length_encoding">Wikipedia: Run-length encoding</a></li>
 * </ul>
 */
public final class RunLengthEncoding {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private RunLengthEncoding() {
    }

    /**
     * Compresses a string using the Run-Length Encoding algorithm.
     *
     * <p>Each maximal run of equal characters is written as its decimal length followed
     * by the character itself.
     *
     * @param text The string to be compressed. {@code null} is treated like an empty string.
     * @return The compressed string, or an empty string if the input is {@code null} or empty.
     */
    public static String compress(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }

        StringBuilder compressed = new StringBuilder();
        int runStart = 0;

        // i is one past the current position so that the final run is flushed when i == length.
        for (int i = 1; i <= text.length(); i++) {
            if (i == text.length() || text.charAt(i) != text.charAt(runStart)) {
                compressed.append(i - runStart).append(text.charAt(runStart));
                runStart = i;
            }
        }
        return compressed.toString();
    }

    /**
     * Decompresses a string that was compressed using the Run-Length Encoding algorithm.
     *
     * <p>Consecutive ASCII digits are accumulated into a decimal run count; the first
     * non-digit character that follows is appended that many times. Parsing is lenient:
     * a character that is not preceded by a count is emitted zero times, and digits at
     * the end of the input that are not followed by a character are ignored. The count
     * is accumulated in an {@code int} without overflow checking; a negative result is
     * clamped to zero.
     *
     * @param compressedText The compressed string. {@code null} is treated like an empty string.
     * @return The uncompressed string, or an empty string if the input is {@code null} or empty.
     */
    public static String decompress(String compressedText) {
        if (compressedText == null || compressedText.isEmpty()) {
            return "";
        }

        StringBuilder decompressed = new StringBuilder();
        int count = 0;

        for (int i = 0; i < compressedText.length(); i++) {
            char ch = compressedText.charAt(i);
            if (isAsciiDigit(ch)) {
                // Build the number for runs of 10 or more (e.g., "12A").
                count = count * 10 + (ch - '0');
            } else {
                // Math.max guards String.repeat against a count that overflowed to a negative value.
                decompressed.append(String.valueOf(ch).repeat(Math.max(0, count)));
                count = 0; // Reset count for the next sequence
            }
        }
        return decompressed.toString();
    }

    /**
     * Tells whether the character is one of the ASCII digits '0' to '9'.
     *
     * <p>{@link Character#isDigit(char)} is deliberately not used here: it also accepts
     * non-ASCII Unicode digits, for which {@code ch - '0'} is not the digit's value.
     */
    private static boolean isAsciiDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }
}
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.HashMap;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.stream.Collectors;
9+
10+
/**
 * Shannon-Fano prefix-code generation.
 *
 * <p>Shannon-Fano coding is an entropy encoding technique for lossless data
 * compression that gives each symbol a variable-length bit string derived from
 * how often the symbol occurs. It predates Huffman coding and builds its codes
 * top-down, by repeatedly cutting a frequency-sorted symbol list into two halves
 * whose total frequencies are as close as possible.
 *
 * <p>Steps performed by this implementation:
 * <ol>
 *   <li>Count how often each character occurs in the input.</li>
 *   <li>Sort the characters by descending frequency.</li>
 *   <li>Cut the current range where the two sides' frequency sums differ least.</li>
 *   <li>Append '0' to the codes of the first side and '1' to those of the second.</li>
 *   <li>Recurse into each side until it holds a single character.</li>
 * </ol>
 *
 * <p>Time Complexity: O(n^2) in this implementation because every recursion level
 * rescans its range to find the cut, where n is the number of unique symbols.
 * The initial sort is O(n log n).
 *
 * <p>References:
 * <ul>
 *   <li><a href="https://en.wikipedia.org/wiki/Shannon%E2%80%93Fano_coding">Wikipedia: Shannon&ndash;Fano coding</a></li>
 * </ul>
 */
public final class ShannonFano {

    /**
     * Not instantiable: the class only exposes static helpers.
     */
    private ShannonFano() {
    }

    /**
     * Pairs a character with its occurrence count and, once assigned, its code.
     * The natural ordering puts higher frequencies first.
     */
    private static class Symbol implements Comparable<Symbol> {
        final char character;
        final int frequency;
        String code = "";

        Symbol(char character, int frequency) {
            this.character = character;
            this.frequency = frequency;
        }

        @Override
        public int compareTo(Symbol other) {
            // Arguments are swapped on purpose to obtain descending order.
            return Integer.compare(other.frequency, frequency);
        }
    }

    /**
     * Computes a Shannon-Fano code for every distinct character of the text.
     *
     * @param text The input string for which to generate codes. A {@code null} or empty
     *             string yields an empty map.
     * @return A map from each distinct character to its code made of '0' and '1' characters.
     */
    public static Map<Character, String> generateCodes(String text) {
        if (text == null || text.isEmpty()) {
            return Collections.emptyMap();
        }

        Map<Character, Integer> occurrences = new HashMap<>();
        for (int pos = 0; pos < text.length(); pos++) {
            char current = text.charAt(pos);
            occurrences.put(current, occurrences.getOrDefault(current, 0) + 1);
        }

        List<Symbol> ranked = new ArrayList<>();
        occurrences.forEach((character, frequency) -> ranked.add(new Symbol(character, frequency)));
        Collections.sort(ranked);

        if (ranked.size() == 1) {
            // A lone symbol still needs a non-empty code.
            ranked.get(0).code = "0";
        } else {
            buildCodeTree(ranked, 0, ranked.size() - 1, "");
        }

        return ranked.stream().collect(Collectors.toMap(symbol -> symbol.character, symbol -> symbol.code));
    }

    /**
     * Assigns codes to the symbols in {@code [start, end]} by recursive bisection.
     * Works on indices into the one shared list instead of creating sub-lists.
     *
     * @param symbols The frequency-sorted list of symbols.
     * @param start The first index of the range being processed.
     * @param end The last index of the range being processed (inclusive).
     * @param prefix The bits already fixed for every symbol in this range.
     */
    private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
        if (start > end) {
            return;
        }

        if (start < end) {
            int cut = findSplitIndex(symbols, start, end);
            buildCodeTree(symbols, start, cut, prefix + "0");
            buildCodeTree(symbols, cut + 1, end, prefix + "1");
            return;
        }

        // Exactly one symbol left: the accumulated prefix is its code.
        symbols.get(start).code = prefix.isEmpty() ? "0" : prefix;
    }

    /**
     * Locates the cut that makes the frequency sums of the two sides most alike.
     * Every candidate cut is examined; on equal differences the earliest one is kept.
     *
     * @param symbols The frequency-sorted list of symbols.
     * @param start The first index of the range.
     * @param end The last index of the range (inclusive).
     * @return The index of the last element belonging to the first side.
     */
    private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
        // Start with everything on the right-hand side.
        long rightWeight = 0;
        int cursor = start;
        while (cursor <= end) {
            rightWeight += symbols.get(cursor).frequency;
            cursor++;
        }

        long leftWeight = 0;
        long smallestGap = Long.MAX_VALUE;
        int bestCut = start;

        // Move one symbol at a time from the right side to the left side.
        for (int candidate = start; candidate < end; candidate++) {
            int weight = symbols.get(candidate).frequency;
            leftWeight += weight;
            rightWeight -= weight;

            long gap = Math.abs(leftWeight - rightWeight);
            if (gap < smallestGap) {
                smallestGap = gap;
                bestCut = candidate;
            }
        }
        return bestCut;
    }
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package com.thealgorithms.compression;
2+
3+
import static org.junit.jupiter.api.Assertions.assertEquals;
4+
5+
import org.junit.jupiter.api.Test;
6+
7+
class RunLengthEncodingTest {
8+
9+
@Test
10+
void testCompressionSimple() {
11+
// Test a typical string with multiple runs
12+
String input = "AAAABBBCCDAA";
13+
String expected = "4A3B2C1D2A";
14+
assertEquals(expected, RunLengthEncoding.compress(input));
15+
}
16+
17+
@Test
18+
void testCompressionWithNoRuns() {
19+
// Test a string with no consecutive characters
20+
String input = "ABCDE";
21+
String expected = "1A1B1C1D1E";
22+
assertEquals(expected, RunLengthEncoding.compress(input));
23+
}
24+
25+
@Test
26+
void testCompressionEdgeCases() {
27+
// Test with an empty string
28+
assertEquals("", RunLengthEncoding.compress(""));
29+
30+
// Test with a single character
31+
assertEquals("1A", RunLengthEncoding.compress("A"));
32+
33+
// Test with a long run of a single character
34+
assertEquals("10Z", RunLengthEncoding.compress("ZZZZZZZZZZ"));
35+
}
36+
37+
@Test
38+
void testDecompressionSimple() {
39+
// Test decompression of a typical RLE string
40+
String input = "4A3B2C1D2A";
41+
String expected = "AAAABBBCCDAA";
42+
assertEquals(expected, RunLengthEncoding.decompress(input));
43+
}
44+
45+
@Test
46+
void testDecompressionWithNoRuns() {
47+
// Test decompression of a string with single characters
48+
String input = "1A1B1C1D1E";
49+
String expected = "ABCDE";
50+
assertEquals(expected, RunLengthEncoding.decompress(input));
51+
}
52+
53+
@Test
54+
void testDecompressionWithMultiDigitCount() {
55+
// Test decompression where a run count is greater than 9
56+
String input = "12A1B3C";
57+
String expected = "AAAAAAAAAAAABCCC";
58+
assertEquals(expected, RunLengthEncoding.decompress(input));
59+
}
60+
61+
@Test
62+
void testDecompressionEdgeCases() {
63+
// Test with an empty string
64+
assertEquals("", RunLengthEncoding.decompress(""));
65+
66+
// Test with a single character run
67+
assertEquals("A", RunLengthEncoding.decompress("1A"));
68+
}
69+
70+
@Test
71+
void testSymmetry() {
72+
// Test that compressing and then decompressing returns the original string
73+
String original1 = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB";
74+
String compressed = RunLengthEncoding.compress(original1);
75+
String decompressed = RunLengthEncoding.decompress(compressed);
76+
assertEquals(original1, decompressed);
77+
78+
String original2 = "A";
79+
assertEquals(original2, RunLengthEncoding.decompress(RunLengthEncoding.compress(original2)));
80+
}
81+
}

0 commit comments

Comments
 (0)