forked from trizen/perl-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathburrows-wheeler_file_transform.pl
More file actions
130 lines (98 loc) · 2.97 KB
/
burrows-wheeler_file_transform.pl
File metadata and controls
130 lines (98 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/perl
# Author: Trizen
# Date: 14 June 2023
# Edit: 17 September 2023
# https://github.com/trizen
# Apply the Burrows–Wheeler transform on a file.
# https://rosettacode.org/wiki/Burrows–Wheeler_transform
# References:
# Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
# https://youtube.com/watch?v=rQ7wwh4HRZM
#
# Data Compression (Summer 2023) - Lecture 13 - BZip2
# https://youtube.com/watch?v=cvoZbBZ3M2A
use 5.036;
use Getopt::Std qw(getopts);
use File::Basename qw(basename);
use constant {
LOOKAHEAD_LEN => 128, # lower values are usually faster
};
sub bwt_sort ($s) { # O(n * LOOKAHEAD_LEN) space (fast)
my $len = length($s);
my $double_s = $s . $s; # Pre-compute doubled string
# Schwartzian transform with optimized sorting
return [
map { $_->[1] }
sort { ($a->[0] cmp $b->[0]) || (substr($double_s, $a->[1], $len) cmp substr($double_s, $b->[1], $len)) }
map {
my $pos = $_;
my $end = $pos + LOOKAHEAD_LEN;
# Handle wraparound efficiently
my $t =
($end <= $len)
? substr($s, $pos, LOOKAHEAD_LEN)
: substr($double_s, $pos, LOOKAHEAD_LEN);
[$t, $pos]
} 0 .. $len - 1
];
}
sub bwt_encode ($s) {
my $bwt = bwt_sort($s);
my $ret = '';
my $idx = 0;
my $i = 0;
foreach my $pos (@$bwt) {
$ret .= substr($s, $pos - 1, 1);
$idx = $i if !$pos;
++$i;
}
return ($ret, $idx);
}
sub bwt_decode ($bwt, $idx) { # fast inversion: O(n * log(n))
my @tail = split(//, $bwt);
my @head = sort @tail;
if ($idx > $#head) {
die "Invalid bwt-index: $idx (must be <= $#head)\n";
}
my %indices;
foreach my $i (0 .. $#tail) {
push @{$indices{$tail[$i]}}, $i;
}
my @table;
foreach my $v (@head) {
push @table, shift(@{$indices{$v}});
}
my $dec = '';
my $i = $idx;
for (1 .. scalar(@head)) {
$dec .= $head[$i];
$i = $table[$i];
}
return $dec;
}
getopts('dh', \my %opts);
if ($opts{h} or !@ARGV) {
die "usage: $0 [-d] [input file] [output file]\n";
}
my $input_file = $ARGV[0];
my $output_file = $ARGV[1] // (basename($input_file) . ($opts{d} ? '.dec' : '.bw'));
my $content = do {
open my $fh, '<:raw', $input_file
or die "Can't open file <<$input_file>> for reading: $!";
local $/;
<$fh>;
};
if ($opts{d}) { # decode mode
my $idx = unpack('N', substr($content, 0, 4, ''));
my $dec = bwt_decode($content, $idx);
open my $out_fh, '>:raw', $output_file
or die "Can't open file <<$output_file>> for writing: $!";
print $out_fh $dec;
}
else {
my ($bwt, $idx) = bwt_encode($content);
open my $out_fh, '>:raw', $output_file
or die "Can't open file <<$output_file>> for writing: $!";
print $out_fh pack('N', $idx);
print $out_fh $bwt;
}