LZW
From CodeCodex
The Lempel-Ziv-Welch algorithm provides lossless data compression. (read on Wikipedia) It was patented, but it fell in the public domain in 2004.
Contents
Implementations[edit]
OCaml[edit]
OCaml, the Imperative Way[edit]
(** Lempel-Ziv-Welch compression algorithm *) let push li v = li := v :: !li (** compress a string to a list of output symbols *) let compress ~uncompressed = (* build the dictionary *) let dict_size = ref 256 in let dictionary = Hashtbl.create 397 in for i=0 to 255 do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary str i done; let result = ref [] in let w = ref "" in String.iter uncompressed (fun c -> let c = String.make 1 c in let wc = !w ^ c in if Hashtbl.mem dictionary wc then w := wc else begin push result (Hashtbl.find dictionary !w); (* add wc to the dictionary *) Hashtbl.add dictionary wc !dict_size; incr dict_size; w := c end ); (* output the code for w *) if !w <> "" then push result (Hashtbl.find dictionary !w); List.rev !result ;; exception ValueError of string (** decompress a list of output symbols to a string *) let decompress ~compressed = (* build the dictionary *) let dict_size = ref 256 in let dictionary = Hashtbl.create 397 in for i=0 to pred !dict_size do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary i str done; let w, compressed = match compressed with | hd::tl -> ref(String.make 1 (char_of_int hd)), tl | [] -> failwith "empty" in let result = ref [!w] in List.iter (fun k -> let entry = if Hashtbl.mem dictionary k then Hashtbl.find dictionary k else if k = Hashtbl.length dictionary then !w ^ String.sub !w 0 1 else raise (ValueError (Printf.sprintf "Bad compressed k: %d" k)) in result := entry :: !result; (* add (w ^ entry.[0]) to the dictionary *) Hashtbl.add dictionary !dict_size (!w ^ String.sub entry 0 1); incr dict_size; w := entry; ) compressed; List.rev !result ;;
OCaml, the Functional Way[edit]
(** Lempel-Ziv-Welch compression algorithm *) #directory "+site-lib/extlib/" #load "extLib.cma" open ExtString (** compress a string to a list of output symbols *) let compress ~uncompressed = (* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to 255 do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary str i done; let f = (fun (w, dict_size, result) c -> let c = String.make 1 c in let wc = w ^ c in if Hashtbl.mem dictionary wc then (wc, dict_size, result) else begin (* add wc to the dictionary *) Hashtbl.add dictionary wc dict_size; let this = Hashtbl.find dictionary w in (c, dict_size + 1, this::result) end ) in let w, _, result = String.fold_left f ("", dict_size, []) uncompressed in (* output the code for w *) let result = if w = "" then result else (Hashtbl.find dictionary w) :: result in (List.rev result) ;; exception ValueError of string (** decompress a list of output symbols to a string *) let decompress ~compressed = (* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to pred dict_size do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary i str done; let w, compressed = match compressed with | hd::tl -> (String.make 1 (char_of_int hd)), tl | [] -> failwith "empty input" in let result = w::[] in let result, _, _ = List.fold_left (fun (result, w, dict_size) k -> let entry = if Hashtbl.mem dictionary k then Hashtbl.find dictionary k else if k = Hashtbl.length dictionary then w ^ (String.make 1 w.[0]) else raise(ValueError(Printf.sprintf "Bad compressed k: %d" k)) in let result = entry :: result in (* add (w ^ entry.[0]) to the dictionary *) Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0])); (result, entry, dict_size + 1) ) (result, w, dict_size) compressed in (List.rev result) ;;
OCaml, How to use[edit]
Both implementations provide the same interface:
val compress : uncompressed:string -> int list = <fun> val decompress : compressed:int list -> string list = <fun>
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.
let greatest = List.fold_left max 0 ;;
(** number of bits needed to encode the integer m *) let n_bits m = let rec aux m n = if m = 0 then n else aux (m lsr 1) (n + 1) in aux m 0 ;;
Then you can then write the datas to a file:
#directory "+site-lib/extlib/" #load "extLib.cma" let write_compressed ~filename ~compressed ~nbits = let oc = open_out filename in output_byte oc nbits; let ob = IO.output_bits(IO.output_channel oc) in List.iter (IO.write_bits ob nbits) compressed; IO.flush_bits ob; close_out oc; ;;
Python[edit]
# Lempel-Ziv-Welch compression algorithm def compress(uncompressed): """ compress a string to a list of output symbols """ # build the dictionary dict_size = 256 dictionary = {} for i in xrange(dict_size): dictionary[chr(i)] = i result = [] w = "" for c in uncompressed: wc = w + c if wc in dictionary: w = wc else: result.append(dictionary[w]) # add wc to the dictionary dictionary[wc] = dict_size dict_size += 1 w = c # output the code for w if w: result.append(dictionary[w]) return result def decompress(compressed): """ decompress a list of output symbols to a string """ # build the dictionary dict_size = 256 dictionary = {} for i in xrange(dict_size): dictionary[i] = chr(i) if compressed: w = chr(compressed.pop(0)) else: raise ValueError, "empty" result = [w] for k in compressed: if k in dictionary: entry = dictionary[k] elif k = len(dictionary): entry = w + w[0] else: raise ValueError, "Bad compressed k: %d" % k result.append(entry) # add (w + entry.[0]) to the dictionary dictionary[dict_size] = w + entry[0] dict_size += 1 w = entry return result