LZW

From CodeCodex

The Lempel-Ziv-Welch algorithm provides lossless data compression. (read on Wikipedia) It was patented, but it fell in the public domain in 2004.

Implementations[edit]

OCaml[edit]

OCaml, the Imperative Way[edit]

(** Lempel-Ziv-Welch compression algorithm *)

let push li v = li := v :: !li
 
(** compress a string to a list of output symbols *)
let compress ~uncompressed =
  (* build the dictionary *)
  let dict_size = ref 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to 255 do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary str i
  done;
 
  let result = ref [] in

  let w = ref "" in
  String.iter uncompressed (fun c ->
    let c = String.make 1 c in
    let wc = !w ^ c in
    if Hashtbl.mem dictionary wc then
      w := wc
    else
      begin
        push result (Hashtbl.find dictionary !w);
        (* add wc to the dictionary *)
        Hashtbl.add dictionary wc !dict_size;
        incr dict_size;
        w := c
      end
  );

  (* output the code for w *)
  if !w <> "" then
    push result (Hashtbl.find dictionary !w);

  List.rev !result
;;

exception ValueError of string

(** decompress a list of output symbols to a string *)
let decompress ~compressed =
 
  (* build the dictionary *)
  let dict_size = ref 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to pred !dict_size do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary i str
  done;

  let w, compressed =
    match compressed with
    | hd::tl -> ref(String.make 1 (char_of_int hd)), tl
    | [] -> failwith "empty"
  in
 
  let result = ref [!w] in

  List.iter (fun k ->
    let entry =
      if Hashtbl.mem dictionary k then
        Hashtbl.find dictionary k
      else if k = Hashtbl.length dictionary then
        !w ^ String.sub !w 0 1
      else
        raise (ValueError (Printf.sprintf "Bad compressed k: %d" k))
    in
    result := entry :: !result;
 
    (* add (w ^ entry.[0]) to the dictionary *)
    Hashtbl.add dictionary !dict_size (!w ^ String.sub entry 0 1);
    incr dict_size;
 
    w := entry;
  ) compressed;

  List.rev !result
;;


OCaml, the Functional Way[edit]

(** Lempel-Ziv-Welch compression algorithm *)

#directory "+site-lib/extlib/"
#load "extLib.cma"
open ExtString

(** compress a string to a list of output symbols *)
let compress ~uncompressed =
  (* build the dictionary *)
  let dict_size = 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to 255 do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary str i
  done;
 
  let f = (fun (w, dict_size, result) c ->
    let c = String.make 1 c in
    let wc = w ^ c in
    if Hashtbl.mem dictionary wc then
      (wc, dict_size, result)
    else
      begin
        (* add wc to the dictionary *)
        Hashtbl.add dictionary wc dict_size;
        let this = Hashtbl.find dictionary w in
        (c, dict_size + 1, this::result)
      end
  ) in
  let w, _, result =
    String.fold_left f ("", dict_size, []) uncompressed
  in

  (* output the code for w *)
  let result =
    if w = ""
    then result
    else (Hashtbl.find dictionary w) :: result
  in

  (List.rev result)
;;

exception ValueError of string

(** decompress a list of output symbols to a string *)
let decompress ~compressed =
  (* build the dictionary *)
  let dict_size = 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to pred dict_size do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary i str
  done;

  let w, compressed =
    match compressed with
    | hd::tl -> (String.make 1 (char_of_int hd)), tl
    | [] -> failwith "empty input"
  in
 
  let result = w::[] in

  let result, _, _ =
    List.fold_left (fun (result, w, dict_size) k ->
      let entry =
        if Hashtbl.mem dictionary k then
          Hashtbl.find dictionary k
        else if k = Hashtbl.length dictionary then
          w ^ (String.make 1 w.[0])
        else
          raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
      in
      let result = entry :: result in
   
      (* add (w ^ entry.[0]) to the dictionary *)
      Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
      (result, entry, dict_size + 1)
    ) (result, w, dict_size) compressed
  in
  (List.rev result)
;;

OCaml, How to use[edit]

Both implementations provide the same interface:

val compress : uncompressed:string -> int list = <fun>
val decompress : compressed:int list -> string list = <fun>

The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

let greatest = List.fold_left max 0 ;;
(** number of bits needed to encode the integer m *)
let n_bits m =
  let rec aux m n =
    if m = 0 then n
    else aux (m lsr 1) (n + 1)
  in
  aux m 0
;;

Then you can then write the datas to a file:

#directory "+site-lib/extlib/"
#load "extLib.cma"

let write_compressed ~filename ~compressed ~nbits =
  let oc = open_out filename in
  output_byte oc nbits;
  let ob = IO.output_bits(IO.output_channel oc) in
  List.iter (IO.write_bits ob nbits) compressed;
  IO.flush_bits ob;
  close_out oc;
;;

Python[edit]

# Lempel-Ziv-Welch compression algorithm
 
def compress(uncompressed):
  """ compress a string to a list of output symbols """
  # build the dictionary
  dict_size = 256
  dictionary = {}
  for i in xrange(dict_size):
    dictionary[chr(i)] = i
 
  result = []

  w = ""
  for c in uncompressed:
    wc = w + c
    if wc in dictionary:
      w = wc
    else:
      result.append(dictionary[w])
      # add wc to the dictionary
      dictionary[wc] = dict_size
      dict_size += 1
      w = c

  # output the code for w
  if w:
    result.append(dictionary[w])

  return result

def decompress(compressed):
  """ decompress a list of output symbols to a string """

  # build the dictionary
  dict_size = 256
  dictionary = {}
  for i in xrange(dict_size):
    dictionary[i] = chr(i)

  if compressed:
    w = chr(compressed.pop(0))
  else:
    raise ValueError, "empty"

  result = [w]

  for k in compressed:
    if k in dictionary:
      entry = dictionary[k]
    elif k = len(dictionary):
      entry = w + w[0]
    else:
      raise ValueError, "Bad compressed k: %d" % k

    result.append(entry)
 
    # add (w + entry.[0]) to the dictionary
    dictionary[dict_size] = w + entry[0]
    dict_size += 1
 
    w = entry

  return result