Sparse HashのRubyバインディング

Sparse Hashに興味があったので、Ruby バインディングを作ってみた。

http://storehouse.sakura.ne.jp/viewvc/viewvc.cgi/ruby-sparsehash/?root=svn

#include "google/sparse_hash_map"
#include "google/dense_hash_map"

#include "sparsehash_internal.h"

namespace {

template <class T>
struct Sparsehash {
  typename T *m;

  static void rb_sparsehash_mark(Sparsehash<T> *p) {
    if (!p->m) {
      return;
    }

    T *m = p->m;

    for(T::iterator i = m->begin(); i != m->end(); i++) {
      rb_gc_mark(i->first);
      rb_gc_mark(i->second);
    }
  }

  static void rb_sparsehash_free(Sparsehash<T> *p) {
    if (p->m) {
      delete p->m;
    }

    delete p;
  }

  static VALUE rb_sparsehash_alloc(VALUE klass) {
    Sparsehash<T> *p;

    p = new Sparsehash<T>;
    p->m = NULL;
    
    return Data_Wrap_Struct(klass, &rb_sparsehash_mark, &rb_sparsehash_free, p);
  }

  static VALUE rb_sparsehash_get(VALUE self, VALUE key) {
    Sparsehash<T> *p;

    Data_Get_Struct(self, Sparsehash<T>, p);
    T &m = *(p->m);

    VALUE value = m[key];

    return(value ? value : Qnil);
  }

  static VALUE rb_sparsehash_set(VALUE self, VALUE key, VALUE value) {
    Sparsehash<T> *p;

    Data_Get_Struct(self, Sparsehash<T>, p);
    T &m = *(p->m);

    m[key] = value;

    return value;
  }

  static VALUE rb_sparsehash_each0(VALUE args) {
    return rb_yield(args);
  }

  static VALUE rb_sparsehash_each(VALUE self) {
    Sparsehash<T> *p;

    rb_need_block();

    Data_Get_Struct(self, Sparsehash<T>, p);
    T &m = *(p->m);
    int status = 0;

    for(T::iterator i = m.begin(); i != m.end(); i++) {
      VALUE args = rb_ary_new3(2, i->first, i->second);

      rb_protect(rb_sparsehash_each0, args, &status);

      if (status != 0) {
        break;
      }
    }

    if (status != 0) {
      rb_jump_tag(status);
    }

    return self;
  }


  static VALUE rb_sparsehash_erase(VALUE self, VALUE key) {
    Sparsehash<T> *p;

    Data_Get_Struct(self, Sparsehash<T>, p);
    T &m = *(p->m);

    VALUE value = m[key];
    m.erase(key);

    return(value ? value : Qnil);
  }

  static void init(VALUE &rb_mSparsehash, const char *name) {
    VALUE rb_cSparseHashMap = rb_define_class_under(rb_mSparsehash, name, rb_cObject);

    rb_define_alloc_func(rb_cSparseHashMap, &rb_sparsehash_alloc);
    rb_define_private_method(rb_cSparseHashMap, "initialize", __F(rb_sparsehash_initialize<T>), 0);
    rb_define_method(rb_cSparseHashMap, "[]", __F(&rb_sparsehash_get), 1);
    rb_define_method(rb_cSparseHashMap, "[]=", __F(&rb_sparsehash_set), 2);
    rb_define_method(rb_cSparseHashMap, "each", __F(&rb_sparsehash_each), 0);
    rb_define_method(rb_cSparseHashMap, "erase", __F(&rb_sparsehash_erase), 1);
    rb_define_method(rb_cSparseHashMap, "delete", __F(rb_sparsehash_erase), 1);
  }
};

struct rb_sparsehash_equal_key {
  bool operator() (const VALUE &x, const VALUE &y) const {
    return (rb_funcall(x, rb_intern("=="), 1, y) == Qtrue);
  }
};

struct rb_sparsehash_hash {
  size_t operator() (const VALUE &x) const {
    VALUE hash = rb_funcall(x, rb_intern("hash"), 0);
    return NUM2INT(hash);
  }
};

typedef google::sparse_hash_map<VALUE, VALUE, rb_sparsehash_hash, rb_sparsehash_equal_key> SHMap;
typedef google::dense_hash_map<VALUE, VALUE, rb_sparsehash_hash, rb_sparsehash_equal_key> DHMap;

template <class T>
VALUE rb_sparsehash_initialize(VALUE self) {
  Sparsehash<T> *p;

  Data_Get_Struct(self, Sparsehash<T>, p);
  p->m = new T;

  return Qnil;
}

template <>
VALUE rb_sparsehash_initialize<DHMap>(VALUE self) {
  Sparsehash<DHMap> *p;

  Data_Get_Struct(self, Sparsehash<DHMap>, p);
  p->m = new DHMap;
  p->m->set_empty_key(Qnil);

  return Qnil;
}

} // namespace

void Init_sparsehash() {
  VALUE rb_mSparsehash = rb_define_module("Sparsehash");
  Sparsehash<SHMap>::init(rb_mSparsehash, "SparseHashMap");
  Sparsehash<DHMap>::init(rb_mSparsehash, "DenseHashMap");
}

せっかくなのでテンプレートを活用。結構、コードを圧縮できるなぁ…でも、staticばっかの書き方はどうなんだろう？

で、以下のテストコードで計測。

require 'sparsehash'
require 'timeout'

include Sparsehash

def memoryusage()
  tasks = `tasklist`
  lines = tasks.split("\n")
  lines.each do |line|
    row = line.split(/\s+/)
    if row[1].to_i == $$
      return row[3].gsub(',', '').to_i / 1024.0
    end
  end
  return -1;
end

#def memoryusage()
#  status = `cat /proc/#{$$}/status`
#  lines = status.split("\n")
#  lines.each do |line|
#    if line =~ /^VmRSS:/
#      line.gsub!(/.*:\s*(\d+).*/, '\1')
#      return line.to_i / 1024.0
#    end
#  end
#  return -1;
#end

[100, 10000, 1000000].each do |rnum|
  [Hash, SparseHashMap, DenseHashMap].each do |clazz|
    puts("#{clazz}: #{rnum}")

    time = Time.now
    size = memoryusage
    hash = clazz.new

    begin
      timeout(120) do
        (0...rnum).each do |i|
          buf = sprintf("%08d", i)
          hash[buf] = buf
        end

        time = Time.now - time
        GC.start
        size = memoryusage - size

        printf("Time: %.3f sec.\n", time)
        printf("Usage: %.3f MB\n\n", size)
      end
    rescue Timeout::Error
      printf("-----\n\n")
    end
  end
end

結果はこんな感じ。

rnum =	100	10000	1000000
Hash	0.297 sec. / 0.145 MB	0.188 sec. / 2.543 MB	13.203 sec. / 245.555 MB
SparseHashMap	0.156 sec. / 0.039 MB	3.406 sec. / 0.027 MB	計測不能*1
DenseHashMap	0.141 sec. / 0.055 MB	0.656 sec. / 0.113 MB	69.047 sec. / 125.324 MB

使いどころは微妙かも。ハッシュ関数を変えたらもう少しパフォーマンスが改善するかなぁ。

*1:時間かかりすぎ…