// cedar -- C++ implementation of Efficiently-updatable Double ARray trie // $Id: bench_static.cc 1879 2015-01-29 12:03:37Z ynaga $ // Copyright (c) 2013-2015 Naoki Yoshinaga #ifdef __APPLE__ #include #endif #include #include #include #include #include #include #include #include #ifdef USE_PREFIX_TRIE #include #else #include #endif #if defined (USE_DARTS_CLONE) #include #elif defined (USE_DARTS_CLONE_OLD) #include #else #include #endif #include #include #include // typedef #if defined (USE_CEDAR_UNORDERED) typedef cedar::da cedar_t; #else typedef cedar::da cedar_t; #endif typedef Darts::DoubleArray darts_t; typedef tx_tool::tx tx_t; typedef ux::Trie ux_t; typedef marisa::Trie marisa_t; size_t get_process_size () { #ifdef __APPLE__ struct task_basic_info t_info; mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; task_info (current_task (), TASK_BASIC_INFO, reinterpret_cast (&t_info), &t_info_count); return t_info.resident_size; #else FILE* fp = std::fopen("/proc/self/statm", "r"); size_t dummy (0), vm (0); std::fscanf (fp, "%ld %ld ", &dummy, &vm); // get resident (see procfs) std::fclose (fp); return vm * ::getpagesize (); #endif } size_t read_data (const char* file, char*& data) { int fd = ::open (file, O_RDONLY); if (fd < 0) { std::fprintf (stderr, "no such file: %s\n", file); std::exit (1); } size_t size = static_cast (::lseek (fd, 0L, SEEK_END)); data = new char[size]; ::lseek (fd, 0L, SEEK_SET); ::read (fd, data, size); ::close (fd); return size; } #ifdef USE_BINARY_DATA #define KEY_SEP '\0' inline char* find_sep (char* p) { while (*p != '\0') ++p; return p; } #else #define KEY_SEP '\n' inline char* find_sep (char* p) { while (*p != '\n') ++p; *p = '\0'; return p; } #endif template inline T* create () { return new T (); } template inline void destroy (T* t) { delete t; } // darts/darts-clone template void build (T* t, char* data, size_t size, int& n, const char* index) { std::vector key; std::vector len; std::vector val; for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); key.push_back (::strdup (start)); len.push_back (end - start); val.push_back (++n); } t->build (key.size (), &key[0], &len[0], &val[0]); t->save (index); for (std::vector ::iterator it = key.begin (); it != key.end (); ++it) ::free (const_cast (*it)); } // cedar template <> void build (cedar_t* t, char* data, size_t size, int& n, const char* index) { for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); t->update (start, end - start) = ++n; } t->save (index); } // tx template <> void build (tx_t* t, char* data, size_t size, int& n, const char* index) { std::vector str; for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); str.push_back (start); ++n; } t->build (str, index); } // ux template <> void build (ux_t* t, char* data, size_t size, int& n, const char* index) { std::vector str; for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); str.push_back (start); ++n; } t->build (str); t->save (index); } // marisa template <> void build (marisa_t* t, char* data, size_t size, int& n, const char* index) { marisa::Keyset keyset; for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); keyset.push_back (start); ++n; } t->build (keyset); t->save (index); } // cedar/darts/darts-clone template inline bool lookup_key (const T& t, const char* key, size_t len) { return t.template exactMatchSearch (key, len) >= 0; } // tx template <> inline bool lookup_key (const tx_t& t, const char* key, size_t len) { size_t n = 0; return t.prefixSearch (key, len, n) != tx_tool::tx::NOTFOUND && n == len; } // ux template <> inline bool lookup_key (const ux_t& t, const char* key, size_t len) { size_t n = 0; return t.prefixSearch (key, len, n) != ux::NOTFOUND && n == len; } // marisa template <> inline bool lookup_key (const marisa_t& t, const char* key, size_t len) { static marisa::Agent agent; agent.set_query (key, len); return t.lookup (agent); } // lookup template void lookup (const T& t, char* data, size_t size, int& n_, int& n) { for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); if (lookup_key (t, start, end - start)) ++n_; ++n; } } // cedar/darts/darts-clone template void read_trie (T* t, const char* index) { t->open (index); } // tx template <> void read_trie (tx_t* t, const char* index) { t->read (index); } // ux template <> void read_trie (ux_t* t, const char* index) { t->load (index); } // marisa-trie template <> void read_trie (marisa_t* t, const char* index) { t->load (index); } size_t get_size (const char* index) { int fd = ::open (index, O_RDONLY); if (fd < 0) { std::fprintf (stderr, "no such file: %s\n", index); std::exit (1); } size_t size = static_cast (::lseek (fd, 0L, SEEK_END)); ::close (fd); return size; } template void bench (const char* keys, const char* index, const char* queries, const char* label) { std::fprintf (stderr, "---- %-25s --------------------------\n", label); // T* t = create (); struct timeval st, et; if (std::strcmp (keys, "-") != 0) { // build trie char* data = 0; const size_t size = read_data (keys, data); size_t rss = get_process_size (); std::fprintf (stderr, "%-20s %.2f MiB (%ld bytes)\n", "Init RSS:", rss / 1048576.0, rss); int n = 0; ::gettimeofday (&st, NULL); build (t, data, size, n, index); ::gettimeofday (&et, NULL); double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; std::fprintf (stderr, "%-20s %.2f sec (%.2f nsec per key)\n", "Time to insert:", elapsed, elapsed * 1e9 / n); std::fprintf (stderr, "%-20s %d\n", "Words:", n); // trie size rss = get_size (index); std::fprintf (stderr, "%-20s %.2f MiB (%ld bytes)\n", "Trie size:", rss / 1048576.0, rss); delete [] data; } else if (std::strcmp (queries, "-") != 0) { // load data char* data = 0; const size_t size = read_data (queries, data); // search read_trie (t, index); int n (0), n_ (0); ::gettimeofday (&st, NULL); lookup (*t, data, size, n_, n); ::gettimeofday (&et, NULL); double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; std::fprintf (stderr, "%-20s %.2f sec (%.2f nsec per key)\n", "Time to search:", elapsed, elapsed * 1e9 / n); std::fprintf (stderr, "%-20s %d\n", "Words:", n); std::fprintf (stderr, "%-20s %d\n", "Found:", n_); delete [] data; } destroy (t); } int main (int argc, char** argv) { if (argc < 4) { std::fprintf (stderr, "Usage: %s keys index queries\n", argv[0]); std::exit (1); } // #ifdef USE_CEDAR #if defined (USE_PREFIX_TRIE) bench (argv[1], argv[2], argv[3], "cedar (prefix)"); #elif defined (USE_REDUCED_TRIE) bench (argv[1], argv[2], argv[3], "cedar (reduced)"); #else bench (argv[1], argv[2], argv[3], "cedar"); #endif #endif #ifdef USE_CEDAR_UNORDERED #if defined (USE_PREFIX_TRIE) bench (argv[1], argv[2], argv[3], "cedar unordered (prefix)"); #elif defined (USE_REDUCED_TRIE) bench (argv[1], argv[2], argv[3], "cedar unordered (reduced)"); #else bench (argv[1], argv[2], argv[3], "cedar unordered"); #endif #endif #ifdef USE_DARTS bench (argv[1], argv[2], argv[3], "darts"); #endif #ifdef USE_DARTS_CLONE bench (argv[1], argv[2], argv[3], "darts-clone"); #endif #ifdef USE_DARTS_CLONE_OLD bench (argv[1], argv[2], argv[3], "darts-clone_old"); #endif #ifdef USE_TX bench (argv[1], argv[2], argv[3], "tx"); #endif #ifdef USE_UX bench (argv[1], argv[2], argv[3], "ux"); #endif #ifdef USE_MARISA bench (argv[1], argv[2], argv[3], "marisa-trie"); #endif }