// cedar -- C++ implementation of Efficiently-updatable Double ARray trie // $Id: bench.cc 1913 2016-01-19 12:55:27Z ynaga $ // Copyright (c) 2013-2015 Naoki Yoshinaga #ifdef __APPLE__ #include #endif #include #include #include #include #include #include // for ternary search tree #include #include #include // C++ containers #include // cedar variants #ifdef USE_PREFIX_TRIE #include #else #include #endif // Google's containers and variants #include #include // Cache-concious containers #include #ifdef USE_HAT #include #endif #ifdef USE_AHASH #include #endif #ifdef USE_HHASH #include #endif #include struct str_hash { size_t operator () (const char* key) const { return CityHash64 (key, std::strlen (key)); } size_t operator () (const char* key, size_t key_size) const { return CityHash64 (key, key_size); } }; struct str_equal { bool operator () (const char* p, const char* q) const { // return std::strcmp (p, q) == 0; // slow for (; *p || *q; ++q, ++p) if (*p != *q) return false; // reject immediately return true; } }; // typedef #if defined (USE_CEDAR_UNORDERED) typedef cedar::da cedar_t; #else typedef cedar::da cedar_t; #endif typedef std::unordered_map hash_t; typedef google::dense_hash_map gdhash_t; typedef spp::sparse_hash_map shash_t; typedef Pvoid_t judy_t; #ifdef USE_AHASH typedef tsl::array_map ahash_t; #endif #ifdef USE_HAT typedef tsl::htrie_map hat_t; #endif #ifdef USE_HHASH typedef tsl::hopscotch_map hhash_t; #endif size_t get_process_size () { #ifdef __APPLE__ struct task_basic_info t_info; mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; task_info (current_task (), TASK_BASIC_INFO, reinterpret_cast (&t_info), &t_info_count); return t_info.resident_size; #else FILE* fp = std::fopen("/proc/self/statm", "r"); size_t dummy (0), vm (0); std::fscanf (fp, "%ld %ld ", &dummy, &vm); // get resident (see procfs) std::fclose (fp); return vm * ::getpagesize (); #endif } size_t read_data (const char* file, char*& data) { int fd = ::open (file, O_RDONLY); if (fd < 0) { std::fprintf (stderr, "no such file: %s\n", file); std::exit (1); } size_t size = static_cast (::lseek (fd, 0L, SEEK_END)); data = new char[size]; ::lseek (fd, 0L, SEEK_SET); ::read (fd, data, size); ::close (fd); return size; } #ifdef USE_BINARY_DATA #define KEY_SEP '\0' inline char* find_sep (char* p) { while (*p != '\0') ++p; return p; } #else #define KEY_SEP '\n' inline char* find_sep (char* p) { while (*p != '\n') ++p; *p = '\0'; return p; } #endif template inline T* create () { return new T (); } template <> inline gdhash_t* create () { gdhash_t* p = new gdhash_t; p->set_empty_key (""); return p; } template inline void destroy (T* t) { for (typename T::const_iterator it = t->begin (); it != t->end (); ++it) delete [] it->first; delete t; } template <> inline void destroy (cedar_t* t) { delete t; } #ifdef USE_HAT template <> inline void destroy (hat_t* t) { delete t; } #endif #ifdef USE_AHASH template <> inline void destroy (ahash_t* t) { delete t; } #endif template <> inline void destroy (judy_t* t) { Word_t bytes = 0; JSLFA (bytes, *t); delete t; } // avoid to new temporary objects regarding std::string template // hash inline void insert_key (T* t, const char* key, size_t len, int n) { t->insert ({::strdup (key), n}); } template // hash inline bool lookup_key (const T& t, const char* key, size_t len) { return t.find (key) != t.end (); } // hat-trie #ifdef USE_HAT template <> inline void insert_key (hat_t* t, const char* key, const size_t len, int n) { t->insert_ks (key, len, n); } template <> inline bool lookup_key (const hat_t& t, const char* key, const size_t len) { return t.find_ks (key, len) != t.end (); } #endif // array-hash #ifdef USE_AHASH template <> inline void insert_key (ahash_t* t, const char* key, const size_t len, int n) { t->insert_ks (key, len, n); } template <> inline bool lookup_key (const ahash_t& t, const char* key, const size_t len) { return t.find_ks (key, len) != t.end (); } #endif // cedar template <> inline void insert_key (cedar_t* t, const char* key, const size_t len, int n) { t->update (key, len) = n; } template <> inline bool lookup_key (const cedar_t& t, const char* key, const size_t len) { return t.exactMatchSearch (key, len) >= 0; } // judy array template <> inline void insert_key (judy_t* t, const char* key, const size_t len, int n) { Word_t* PValue = 0; JSLI (PValue, *t, key); *PValue = n; } template <> inline bool lookup_key (const judy_t& t, const char* key, size_t len) { PWord_t PValue = 0; JSLG (PValue, t, key); return PValue; } template void insert (T* t, char* data, size_t size, int& n) { for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); insert_key (t, start, end - start, ++n); } } // lookup template void lookup (const T& t, char* data, size_t size, int& n_, int& n) { for (char* start (data), *end (data), *tail (data + size); end != tail; start = ++end) { end = find_sep (end); if (lookup_key (t, start, end - start)) ++n_; ++n; } } template void bench (const char* keys, const char* queries, const char* label) { std::fprintf (stderr, "---- %-25s --------------------------\n", label); // T* t = create (); struct timeval st, et; { // build trie char* data = 0; const size_t size = read_data (keys, data); size_t rss = get_process_size (); std::fprintf (stderr, "%-20s %.2f MiB (%ld bytes)\n", "Init RSS:", rss / 1048576.0, rss); int n = 0; ::gettimeofday (&st, NULL); insert (t, data, size, n); ::gettimeofday (&et, NULL); double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; std::fprintf (stderr, "%-20s %.2f sec (%.2f nsec per key)\n", "Time to insert:", elapsed, elapsed * 1e9 / n); std::fprintf (stderr, "%-20s %d\n", "Words:", n); delete [] data; } if (std::strcmp (queries, "-") != 0) { // load data char* data = 0; const size_t size = read_data (queries, data); // search int n (0), n_ (0); ::gettimeofday (&st, NULL); lookup (*t, data, size, n_, n); ::gettimeofday (&et, NULL); double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; std::fprintf (stderr, "%-20s %.2f sec (%.2f nsec per key)\n", "Time to search:", elapsed, elapsed * 1e9 / n); std::fprintf (stderr, "%-20s %d\n", "Words:", n); std::fprintf (stderr, "%-20s %d\n", "Found:", n_); delete [] data; } destroy (t); } int main (int argc, char** argv) { if (argc < 3) { std::fprintf (stderr, "Usage: %s keys queries\n", argv[0]); std::exit (1); } // #ifdef USE_CEDAR #if defined (USE_PREFIX_TRIE) bench (argv[1], argv[2], "cedar (prefix)"); #elif defined (USE_REDUCED_TRIE) bench (argv[1], argv[2], "cedar (reduced)"); #else bench (argv[1], argv[2], "cedar"); #endif #endif #ifdef USE_CEDAR_UNORDERED #if defined (USE_PREFIX_TRIE) bench (argv[1], argv[2], "cedar unordered (prefix)"); #elif defined (USE_REDUCED_TRIE) bench (argv[1], argv[2], "cedar unordered (reduced)"); #else bench (argv[1], argv[2], "cedar unordered"); #endif #endif #ifdef USE_HASH bench (argv[1], argv[2], "hash"); #endif #ifdef USE_GOOGLE_DHASH bench (argv[1], argv[2], "gdhash"); #endif #ifdef USE_SHASH bench (argv[1], argv[2], "shash"); #endif #ifdef USE_SPLAY bench (argv[1], argv[2], "splay"); #endif #ifdef USE_JUDY bench (argv[1], argv[2], "judy"); #endif #ifdef USE_HAT bench (argv[1], argv[2], "hat"); #endif #ifdef USE_AHASH bench (argv[1], argv[2], "ahash"); #endif #ifdef USE_HHASH bench (argv[1], argv[2], "hhash"); #endif }