# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>
+Johan Bilien <jobi@litl.com>
--- /dev/null
+# Contributing
+
+We'd love to accept your code patches! However, before we can take them, we
+have to jump a couple of legal hurdles.
+
+## Contributor License Agreements
+
+Please fill out either the individual or corporate Contributor License
+Agreement as appropriate.
+
+* If you are an individual writing original source code and you're sure you
+own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
+* If you work for a company that wants to allow you to contribute your work,
+then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).
+
+Follow either of the two links above to access the appropriate CLA and
+instructions for how to sign and return it.
+
+## Submitting a Patch
+
+1. Sign the contributors license agreement above.
+2. Decide which code you want to submit. A submission should be a set of changes
+that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
+Please don't mix more than one logical change per submission, because it makes
+the history hard to follow. If you want to make a change
+(e.g. add a sample or feature) that doesn't have a corresponding issue in the
+issue tracker, please create one.
+3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
+sure to include the issue number you fixed and the name you used to sign
+the CLA.
+
+## Writing Code ##
+
+If your contribution contains code, please make sure that it follows
+[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
+Otherwise we will have to ask you to make changes, and that's no fun for anyone.
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.
-OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode)
-# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
-# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
+# (A) Production use (optimized mode)
+OPT ?= -O2 -DNDEBUG
+# (B) Debug mode, w/ full line-level debugging symbols
+# OPT ?= -g2
+# (C) Profiling mode: opt, but w/debugging symbols
+# OPT ?= -O2 -g2 -DNDEBUG
#-----------------------------------------------
# detect what platform we're building on
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+# Note: iOS should probably be using libtool, not ar.
+ifeq ($(PLATFORM), IOS)
+AR=xcrun ar
+endif
+
TESTS = \
arena_test \
+ autocompact_test \
bloom_test \
c_test \
cache_test \
env_test \
filename_test \
filter_block_test \
+ hash_test \
issue178_test \
+ issue200_test \
log_test \
memenv_test \
skiplist_test \
else
# Update db.h if you change these.
SHARED_MAJOR = 1
-SHARED_MINOR = 12
+SHARED_MINOR = 18
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CXX) $(LDFLAGS) util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CXX) $(LDFLAGS) issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+IOSARCH=-arch armv6 -arch armv7 -arch armv7s -arch arm64
.cc.o:
mkdir -p ios-x86/$(dir $@)
- $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+ xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
- $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
- lipo ios-x86/$@ ios-arm/$@ -create -output $@
+ xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk $(IOSARCH) -c $< -o ios-arm/$@
+ xcrun lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o:
mkdir -p ios-x86/$(dir $@)
- $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+ xcrun -sdk iphonesimulator $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
- $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
- lipo ios-x86/$@ ios-arm/$@ -create -output $@
+ xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk $(IOSARCH) -c $< -o ios-arm/$@
+ xcrun lipo ios-x86/$@ ios-arm/$@ -create -output $@
else
.cc.o:
--- /dev/null
+**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**
+
+Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+# Features
+ * Keys and values are arbitrary byte arrays.
+ * Data is stored sorted by key.
+ * Callers can provide a custom comparison function to override the sort order.
+ * The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
+ * Multiple changes can be made in one atomic batch.
+ * Users can create a transient snapshot to get a consistent view of data.
+ * Forward and backward iteration is supported over the data.
+ * Data is automatically compressed using the [Snappy compression library](http://code.google.com/p/snappy).
+ * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
+ * [Detailed documentation](http://htmlpreview.github.io/?https://github.com/google/leveldb/blob/master/doc/index.html) about how to use the library is included with the source code.
+
+
+# Limitations
+ * This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
+ * Only a single process (possibly multi-threaded) can access a particular database at a time.
+ * There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library.
+
+# Performance
+
+Here is a performance report (with explanations) from the run of the
+included db_bench program. The results are somewhat noisy, but should
+be enough to get a ballpark performance estimate.
+
+## Setup
+
+We use a database with a million entries. Each entry has a 16 byte
+key, and a 100 byte value. Values used by the benchmark compress to
+about half their original size.
+
+ LevelDB: version 1.1
+ Date: Sun May 1 12:11:26 2011
+ CPU: 4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
+ CPUCache: 4096 KB
+ Keys: 16 bytes each
+ Values: 100 bytes each (50 bytes after compression)
+ Entries: 1000000
+ Raw Size: 110.6 MB (estimated)
+ File Size: 62.9 MB (estimated)
+
+## Write performance
+
+The "fill" benchmarks create a brand new database, in either
+sequential, or random order. The "fillsync" benchmark flushes data
+from the operating system to the disk after every operation; the other
+write operations leave the data sitting in the operating system buffer
+cache for a while. The "overwrite" benchmark does random writes that
+update existing keys in the database.
+
+ fillseq : 1.765 micros/op; 62.7 MB/s
+ fillsync : 268.409 micros/op; 0.4 MB/s (10000 ops)
+ fillrandom : 2.460 micros/op; 45.0 MB/s
+ overwrite : 2.380 micros/op; 46.5 MB/s
+
+Each "op" above corresponds to a write of a single key/value pair.
+I.e., a random write benchmark goes at approximately 400,000 writes per second.
+
+Each "fillsync" operation costs much less (0.3 millisecond)
+than a disk seek (typically 10 milliseconds). We suspect that this is
+because the hard disk itself is buffering the update in its memory and
+responding before the data has been written to the platter. This may
+or may not be safe based on whether or not the hard disk has enough
+power to save its memory in the event of a power failure.
+
+## Read performance
+
+We list the performance of reading sequentially in both the forward
+and reverse direction, and also the performance of a random lookup.
+Note that the database created by the benchmark is quite small.
+Therefore the report characterizes the performance of leveldb when the
+working set fits in memory. The cost of reading a piece of data that
+is not present in the operating system buffer cache will be dominated
+by the one or two disk seeks needed to fetch the data from disk.
+Write performance will be mostly unaffected by whether or not the
+working set fits in memory.
+
+ readrandom : 16.677 micros/op; (approximately 60,000 reads per second)
+ readseq : 0.476 micros/op; 232.3 MB/s
+ readreverse : 0.724 micros/op; 152.9 MB/s
+
+LevelDB compacts its underlying storage data in the background to
+improve read performance. The results listed above were done
+immediately after a lot of random writes. The results after
+compactions (which are usually triggered automatically) are better.
+
+ readrandom : 11.602 micros/op; (approximately 85,000 reads per second)
+ readseq : 0.423 micros/op; 261.8 MB/s
+ readreverse : 0.663 micros/op; 166.9 MB/s
+
+Some of the high cost of reads comes from repeated decompression of blocks
+read from disk. If we supply enough cache to the leveldb so it can hold the
+uncompressed blocks in memory, the read performance improves again:
+
+ readrandom : 9.775 micros/op; (approximately 100,000 reads per second before compaction)
+ readrandom : 5.215 micros/op; (approximately 190,000 reads per second after compaction)
+
+## Repository contents
+
+See doc/index.html for more explanation. See doc/impl.html for a brief overview of the implementation.
+
+The public interface is in include/*.h. Callers should not include or
+rely on the details of any other header files in this package. Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+* **include/db.h**: Main interface to the DB: Start here
+
+* **include/options.h**: Control over the behavior of an entire database,
+and also control over the behavior of individual reads and writes.
+
+* **include/comparator.h**: Abstraction for user-specified comparison function.
+If you want just bytewise comparison of keys, you can use the default
+comparator, but clients can write their own comparator implementations if they
+want custom ordering (e.g. to handle different character encodings, etc.)
+
+* **include/iterator.h**: Interface for iterating over data. You can get
+an iterator from a DB object.
+
+* **include/write_batch.h**: Interface for atomically applying multiple
+updates to a database.
+
+* **include/slice.h**: A simple module for maintaining a pointer and a
+length into some other byte array.
+
+* **include/status.h**: Status is returned from many of the public interfaces
+and is used to report success and various kinds of errors.
+
+* **include/env.h**:
+Abstraction of the OS environment. A posix implementation of this interface is
+in util/env_posix.cc
+
+* **include/table.h, include/table_builder.h**: Lower-level modules that most
+clients probably won't use directly
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
-# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
+# -DLEVELDB_ATOMIC_PRESENT if <atomic> is present
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
# -DSNAPPY if the Snappy library is present
#
fi
case "$TARGET_OS" in
+ CYGWIN_*)
+ PLATFORM=OS_LINUX
+ COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
+ PLATFORM_LDFLAGS="-lpthread"
+ PORT_FILE=port/port_posix.cc
+ ;;
Darwin)
PLATFORM=OS_MACOSX
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
# man ld: +h internal_name
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
;;
+ IOS)
+ PLATFORM=IOS
+ COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
+ [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
+ PORT_FILE=port/port_posix.cc
+ PLATFORM_SHARED_EXT=
+ PLATFORM_SHARED_LDFLAGS=
+ PLATFORM_SHARED_CFLAGS=
+ PLATFORM_SHARED_VERSIONED=
+ ;;
OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
PLATFORM=OS_WINDOWS
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
else
CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"
- # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
+ # If -std=c++0x works, use <atomic> as fallback for when memory barriers
+ # are not available.
$CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
- #include <cstdatomic>
+ #include <atomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
- COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
+ COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT"
PLATFORM_CXXFLAGS="-std=c++0x"
else
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
--- /dev/null
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+#include "db/db_impl.h"
+#include "leveldb/cache.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class AutoCompactTest {
+ public:
+ std::string dbname_;
+ Cache* tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ AutoCompactTest() {
+ dbname_ = test::TmpDir() + "/autocompact_test";
+ tiny_cache_ = NewLRUCache(100);
+ options_.block_cache = tiny_cache_;
+ DestroyDB(dbname_, options_);
+ options_.create_if_missing = true;
+ options_.compression = kNoCompression;
+ ASSERT_OK(DB::Open(options_, dbname_, &db_));
+ }
+
+ ~AutoCompactTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ delete tiny_cache_;
+ }
+
+ std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ void DoReads(int n);
+};
+
+static const int kValueSize = 200 * 1024;
+static const int kTotalSize = 100 * 1024 * 1024;
+static const int kCount = kTotalSize / kValueSize;
+
+// Read through the first n keys repeatedly and check that they get
+// compacted (verified by checking the size of the key space).
+void AutoCompactTest::DoReads(int n) {
+ std::string value(kValueSize, 'x');
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Fill database
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Delete everything
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Get initial measurement of the space we will be reading.
+ const int64_t initial_size = Size(Key(0), Key(n));
+ const int64_t initial_other_size = Size(Key(n), Key(kCount));
+
+ // Read until size drops significantly.
+ std::string limit_key = Key(n);
+ for (int read = 0; true; read++) {
+ ASSERT_LT(read, 100) << "Taking too long to compact";
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst();
+ iter->Valid() && iter->key().ToString() < limit_key;
+ iter->Next()) {
+ // Drop data
+ }
+ delete iter;
+ // Wait a little bit to allow any triggered compactions to complete.
+ Env::Default()->SleepForMicroseconds(1000000);
+ uint64_t size = Size(Key(0), Key(n));
+ fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
+ read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
+ if (size <= initial_size/10) {
+ break;
+ }
+ }
+
+ // Verify that the size of the key space not touched by the reads
+ // is pretty much unchanged.
+ const int64_t final_other_size = Size(Key(n), Key(kCount));
+ ASSERT_LE(final_other_size, initial_other_size + 1048576);
+ ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
+}
+
+TEST(AutoCompactTest, ReadAll) {
+ DoReads(kCount);
+}
+
+TEST(AutoCompactTest, ReadHalf) {
+ DoReads(kCount/2);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
CorruptionTest() {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
+ options_.block_cache = tiny_cache_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
delete tiny_cache_;
}
- Status TryReopen(Options* options = NULL) {
+ Status TryReopen() {
delete db_;
db_ = NULL;
- Options opt = (options ? *options : options_);
- opt.env = &env_;
- opt.block_cache = tiny_cache_;
- return DB::Open(opt, dbname_, &db_);
+ return DB::Open(options_, dbname_, &db_);
}
- void Reopen(Options* options = NULL) {
- ASSERT_OK(TryReopen(options));
+ void Reopen() {
+ ASSERT_OK(TryReopen());
}
void RepairDB() {
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
- ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ WriteOptions options;
+ // Corrupt() doesn't work without this sync on windows; stat reports 0 for
+ // the file size.
+ if (i == n - 1) {
+ options.sync = true;
+ }
+ ASSERT_OK(db_->Write(options, &batch));
}
}
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
+ if (in == "" || in == "~") {
+ // Ignore boundary keys.
+ continue;
+ }
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
FileType type;
std::string fname;
int picked_number = -1;
- for (int i = 0; i < filenames.size(); i++) {
+ for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type == filetype &&
int(number) > picked_number) { // Pick latest file
dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1);
- Check(99, 99);
+ Check(90, 99);
+}
+
+TEST(CorruptionTest, TableFileRepair) {
+ options_.block_size = 2 * kValueSize; // Limit scope of corruption
+ options_.paranoid_checks = true;
+ Reopen();
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, NULL, NULL);
+ dbi->TEST_CompactRange(1, NULL, NULL);
+
+ Corrupt(kTableFile, 100, 1);
+ RepairDB();
+ Reopen();
+ Check(95, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1);
- Check(9, 9);
+ Check(5, 9);
// Force compactions by writing lots of values
Build(10000);
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
- Options options;
- options.paranoid_checks = true;
- options.write_buffer_size = 1048576;
- Reopen(&options);
+ options_.paranoid_checks = true;
+ options_.write_buffer_size = 512 << 10;
+ Reopen();
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
- // Fill levels >= 1 so memtable compaction outputs to level 1
- for (int level = 1; level < config::kNumLevels; level++) {
- dbi->Put(WriteOptions(), "", "begin");
- dbi->Put(WriteOptions(), "~", "end");
+ // Make multiple inputs so we need to compact.
+ for (int i = 0; i < 2; i++) {
+ Build(10);
dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+ env_.SleepForMicroseconds(100000);
}
+ dbi->CompactRange(NULL, NULL);
- Build(10);
- dbi->TEST_CompactMemTable();
- ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
-
- Corrupt(kTableFile, 100, 1);
- Check(9, 9);
-
- // Write must eventually fail because of corrupted table
- Status s;
+ // Write must fail because of corrupted table
std::string tmp1, tmp2;
- for (int i = 0; i < 10000 && s.ok(); i++) {
- s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
- }
+ Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
pos_ = 0;
}
- Slice Generate(int len) {
+ Slice Generate(size_t len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
};
static Slice TrimSpace(Slice s) {
- int start = 0;
+ size_t start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
- int limit = s.size();
+ size_t limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
heap_counter_(0) {
std::vector<std::string> files;
Env::Default()->GetChildren(FLAGS_db, &files);
- for (int i = 0; i < files.size(); i++) {
+ for (size_t i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
}
benchmarks = sep + 1;
}
- // Reset parameters that may be overriddden bwlow
+ // Reset parameters that may be overridden below
num_ = FLAGS_num;
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
value_size_ = FLAGS_value_size;
void SeekRandom(ThreadState* thread) {
ReadOptions options;
- std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
Iterator* iter = db_->NewIterator(options);
return result;
}
-DBImpl::DBImpl(const Options& options, const std::string& dbname)
- : env_(options.env),
- internal_comparator_(options.comparator),
- internal_filter_policy_(options.filter_policy),
- options_(SanitizeOptions(
- dbname, &internal_comparator_, &internal_filter_policy_, options)),
- owns_info_log_(options_.info_log != options.info_log),
- owns_cache_(options_.block_cache != options.block_cache),
+DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
+ : env_(raw_options.env),
+ internal_comparator_(raw_options.comparator),
+ internal_filter_policy_(raw_options.filter_policy),
+ options_(SanitizeOptions(dbname, &internal_comparator_,
+ &internal_filter_policy_, raw_options)),
+ owns_info_log_(options_.info_log != raw_options.info_log),
+ owns_cache_(options_.block_cache != raw_options.block_cache),
dbname_(dbname),
db_lock_(NULL),
shutting_down_(NULL),
logfile_(NULL),
logfile_number_(0),
log_(NULL),
+ seed_(0),
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
- manual_compaction_(NULL),
- consecutive_compaction_errors_(0) {
+ manual_compaction_(NULL) {
mem_->Ref();
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
- const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
+ const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
versions_ = new VersionSet(dbname_, &options_, table_cache_,
}
void DBImpl::DeleteObsoleteFiles() {
+ if (!bg_error_.ok()) {
+ // After a background error, we don't know whether a new version may
+ // or may not have been committed, so we cannot safely garbage collect.
+ return;
+ }
+
// Make a set of all of the live files
std::set<uint64_t> live = pending_outputs_;
versions_->AddLiveFiles(&live);
reporter.info_log = options_.info_log;
reporter.fname = fname.c_str();
reporter.status = (options_.paranoid_checks ? &status : NULL);
- // We intentially make log::Reader do checksumming even if
+ // We intentionally make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
return s;
}
-Status DBImpl::CompactMemTable() {
+void DBImpl::CompactMemTable() {
mutex_.AssertHeld();
assert(imm_ != NULL);
imm_ = NULL;
has_imm_.Release_Store(NULL);
DeleteObsoleteFiles();
+ } else {
+ RecordBackgroundError(s);
}
-
- return s;
}
void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
}
MutexLock l(&mutex_);
- while (!manual.done) {
- while (manual_compaction_ != NULL) {
- bg_cv_.Wait();
- }
- manual_compaction_ = &manual;
- MaybeScheduleCompaction();
- while (manual_compaction_ == &manual) {
+ while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
+ if (manual_compaction_ == NULL) { // Idle
+ manual_compaction_ = &manual;
+ MaybeScheduleCompaction();
+ } else { // Running either my compaction or another compaction.
bg_cv_.Wait();
}
}
+ if (manual_compaction_ == &manual) {
+ // Cancel my manual compaction since we aborted early for some reason.
+ manual_compaction_ = NULL;
+ }
}
Status DBImpl::TEST_CompactMemTable() {
return s;
}
+void DBImpl::RecordBackgroundError(const Status& s) {
+ mutex_.AssertHeld();
+ if (bg_error_.ok()) {
+ bg_error_ = s;
+ bg_cv_.SignalAll();
+ }
+}
+
void DBImpl::MaybeScheduleCompaction() {
mutex_.AssertHeld();
if (bg_compaction_scheduled_) {
// Already scheduled
} else if (shutting_down_.Acquire_Load()) {
// DB is being deleted; no more background compactions
+ } else if (!bg_error_.ok()) {
+ // Already got an error; no more changes
} else if (imm_ == NULL &&
manual_compaction_ == NULL &&
!versions_->NeedsCompaction()) {
void DBImpl::BackgroundCall() {
MutexLock l(&mutex_);
assert(bg_compaction_scheduled_);
- if (!shutting_down_.Acquire_Load()) {
- Status s = BackgroundCompaction();
- if (s.ok()) {
- // Success
- consecutive_compaction_errors_ = 0;
- } else if (shutting_down_.Acquire_Load()) {
- // Error most likely due to shutdown; do not wait
- } else {
- // Wait a little bit before retrying background compaction in
- // case this is an environmental problem and we do not want to
- // chew up resources for failed compactions for the duration of
- // the problem.
- bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
- Log(options_.info_log, "Waiting after background compaction error: %s",
- s.ToString().c_str());
- mutex_.Unlock();
- ++consecutive_compaction_errors_;
- int seconds_to_sleep = 1;
- for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) {
- seconds_to_sleep *= 2;
- }
- env_->SleepForMicroseconds(seconds_to_sleep * 1000000);
- mutex_.Lock();
- }
+ if (shutting_down_.Acquire_Load()) {
+ // No more background work when shutting down.
+ } else if (!bg_error_.ok()) {
+ // No more background work after a background error.
+ } else {
+ BackgroundCompaction();
}
bg_compaction_scheduled_ = false;
bg_cv_.SignalAll();
}
-Status DBImpl::BackgroundCompaction() {
+void DBImpl::BackgroundCompaction() {
mutex_.AssertHeld();
if (imm_ != NULL) {
- return CompactMemTable();
+ CompactMemTable();
+ return;
}
Compaction* c;
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest);
status = versions_->LogAndApply(c->edit(), &mutex_);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
static_cast<unsigned long long>(f->number),
} else {
CompactionState* compact = new CompactionState(c);
status = DoCompactionWork(compact);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
CleanupCompaction(compact);
c->ReleaseInputs();
DeleteObsoleteFiles();
} else {
Log(options_.info_log,
"Compaction error: %s", status.ToString().c_str());
- if (options_.paranoid_checks && bg_error_.ok()) {
- bg_error_ = status;
- }
}
if (is_manual) {
}
manual_compaction_ = NULL;
}
- return status;
}
void DBImpl::CleanupCompaction(CompactionState* compact) {
if (status.ok()) {
status = InstallCompactionResults(compact);
}
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log,
"compacted to: %s", versions_->LevelSummary(&tmp));
} // namespace
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
- SequenceNumber* latest_snapshot) {
+ SequenceNumber* latest_snapshot,
+ uint32_t* seed) {
IterState* cleanup = new IterState;
mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
cleanup->version = versions_->current();
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
+ *seed = ++seed_;
mutex_.Unlock();
return internal_iter;
}
Iterator* DBImpl::TEST_NewInternalIterator() {
SequenceNumber ignored;
- return NewInternalIterator(ReadOptions(), &ignored);
+ uint32_t ignored_seed;
+ return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot;
- Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+ uint32_t seed;
+ Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
return NewDBIterator(
- &dbname_, env_, user_comparator(), internal_iter,
+ this, user_comparator(), iter,
(options.snapshot != NULL
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
- : latest_snapshot));
+ : latest_snapshot),
+ seed);
+}
+
+void DBImpl::RecordReadSample(Slice key) {
+ MutexLock l(&mutex_);
+ if (versions_->current()->RecordReadSample(key)) {
+ MaybeScheduleCompaction();
+ }
}
const Snapshot* DBImpl::GetSnapshot() {
{
mutex_.Unlock();
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+ bool sync_error = false;
if (status.ok() && options.sync) {
status = logfile_->Sync();
+ if (!status.ok()) {
+ sync_error = true;
+ }
}
if (status.ok()) {
status = WriteBatchInternal::InsertInto(updates, mem_);
}
mutex_.Lock();
+ if (sync_error) {
+ // The state of the log file is indeterminate: the log record we
+ // just added may or may not show up when the DB is re-opened.
+ // So we force the DB into a mode where all future writes fail.
+ RecordBackgroundError(status);
+ }
}
if (updates == tmp_batch_) tmp_batch_->Clear();
break;
}
- // Append to *reuslt
+ // Append to *result
if (result == first->batch) {
// Switch to temporary batch instead of disturbing caller's batch
result = tmp_batch_;
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes();
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes.
+ void RecordReadSample(Slice key);
+
private:
friend class DB;
struct CompactionState;
struct Writer;
Iterator* NewInternalIterator(const ReadOptions&,
- SequenceNumber* latest_snapshot);
+ SequenceNumber* latest_snapshot,
+ uint32_t* seed);
Status NewDB();
// Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
- Status CompactMemTable()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Errors are recorded in bg_error_.
+ void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
WriteBatch* BuildBatchGroup(Writer** last_writer);
+ void RecordBackgroundError(const Status& s);
+
void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
static void BGWork(void* db);
void BackgroundCall();
- Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void CleanupCompaction(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status DoCompactionWork(CompactionState* compact)
WritableFile* logfile_;
uint64_t logfile_number_;
log::Writer* log_;
+ uint32_t seed_; // For sampling.
// Queue of writers.
std::deque<Writer*> writers_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
- int consecutive_compaction_errors_;
// Per level compaction stats. stats_[level] stores the stats for
// compactions that produced data for the specified "level".
#include "db/db_iter.h"
#include "db/filename.h"
+#include "db/db_impl.h"
#include "db/dbformat.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
+#include "util/random.h"
namespace leveldb {
kReverse
};
- DBIter(const std::string* dbname, Env* env,
- const Comparator* cmp, Iterator* iter, SequenceNumber s)
- : dbname_(dbname),
- env_(env),
+ DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
+ uint32_t seed)
+ : db_(db),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
direction_(kForward),
- valid_(false) {
+ valid_(false),
+ rnd_(seed),
+ bytes_counter_(RandomPeriod()) {
}
virtual ~DBIter() {
delete iter_;
}
}
- const std::string* const dbname_;
- Env* const env_;
+ // Pick next gap with average value of config::kReadBytesPeriod.
+ ssize_t RandomPeriod() {
+ return rnd_.Uniform(2*config::kReadBytesPeriod);
+ }
+
+ DBImpl* db_;
const Comparator* const user_comparator_;
Iterator* const iter_;
SequenceNumber const sequence_;
Direction direction_;
bool valid_;
+ Random rnd_;
+ ssize_t bytes_counter_;
+
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
- if (!ParseInternalKey(iter_->key(), ikey)) {
+ Slice k = iter_->key();
+ ssize_t n = k.size() + iter_->value().size();
+ bytes_counter_ -= n;
+ while (bytes_counter_ < 0) {
+ bytes_counter_ += RandomPeriod();
+ db_->RecordReadSample(k);
+ }
+ if (!ParseInternalKey(k, ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {
saved_key_.clear();
return;
}
+ // saved_key_ already contains the key to skip past.
+ } else {
+ // Store in saved_key_ the current key so we skip it below.
+ SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
}
- // Temporarily use saved_key_ as storage for key to skip.
- std::string* skip = &saved_key_;
- SaveKey(ExtractUserKey(iter_->key()), skip);
- FindNextUserEntry(true, skip);
+ FindNextUserEntry(true, &saved_key_);
}
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
} // anonymous namespace
Iterator* NewDBIterator(
- const std::string* dbname,
- Env* env,
+ DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
- const SequenceNumber& sequence) {
- return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+ SequenceNumber sequence,
+ uint32_t seed) {
+ return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
}
} // namespace leveldb
namespace leveldb {
+class DBImpl;
+
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
- const std::string* dbname,
- Env* env,
+ DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
- const SequenceNumber& sequence);
+ SequenceNumber sequence,
+ uint32_t seed);
} // namespace leveldb
// Special Env used to delay background operations
class SpecialEnv : public EnvWrapper {
public:
- // sstable Sync() calls are blocked while this pointer is non-NULL.
- port::AtomicPointer delay_sstable_sync_;
+ // sstable/log Sync() calls are blocked while this pointer is non-NULL.
+ port::AtomicPointer delay_data_sync_;
+
+ // sstable/log Sync() calls return an error.
+ port::AtomicPointer data_sync_error_;
// Simulate no-space errors while this pointer is non-NULL.
port::AtomicPointer no_space_;
bool count_random_reads_;
AtomicCounter random_read_counter_;
- AtomicCounter sleep_counter_;
- AtomicCounter sleep_time_counter_;
-
explicit SpecialEnv(Env* base) : EnvWrapper(base) {
- delay_sstable_sync_.Release_Store(NULL);
+ delay_data_sync_.Release_Store(NULL);
+ data_sync_error_.Release_Store(NULL);
no_space_.Release_Store(NULL);
non_writable_.Release_Store(NULL);
count_random_reads_ = false;
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
- class SSTableFile : public WritableFile {
+ class DataFile : public WritableFile {
private:
SpecialEnv* env_;
WritableFile* base_;
public:
- SSTableFile(SpecialEnv* env, WritableFile* base)
+ DataFile(SpecialEnv* env, WritableFile* base)
: env_(env),
base_(base) {
}
- ~SSTableFile() { delete base_; }
+ ~DataFile() { delete base_; }
Status Append(const Slice& data) {
if (env_->no_space_.Acquire_Load() != NULL) {
// Drop writes on the floor
Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); }
Status Sync() {
- while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
+ if (env_->data_sync_error_.Acquire_Load() != NULL) {
+ return Status::IOError("simulated data sync error");
+ }
+ while (env_->delay_data_sync_.Acquire_Load() != NULL) {
DelayMilliseconds(100);
}
return base_->Sync();
Status s = target()->NewWritableFile(f, r);
if (s.ok()) {
- if (strstr(f.c_str(), ".sst") != NULL) {
- *r = new SSTableFile(this, *r);
+ if (strstr(f.c_str(), ".ldb") != NULL ||
+ strstr(f.c_str(), ".log") != NULL) {
+ *r = new DataFile(this, *r);
} else if (strstr(f.c_str(), "MANIFEST") != NULL) {
*r = new ManifestFile(this, *r);
}
}
return s;
}
-
- virtual void SleepForMicroseconds(int micros) {
- sleep_counter_.Increment();
- sleep_time_counter_.IncrementBy(micros);
- }
-
};
class DBTest {
}
// Check reverse iteration results are the reverse of forward results
- int matched = 0;
+ size_t matched = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_LT(matched, forward.size());
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
}
return false;
}
+
+ // Returns number of files renamed.
+ int RenameLDBToSST() {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ int files_renamed = 0;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+ const std::string from = TableFileName(dbname_, number);
+ const std::string to = SSTTableFileName(dbname_, number);
+ ASSERT_OK(env_->RenameFile(from, to));
+ files_renamed++;
+ }
+ }
+ return files_renamed;
+ }
};
TEST(DBTest, Empty) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
- env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
+ env_->delay_data_sync_.Release_Store(env_); // Block sync calls
Put("k1", std::string(100000, 'x')); // Fill memtable
Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo"));
- env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls
+ env_->delay_data_sync_.Release_Store(NULL); // Release sync calls
} while (ChangeOptions());
}
// * sstable B in level 2
// Then do enough Get() calls to arrange for an automatic compaction
// of sstable A. A bug would cause the compaction to be marked as
- // occuring at level 1 (instead of the correct level 0).
+ // occurring at level 1 (instead of the correct level 0).
// Step 1: First place sstables in levels 0 and 2
int compaction_count = 0;
Compact("a", "z");
const int num_files = CountFiles();
env_->no_space_.Release_Store(env_); // Force out-of-space errors
- env_->sleep_counter_.Reset();
- for (int i = 0; i < 5; i++) {
+ for (int i = 0; i < 10; i++) {
for (int level = 0; level < config::kNumLevels-1; level++) {
dbfull()->TEST_CompactRange(level, NULL, NULL);
}
}
env_->no_space_.Release_Store(NULL);
ASSERT_LT(CountFiles(), num_files + 3);
-
- // Check that compaction attempts slept after errors
- ASSERT_GE(env_->sleep_counter_.Read(), 5);
-}
-
-TEST(DBTest, ExponentialBackoff) {
- Options options = CurrentOptions();
- options.env = env_;
- Reopen(&options);
-
- ASSERT_OK(Put("foo", "v1"));
- ASSERT_EQ("v1", Get("foo"));
- Compact("a", "z");
- env_->non_writable_.Release_Store(env_); // Force errors for new files
- env_->sleep_counter_.Reset();
- env_->sleep_time_counter_.Reset();
- for (int i = 0; i < 5; i++) {
- dbfull()->TEST_CompactRange(2, NULL, NULL);
- }
- env_->non_writable_.Release_Store(NULL);
-
- // Wait for compaction to finish
- DelayMilliseconds(1000);
-
- ASSERT_GE(env_->sleep_counter_.Read(), 5);
- ASSERT_LT(env_->sleep_counter_.Read(), 10);
- ASSERT_GE(env_->sleep_time_counter_.Read(), 10e6);
}
TEST(DBTest, NonWritableFileSystem) {
env_->non_writable_.Release_Store(NULL);
}
+TEST(DBTest, WriteSyncError) {
+ // Check that log sync errors cause the DB to disallow future writes.
+
+ // (a) Cause log sync calls to fail
+ Options options = CurrentOptions();
+ options.env = env_;
+ Reopen(&options);
+ env_->data_sync_error_.Release_Store(env_);
+
+ // (b) Normal write should succeed
+ WriteOptions w;
+ ASSERT_OK(db_->Put(w, "k1", "v1"));
+ ASSERT_EQ("v1", Get("k1"));
+
+ // (c) Do a sync write; should fail
+ w.sync = true;
+ ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+
+ // (d) make sync behave normally
+ env_->data_sync_error_.Release_Store(NULL);
+
+ // (e) Do a non-sync write; should fail
+ w.sync = false;
+ ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+ ASSERT_EQ("NOT_FOUND", Get("k3"));
+}
+
TEST(DBTest, ManifestWriteError) {
// Test for the following problem:
// (a) Compaction produces file F
<< s.ToString();
}
+TEST(DBTest, StillReadSST) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Dump the memtable to disk.
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+ Close();
+ ASSERT_GT(RenameLDBToSST(), 0);
+ Options options = CurrentOptions();
+ options.paranoid_checks = true;
+ Status s = TryReopen(&options);
+ ASSERT_TRUE(s.ok());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");
dbfull()->TEST_CompactMemTable();
// Prevent auto compactions triggered by seeks
- env_->delay_sstable_sync_.Release_Store(env_);
+ env_->delay_data_sync_.Release_Store(env_);
// Lookup present keys. Should rarely read from small sstable.
env_->random_read_counter_.Reset();
fprintf(stderr, "%d missing => %d reads\n", N, reads);
ASSERT_LE(reads, 3*N/100);
- env_->delay_sstable_sync_.Release_Store(NULL);
+ env_->delay_data_sync_.Release_Store(NULL);
Close();
delete options.block_cache;
delete options.filter_policy;
ASSERT_EQ(k, key);
ASSERT_GE(w, 0);
ASSERT_LT(w, kNumThreads);
- ASSERT_LE(c, reinterpret_cast<uintptr_t>(
+ ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
t->state->counter[w].Acquire_Load()));
}
}
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
-#define STORAGE_LEVELDB_DB_FORMAT_H_
+#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
+#define STORAGE_LEVELDB_DB_DBFORMAT_H_
#include <stdio.h>
#include "leveldb/comparator.h"
// space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2;
+// Approximate gap in bytes between samples of data read during iteration.
+static const int kReadBytesPeriod = 1048576;
+
} // namespace config
class InternalKey;
} // namespace leveldb
-#endif // STORAGE_LEVELDB_DB_FORMAT_H_
+#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_
--- /dev/null
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+bool GuessType(const std::string& fname, FileType* type) {
+ size_t pos = fname.rfind('/');
+ std::string basename;
+ if (pos == std::string::npos) {
+ basename = fname;
+ } else {
+ basename = std::string(fname.data() + pos + 1, fname.size() - pos - 1);
+ }
+ uint64_t ignored;
+ return ParseFileName(basename, &ignored, type);
+}
+
+// Notified when log reader encounters corruption.
+class CorruptionReporter : public log::Reader::Reporter {
+ public:
+ WritableFile* dst_;
+ virtual void Corruption(size_t bytes, const Status& status) {
+ std::string r = "corruption: ";
+ AppendNumberTo(&r, bytes);
+ r += " bytes; ";
+ r += status.ToString();
+ r.push_back('\n');
+ dst_->Append(r);
+ }
+};
+
+// Print contents of a log file. (*func)() is called on every record.
+Status PrintLogContents(Env* env, const std::string& fname,
+ void (*func)(uint64_t, Slice, WritableFile*),
+ WritableFile* dst) {
+ SequentialFile* file;
+ Status s = env->NewSequentialFile(fname, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ CorruptionReporter reporter;
+ reporter.dst_ = dst;
+ log::Reader reader(file, &reporter, true, 0);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch)) {
+ (*func)(reader.LastRecordOffset(), record, dst);
+ }
+ delete file;
+ return Status::OK();
+}
+
+// Called on every item found in a WriteBatch.
+class WriteBatchItemPrinter : public WriteBatch::Handler {
+ public:
+ WritableFile* dst_;
+ virtual void Put(const Slice& key, const Slice& value) {
+ std::string r = " put '";
+ AppendEscapedStringTo(&r, key);
+ r += "' '";
+ AppendEscapedStringTo(&r, value);
+ r += "'\n";
+ dst_->Append(r);
+ }
+ virtual void Delete(const Slice& key) {
+ std::string r = " del '";
+ AppendEscapedStringTo(&r, key);
+ r += "'\n";
+ dst_->Append(r);
+ }
+};
+
+
+// Called on every log record (each one of which is a WriteBatch)
+// found in a kLogFile.
+static void WriteBatchPrinter(uint64_t pos, Slice record, WritableFile* dst) {
+ std::string r = "--- offset ";
+ AppendNumberTo(&r, pos);
+ r += "; ";
+ if (record.size() < 12) {
+ r += "log record length ";
+ AppendNumberTo(&r, record.size());
+ r += " is too small\n";
+ dst->Append(r);
+ return;
+ }
+ WriteBatch batch;
+ WriteBatchInternal::SetContents(&batch, record);
+ r += "sequence ";
+ AppendNumberTo(&r, WriteBatchInternal::Sequence(&batch));
+ r.push_back('\n');
+ dst->Append(r);
+ WriteBatchItemPrinter batch_item_printer;
+ batch_item_printer.dst_ = dst;
+ Status s = batch.Iterate(&batch_item_printer);
+ if (!s.ok()) {
+ dst->Append(" error: " + s.ToString() + "\n");
+ }
+}
+
+Status DumpLog(Env* env, const std::string& fname, WritableFile* dst) {
+ return PrintLogContents(env, fname, WriteBatchPrinter, dst);
+}
+
+// Called on every log record (each one of which is a WriteBatch)
+// found in a kDescriptorFile.
+static void VersionEditPrinter(uint64_t pos, Slice record, WritableFile* dst) {
+ std::string r = "--- offset ";
+ AppendNumberTo(&r, pos);
+ r += "; ";
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ r += s.ToString();
+ r.push_back('\n');
+ } else {
+ r += edit.DebugString();
+ }
+ dst->Append(r);
+}
+
+Status DumpDescriptor(Env* env, const std::string& fname, WritableFile* dst) {
+ return PrintLogContents(env, fname, VersionEditPrinter, dst);
+}
+
+Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) {
+ uint64_t file_size;
+ RandomAccessFile* file = NULL;
+ Table* table = NULL;
+ Status s = env->GetFileSize(fname, &file_size);
+ if (s.ok()) {
+ s = env->NewRandomAccessFile(fname, &file);
+ }
+ if (s.ok()) {
+ // We use the default comparator, which may or may not match the
+ // comparator used in this database. However this should not cause
+ // problems since we only use Table operations that do not require
+ // any comparisons. In particular, we do not call Seek or Prev.
+ s = Table::Open(Options(), file, file_size, &table);
+ }
+ if (!s.ok()) {
+ delete table;
+ delete file;
+ return s;
+ }
+
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = table->NewIterator(ro);
+ std::string r;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ r.clear();
+ ParsedInternalKey key;
+ if (!ParseInternalKey(iter->key(), &key)) {
+ r = "badkey '";
+ AppendEscapedStringTo(&r, iter->key());
+ r += "' => '";
+ AppendEscapedStringTo(&r, iter->value());
+ r += "'\n";
+ dst->Append(r);
+ } else {
+ r = "'";
+ AppendEscapedStringTo(&r, key.user_key);
+ r += "' @ ";
+ AppendNumberTo(&r, key.sequence);
+ r += " : ";
+ if (key.type == kTypeDeletion) {
+ r += "del";
+ } else if (key.type == kTypeValue) {
+ r += "val";
+ } else {
+ AppendNumberTo(&r, key.type);
+ }
+ r += " => '";
+ AppendEscapedStringTo(&r, iter->value());
+ r += "'\n";
+ dst->Append(r);
+ }
+ }
+ s = iter->status();
+ if (!s.ok()) {
+ dst->Append("iterator error: " + s.ToString() + "\n");
+ }
+
+ delete iter;
+ delete table;
+ delete file;
+ return Status::OK();
+}
+
+} // namespace
+
+Status DumpFile(Env* env, const std::string& fname, WritableFile* dst) {
+ FileType ftype;
+ if (!GuessType(fname, &ftype)) {
+ return Status::InvalidArgument(fname + ": unknown file type");
+ }
+ switch (ftype) {
+ case kLogFile: return DumpLog(env, fname, dst);
+ case kDescriptorFile: return DumpDescriptor(env, fname, dst);
+ case kTableFile: return DumpTable(env, fname, dst);
+ default:
+ break;
+ }
+ return Status::InvalidArgument(fname + ": not a dump-able file type");
+}
+
+} // namespace leveldb
assert(number > 0);
return MakeFileName(name, number, "log");
}
-
+// TableFileName returns the filenames we usually write to, while
+// SSTTableFileName returns the alternative filenames we also try to read from
+// for backward compatibility. For now, swap them around.
+// TODO: when compatibility is no longer necessary, swap them back
+// (TableFileName to use "ldb" and SSTTableFileName to use "sst").
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
+std::string SSTTableFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "ldb");
+}
+
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
// dbname/LOG
// dbname/LOG.old
// dbname/MANIFEST-[0-9]+
-// dbname/[0-9]+.(log|sst)
+// dbname/[0-9]+.(log|sst|ldb)
bool ParseFileName(const std::string& fname,
uint64_t* number,
FileType* type) {
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
- } else if (suffix == Slice(".sst")) {
+ } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
+// Return the legacy file name for an sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
+
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
+ { "0.ldb", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
-#include "db/dbformat.h"
-#include "db/filename.h"
-#include "db/log_reader.h"
-#include "db/version_edit.h"
-#include "db/write_batch_internal.h"
+#include "leveldb/dumpfile.h"
#include "leveldb/env.h"
-#include "leveldb/iterator.h"
-#include "leveldb/options.h"
#include "leveldb/status.h"
-#include "leveldb/table.h"
-#include "leveldb/write_batch.h"
-#include "util/logging.h"
namespace leveldb {
-
namespace {
-bool GuessType(const std::string& fname, FileType* type) {
- size_t pos = fname.rfind('/');
- std::string basename;
- if (pos == std::string::npos) {
- basename = fname;
- } else {
- basename = std::string(fname.data() + pos + 1, fname.size() - pos - 1);
- }
- uint64_t ignored;
- return ParseFileName(basename, &ignored, type);
-}
-
-// Notified when log reader encounters corruption.
-class CorruptionReporter : public log::Reader::Reporter {
- public:
- virtual void Corruption(size_t bytes, const Status& status) {
- printf("corruption: %d bytes; %s\n",
- static_cast<int>(bytes),
- status.ToString().c_str());
- }
-};
-
-// Print contents of a log file. (*func)() is called on every record.
-bool PrintLogContents(Env* env, const std::string& fname,
- void (*func)(Slice)) {
- SequentialFile* file;
- Status s = env->NewSequentialFile(fname, &file);
- if (!s.ok()) {
- fprintf(stderr, "%s\n", s.ToString().c_str());
- return false;
- }
- CorruptionReporter reporter;
- log::Reader reader(file, &reporter, true, 0);
- Slice record;
- std::string scratch;
- while (reader.ReadRecord(&record, &scratch)) {
- printf("--- offset %llu; ",
- static_cast<unsigned long long>(reader.LastRecordOffset()));
- (*func)(record);
- }
- delete file;
- return true;
-}
-
-// Called on every item found in a WriteBatch.
-class WriteBatchItemPrinter : public WriteBatch::Handler {
+class StdoutPrinter : public WritableFile {
public:
- uint64_t offset_;
- uint64_t sequence_;
-
- virtual void Put(const Slice& key, const Slice& value) {
- printf(" put '%s' '%s'\n",
- EscapeString(key).c_str(),
- EscapeString(value).c_str());
- }
- virtual void Delete(const Slice& key) {
- printf(" del '%s'\n",
- EscapeString(key).c_str());
+ virtual Status Append(const Slice& data) {
+ fwrite(data.data(), 1, data.size(), stdout);
+ return Status::OK();
}
+ virtual Status Close() { return Status::OK(); }
+ virtual Status Flush() { return Status::OK(); }
+ virtual Status Sync() { return Status::OK(); }
};
-
-// Called on every log record (each one of which is a WriteBatch)
-// found in a kLogFile.
-static void WriteBatchPrinter(Slice record) {
- if (record.size() < 12) {
- printf("log record length %d is too small\n",
- static_cast<int>(record.size()));
- return;
- }
- WriteBatch batch;
- WriteBatchInternal::SetContents(&batch, record);
- printf("sequence %llu\n",
- static_cast<unsigned long long>(WriteBatchInternal::Sequence(&batch)));
- WriteBatchItemPrinter batch_item_printer;
- Status s = batch.Iterate(&batch_item_printer);
- if (!s.ok()) {
- printf(" error: %s\n", s.ToString().c_str());
- }
-}
-
-bool DumpLog(Env* env, const std::string& fname) {
- return PrintLogContents(env, fname, WriteBatchPrinter);
-}
-
-// Called on every log record (each one of which is a WriteBatch)
-// found in a kDescriptorFile.
-static void VersionEditPrinter(Slice record) {
- VersionEdit edit;
- Status s = edit.DecodeFrom(record);
- if (!s.ok()) {
- printf("%s\n", s.ToString().c_str());
- return;
- }
- printf("%s", edit.DebugString().c_str());
-}
-
-bool DumpDescriptor(Env* env, const std::string& fname) {
- return PrintLogContents(env, fname, VersionEditPrinter);
-}
-
-bool DumpTable(Env* env, const std::string& fname) {
- uint64_t file_size;
- RandomAccessFile* file = NULL;
- Table* table = NULL;
- Status s = env->GetFileSize(fname, &file_size);
- if (s.ok()) {
- s = env->NewRandomAccessFile(fname, &file);
- }
- if (s.ok()) {
- // We use the default comparator, which may or may not match the
- // comparator used in this database. However this should not cause
- // problems since we only use Table operations that do not require
- // any comparisons. In particular, we do not call Seek or Prev.
- s = Table::Open(Options(), file, file_size, &table);
- }
- if (!s.ok()) {
- fprintf(stderr, "%s\n", s.ToString().c_str());
- delete table;
- delete file;
- return false;
- }
-
- ReadOptions ro;
- ro.fill_cache = false;
- Iterator* iter = table->NewIterator(ro);
- for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
- ParsedInternalKey key;
- if (!ParseInternalKey(iter->key(), &key)) {
- printf("badkey '%s' => '%s'\n",
- EscapeString(iter->key()).c_str(),
- EscapeString(iter->value()).c_str());
- } else {
- char kbuf[20];
- const char* type;
- if (key.type == kTypeDeletion) {
- type = "del";
- } else if (key.type == kTypeValue) {
- type = "val";
- } else {
- snprintf(kbuf, sizeof(kbuf), "%d", static_cast<int>(key.type));
- type = kbuf;
- }
- printf("'%s' @ %8llu : %s => '%s'\n",
- EscapeString(key.user_key).c_str(),
- static_cast<unsigned long long>(key.sequence),
- type,
- EscapeString(iter->value()).c_str());
- }
- }
- s = iter->status();
- if (!s.ok()) {
- printf("iterator error: %s\n", s.ToString().c_str());
- }
-
- delete iter;
- delete table;
- delete file;
- return true;
-}
-
-bool DumpFile(Env* env, const std::string& fname) {
- FileType ftype;
- if (!GuessType(fname, &ftype)) {
- fprintf(stderr, "%s: unknown file type\n", fname.c_str());
- return false;
- }
- switch (ftype) {
- case kLogFile: return DumpLog(env, fname);
- case kDescriptorFile: return DumpDescriptor(env, fname);
- case kTableFile: return DumpTable(env, fname);
-
- default: {
- fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str());
- break;
- }
- }
- return false;
-}
-
bool HandleDumpCommand(Env* env, char** files, int num) {
+ StdoutPrinter printer;
bool ok = true;
for (int i = 0; i < num; i++) {
- ok &= DumpFile(env, files[i]);
+ Status s = DumpFile(env, files[i], &printer);
+ if (!s.ok()) {
+ fprintf(stderr, "%s\n", s.ToString().c_str());
+ ok = false;
+ }
}
return ok;
}
-}
+} // namespace
} // namespace leveldb
static void Usage() {
static const int kBlockSize = 32768;
-// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
-static const int kHeaderSize = 4 + 1 + 2;
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
+static const int kHeaderSize = 4 + 2 + 1;
} // namespace log
} // namespace leveldb
case kEof:
if (in_fragmented_record) {
- ReportCorruption(scratch->size(), "partial record without end(3)");
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
scratch->clear();
}
return false;
return last_record_offset_;
}
-void Reader::ReportCorruption(size_t bytes, const char* reason) {
+void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
-void Reader::ReportDrop(size_t bytes, const Status& reason) {
+void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
- reporter_->Corruption(bytes, reason);
+ reporter_->Corruption(static_cast<size_t>(bytes), reason);
}
}
eof_ = true;
}
continue;
- } else if (buffer_.size() == 0) {
- // End of file
- return kEof;
} else {
- size_t drop_size = buffer_.size();
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Instead of considering this an error,
+ // just report EOF.
buffer_.clear();
- ReportCorruption(drop_size, "truncated record at end of file");
return kEof;
}
}
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
- ReportCorruption(drop_size, "bad record length");
- return kBadRecord;
+ if (!eof_) {
+ ReportCorruption(drop_size, "bad record length");
+ return kBadRecord;
+ }
+ // If the end of the file has been reached without reading |length| bytes
+ // of payload, assume the writer died in the middle of writing the record.
+ // Don't report a corruption.
+ return kEof;
}
if (type == kZeroType && length == 0) {
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
- void ReportCorruption(size_t bytes, const char* reason);
- void ReportDrop(size_t bytes, const Status& reason);
+ void ReportCorruption(uint64_t bytes, const char* reason);
+ void ReportDrop(uint64_t bytes, const Status& reason);
// No copying allowed
Reader(const Reader&);
ASSERT_EQ("OK", MatchError("unknown record type"));
}
-TEST(LogTest, TruncatedTrailingRecord) {
+TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
- ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
- ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+ // Truncated last record is ignored, not treated as an error.
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, BadLength) {
+ const int kPayloadSize = kBlockSize - kHeaderSize;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, BadLengthAtEndIsIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
- ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
- ASSERT_EQ("OK", MatchError("bad record length"));
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ChecksumMismatch) {
ASSERT_EQ("OK", MatchError("partial record without end"));
}
+TEST(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+}
+
+TEST(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+}
+
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
- const int dropped = DroppedBytes();
+ const size_t dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
- // We intentially make log::Reader do checksumming so that
+ // We intentionally make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
}
void ExtractMetaData() {
- std::vector<TableInfo> kept;
for (size_t i = 0; i < table_numbers_.size(); i++) {
- TableInfo t;
- t.meta.number = table_numbers_[i];
- Status status = ScanTable(&t);
- if (!status.ok()) {
- std::string fname = TableFileName(dbname_, table_numbers_[i]);
- Log(options_.info_log, "Table #%llu: ignoring %s",
- (unsigned long long) table_numbers_[i],
- status.ToString().c_str());
- ArchiveFile(fname);
- } else {
- tables_.push_back(t);
- }
+ ScanTable(table_numbers_[i]);
}
}
- Status ScanTable(TableInfo* t) {
- std::string fname = TableFileName(dbname_, t->meta.number);
+ Iterator* NewTableIterator(const FileMetaData& meta) {
+ // Same as compaction iterators: if paranoid_checks are on, turn
+ // on checksum verification.
+ ReadOptions r;
+ r.verify_checksums = options_.paranoid_checks;
+ return table_cache_->NewIterator(r, meta.number, meta.file_size);
+ }
+
+ void ScanTable(uint64_t number) {
+ TableInfo t;
+ t.meta.number = number;
+ std::string fname = TableFileName(dbname_, number);
+ Status status = env_->GetFileSize(fname, &t.meta.file_size);
+ if (!status.ok()) {
+ // Try alternate file name.
+ fname = SSTTableFileName(dbname_, number);
+ Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
+ if (s2.ok()) {
+ status = Status::OK();
+ }
+ }
+ if (!status.ok()) {
+ ArchiveFile(TableFileName(dbname_, number));
+ ArchiveFile(SSTTableFileName(dbname_, number));
+ Log(options_.info_log, "Table #%llu: dropped: %s",
+ (unsigned long long) t.meta.number,
+ status.ToString().c_str());
+ return;
+ }
+
+ // Extract metadata by scanning through table.
int counter = 0;
- Status status = env_->GetFileSize(fname, &t->meta.file_size);
- if (status.ok()) {
- Iterator* iter = table_cache_->NewIterator(
- ReadOptions(), t->meta.number, t->meta.file_size);
- bool empty = true;
- ParsedInternalKey parsed;
- t->max_sequence = 0;
- for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
- Slice key = iter->key();
- if (!ParseInternalKey(key, &parsed)) {
- Log(options_.info_log, "Table #%llu: unparsable key %s",
- (unsigned long long) t->meta.number,
- EscapeString(key).c_str());
- continue;
- }
+ Iterator* iter = NewTableIterator(t.meta);
+ bool empty = true;
+ ParsedInternalKey parsed;
+ t.max_sequence = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ Log(options_.info_log, "Table #%llu: unparsable key %s",
+ (unsigned long long) t.meta.number,
+ EscapeString(key).c_str());
+ continue;
+ }
- counter++;
- if (empty) {
- empty = false;
- t->meta.smallest.DecodeFrom(key);
- }
- t->meta.largest.DecodeFrom(key);
- if (parsed.sequence > t->max_sequence) {
- t->max_sequence = parsed.sequence;
- }
+ counter++;
+ if (empty) {
+ empty = false;
+ t.meta.smallest.DecodeFrom(key);
}
- if (!iter->status().ok()) {
- status = iter->status();
+ t.meta.largest.DecodeFrom(key);
+ if (parsed.sequence > t.max_sequence) {
+ t.max_sequence = parsed.sequence;
}
- delete iter;
}
+ if (!iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
Log(options_.info_log, "Table #%llu: %d entries %s",
- (unsigned long long) t->meta.number,
+ (unsigned long long) t.meta.number,
counter,
status.ToString().c_str());
- return status;
+
+ if (status.ok()) {
+ tables_.push_back(t);
+ } else {
+ RepairTable(fname, t); // RepairTable archives input file.
+ }
+ }
+
+ void RepairTable(const std::string& src, TableInfo t) {
+ // We will copy src contents to a new table and then rename the
+ // new table over the source.
+
+ // Create builder.
+ std::string copy = TableFileName(dbname_, next_file_number_++);
+ WritableFile* file;
+ Status s = env_->NewWritableFile(copy, &file);
+ if (!s.ok()) {
+ return;
+ }
+ TableBuilder* builder = new TableBuilder(options_, file);
+
+ // Copy data.
+ Iterator* iter = NewTableIterator(t.meta);
+ int counter = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ builder->Add(iter->key(), iter->value());
+ counter++;
+ }
+ delete iter;
+
+ ArchiveFile(src);
+ if (counter == 0) {
+ builder->Abandon(); // Nothing to save
+ } else {
+ s = builder->Finish();
+ if (s.ok()) {
+ t.meta.file_size = builder->FileSize();
+ }
+ }
+ delete builder;
+ builder = NULL;
+
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;
+ file = NULL;
+
+ if (counter > 0 && s.ok()) {
+ std::string orig = TableFileName(dbname_, t.meta.number);
+ s = env_->RenameFile(copy, orig);
+ if (s.ok()) {
+ Log(options_.info_log, "Table #%llu: %d entries repaired",
+ (unsigned long long) t.meta.number, counter);
+ tables_.push_back(t);
+ }
+ }
+ if (!s.ok()) {
+ env_->DeleteFile(copy);
+ }
}
Status WriteDescriptor() {
+#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
+#define STORAGE_LEVELDB_DB_SKIPLIST_H_
+
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
}
} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_
RandomAccessFile* file = NULL;
Table* table = NULL;
s = env_->NewRandomAccessFile(fname, &file);
+ if (!s.ok()) {
+ std::string old_fname = SSTTableFileName(dbname_, file_number);
+ if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
+ s = Status::OK();
+ }
+ }
if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table);
}
return sum;
}
-namespace {
-std::string IntSetToString(const std::set<uint64_t>& s) {
- std::string result = "{";
- for (std::set<uint64_t>::const_iterator it = s.begin();
- it != s.end();
- ++it) {
- result += (result.size() > 1) ? "," : "";
- result += NumberToString(*it);
- }
- result += "}";
- return result;
-}
-} // namespace
-
Version::~Version() {
assert(refs_ == 0);
return a->number > b->number;
}
+void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
+ void* arg,
+ bool (*func)(void*, int, FileMetaData*)) {
+ // TODO(sanjay): Change Version::Get() to use this function.
+ const Comparator* ucmp = vset_->icmp_.user_comparator();
+
+ // Search level-0 in order from newest to oldest.
+ std::vector<FileMetaData*> tmp;
+ tmp.reserve(files_[0].size());
+ for (uint32_t i = 0; i < files_[0].size(); i++) {
+ FileMetaData* f = files_[0][i];
+ if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+ ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ tmp.push_back(f);
+ }
+ }
+ if (!tmp.empty()) {
+ std::sort(tmp.begin(), tmp.end(), NewestFirst);
+ for (uint32_t i = 0; i < tmp.size(); i++) {
+ if (!(*func)(arg, 0, tmp[i])) {
+ return;
+ }
+ }
+ }
+
+ // Search other levels.
+ for (int level = 1; level < config::kNumLevels; level++) {
+ size_t num_files = files_[level].size();
+ if (num_files == 0) continue;
+
+ // Binary search to find earliest index whose largest key >= internal_key.
+ uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
+ if (index < num_files) {
+ FileMetaData* f = files_[level][index];
+ if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
+ // All of "f" is past any data for user_key
+ } else {
+ if (!(*func)(arg, level, f)) {
+ return;
+ }
+ }
+ }
+ }
+}
+
Status Version::Get(const ReadOptions& options,
const LookupKey& k,
std::string* value,
return false;
}
+bool Version::RecordReadSample(Slice internal_key) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(internal_key, &ikey)) {
+ return false;
+ }
+
+ struct State {
+ GetStats stats; // Holds first matching file
+ int matches;
+
+ static bool Match(void* arg, int level, FileMetaData* f) {
+ State* state = reinterpret_cast<State*>(arg);
+ state->matches++;
+ if (state->matches == 1) {
+ // Remember first match.
+ state->stats.seek_file = f;
+ state->stats.seek_file_level = level;
+ }
+ // We can stop iterating once we have a second match.
+ return state->matches < 2;
+ }
+ };
+
+ State state;
+ state.matches = 0;
+ ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
+
+ // Must have at least two matches since we want to merge across
+ // files. But what if we have a single file that contains many
+ // overwrites and deletions? Should we have another mechanism for
+ // finding such files?
+ if (state.matches >= 2) {
+ // 1MB cost is about 1 seek (see comment in Builder::Apply).
+ return UpdateStats(state.stats);
+ }
+ return false;
+}
+
void Version::Ref() {
++refs_;
}
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
break;
}
- GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
- const int64_t sum = TotalFileSize(overlaps);
- if (sum > kMaxGrandParentOverlapBytes) {
- break;
+ if (level + 2 < config::kNumLevels) {
+ // Check that file does not overlap too many grandparent bytes.
+ GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+ const int64_t sum = TotalFileSize(overlaps);
+ if (sum > kMaxGrandParentOverlapBytes) {
+ break;
+ }
}
level++;
}
const InternalKey* begin,
const InternalKey* end,
std::vector<FileMetaData*>* inputs) {
+ assert(level >= 0);
+ assert(level < config::kNumLevels);
inputs->clear();
Slice user_begin, user_end;
if (begin != NULL) {
}
if (!s.ok()) {
Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
- if (ManifestContains(record)) {
- Log(options_->info_log,
- "MANIFEST contains log record despite error; advancing to new "
- "version to prevent mismatch between in-memory and logged state");
- s = Status::OK();
- }
}
}
// new CURRENT file that points to it.
if (s.ok() && !new_manifest_file.empty()) {
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
- // No need to double-check MANIFEST in case of error since it
- // will be discarded below.
}
mu->Lock();
return scratch->buffer;
}
-// Return true iff the manifest contains the specified record.
-bool VersionSet::ManifestContains(const std::string& record) const {
- std::string fname = DescriptorFileName(dbname_, manifest_file_number_);
- Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
- SequentialFile* file = NULL;
- Status s = env_->NewSequentialFile(fname, &file);
- if (!s.ok()) {
- Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
- return false;
- }
- log::Reader reader(file, NULL, true/*checksum*/, 0);
- Slice r;
- std::string scratch;
- bool result = false;
- while (reader.ReadRecord(&r, &scratch)) {
- if (r == Slice(record)) {
- result = true;
- break;
- }
- }
- delete file;
- Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
- return result;
-}
-
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
uint64_t result = 0;
for (int level = 0; level < config::kNumLevels; level++) {
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes. Returns true if a new compaction may need to be triggered.
+ // REQUIRES: lock is held
+ bool RecordReadSample(Slice key);
+
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+ // Call func(arg, level, f) for every file that overlaps user_key in
+ // order from newest to oldest. If an invocation of func returns
+ // false, makes no more calls.
+ //
+ // REQUIRES: user portion of internal_key == user_key.
+ void ForEachOverlapping(Slice user_key, Slice internal_key,
+ void* arg,
+ bool (*func)(void*, int, FileMetaData*));
+
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
void AppendVersion(Version* v);
- bool ManifestContains(const std::string& record) const;
-
Env* const env_;
const std::string dbname_;
const Options* const options_;
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
- // Return the seqeunce number for the start of this batch.
+ // Return the sequence number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
- // Store the specified number as the seqeunce number for the start of
+ // Store the specified number as the sequence number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
bool write_sync = false;
if (name == Slice("fillseq")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
-
+ DBSynchronize(db_);
} else if (name == Slice("fillrandom")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
DBSynchronize(db_);
The implementation of leveldb is similar in spirit to the
representation of a single
-<a href="http://labs.google.com/papers/bigtable.html">
+<a href="http://research.google.com/archive/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.
sequence of level-(L+1) files. We switch to producing a new
level-(L+1) file after the current output file has reached the target
file size (2MB). We also switch to a new output file when the key
-range of the current output file has grown enough to overlap more then
+range of the current output file has grown enough to overlap more than
ten level-(L+2) files. This last rule ensures that a later compaction
of a level-(L+1) file will not pick up too much data from level-(L+2).
If we throttle the background writing to something small, say 10% of
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
user is writing at 10MB/s, we might build up lots of level-0 files
-(~50 to hold the 5*10MB). This may signficantly increase the cost of
+(~50 to hold the 5*10MB). This may significantly increase the cost of
reads due to the overhead of merging more files together on every
read.
A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
-consist entirely of zero bytes and must be skipped by readers.
+consist entirely of zero bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
-type of the last fragment of a user record, and MID is the type of all
-interior fragments of a user record.
+type of the last fragment of a user record, and MIDDLE is the type of
+all interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
}
const uint64_t available = size_ - offset;
if (n > available) {
- n = available;
+ n = static_cast<size_t>(available);
}
if (n == 0) {
*result = Slice();
return Status::OK();
}
- size_t block = offset / kBlockSize;
+ assert(offset / kBlockSize <= SIZE_MAX);
+ size_t block = static_cast<size_t>(offset / kBlockSize);
size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) {
if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()");
}
- const size_t available = file_->Size() - pos_;
+ const uint64_t available = file_->Size() - pos_;
if (n > available) {
n = available;
}
private:
FileState* file_;
- size_t pos_;
+ uint64_t pos_;
};
class RandomAccessFileImpl : public RandomAccessFile {
Does not support:
. getters for the option types
. custom comparators that implement key shortening
- . capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
} // namespace leveldb
-#endif // STORAGE_LEVELDB_UTIL_CACHE_H_
+#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_
// Update Makefile if you change these
static const int kMajorVersion = 1;
-static const int kMinorVersion = 12;
+static const int kMinorVersion = 18;
struct Options;
struct ReadOptions;
--- /dev/null
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
+#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
+
+#include <string>
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+// Dump the contents of the file named by fname in text format to
+// *dst. Makes a sequence of dst->Append() calls; each call is passed
+// the newline-terminated text corresponding to a single item found
+// in the file.
+//
+// Returns a non-OK result if fname does not name a leveldb storage
+// file, or if the file cannot be read.
+Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
-#include <cstdarg>
#include <string>
#include <vector>
+#include <stdarg.h>
#include <stdint.h>
#include "leveldb/status.h"
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
- // Sleep/delay the thread for the perscribed number of micro-seconds.
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
private:
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
- // REQUIRES: !AtEnd() && !AtStart()
+ // REQUIRES: Valid()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
// If "snapshot" is non-NULL, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must
- // not have been released). If "snapshot" is NULL, use an impliicit
+ // not have been released). If "snapshot" is NULL, use an implicit
// snapshot of the state at the beginning of this read operation.
// Default: NULL
const Snapshot* snapshot;
}
inline int Slice::compare(const Slice& b) const {
- const int min_len = (size_ < b.size_) ? size_ : b.size_;
+ const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len);
if (r == 0) {
if (size_ < b.size_) r = -1;
--- /dev/null
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Test for issue 200: when iterator switches direction from backward
+// to forward, the current key can be yielded unexpectedly if a new
+// mutation has been added just before the current key.
+
+#include "leveldb/db.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class Issue200 { };
+
+TEST(Issue200, Test) {
+ // Get rid of any state from an old run.
+ std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
+ DestroyDB(dbpath, Options());
+
+ DB *db;
+ Options options;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbpath, &db));
+
+ WriteOptions write_options;
+ ASSERT_OK(db->Put(write_options, "1", "b"));
+ ASSERT_OK(db->Put(write_options, "2", "c"));
+ ASSERT_OK(db->Put(write_options, "3", "d"));
+ ASSERT_OK(db->Put(write_options, "4", "e"));
+ ASSERT_OK(db->Put(write_options, "5", "f"));
+
+ ReadOptions read_options;
+ Iterator *iter = db->NewIterator(read_options);
+
+ // Add an element that should not be reflected in the iterator.
+ ASSERT_OK(db->Put(write_options, "25", "cd"));
+
+ iter->Seek("5");
+ ASSERT_EQ(iter->key().ToString(), "5");
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "4");
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "3");
+ iter->Next();
+ ASSERT_EQ(iter->key().ToString(), "4");
+ iter->Next();
+ ASSERT_EQ(iter->key().ToString(), "5");
+
+ delete iter;
+ delete db;
+ DestroyDB(dbpath, options);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
// AtomicPointer provides storage for a lock-free pointer.
// Platform-dependent implementation of AtomicPointer:
// - If the platform provides a cheap barrier, we use it with raw pointers
-// - If cstdatomic is present (on newer versions of gcc, it is), we use
-// a cstdatomic-based AtomicPointer. However we prefer the memory
+// - If <atomic> is present (on newer versions of gcc, it is), we use
+// a <atomic>-based AtomicPointer. However we prefer the memory
// barrier based version, because at least on a gcc 4.4 32-bit build
-// on linux, we have encountered a buggy <cstdatomic>
-// implementation. Also, some <cstdatomic> implementations are much
-// slower than a memory-barrier based implementation (~16ns for
-// <cstdatomic> based acquire-load vs. ~1ns for a barrier based
-// acquire-load).
+// on linux, we have encountered a buggy <atomic> implementation.
+// Also, some <atomic> implementations are much slower than a memory-barrier
+// based implementation (~16ns for <atomic> based acquire-load vs. ~1ns for
+// a barrier based acquire-load).
// This code is based on atomicops-internals-* in Google's perftools:
// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
#define PORT_ATOMIC_POINTER_H_
#include <stdint.h>
-#ifdef LEVELDB_CSTDATOMIC_PRESENT
-#include <cstdatomic>
+#ifdef LEVELDB_ATOMIC_PRESENT
+#include <atomic>
#endif
#ifdef OS_WIN
#include <windows.h>
// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
#define LEVELDB_HAVE_MEMORY_BARRIER
+// Mac OS
+#elif defined(OS_MACOSX)
+inline void MemoryBarrier() {
+ OSMemoryBarrier();
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
// Gcc on x86
#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
inline void MemoryBarrier() {
}
#define LEVELDB_HAVE_MEMORY_BARRIER
-// Mac OS
-#elif defined(OS_MACOSX)
-inline void MemoryBarrier() {
- OSMemoryBarrier();
-}
-#define LEVELDB_HAVE_MEMORY_BARRIER
-
// ARM Linux
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
typedef void (*LinuxKernelMemoryBarrierFunc)(void);
};
// AtomicPointer based on <cstdatomic>
-#elif defined(LEVELDB_CSTDATOMIC_PRESENT)
+#elif defined(LEVELDB_ATOMIC_PRESENT)
class AtomicPointer {
private:
std::atomic<void*> rep_;
inline void NoBarrier_Store(void* v) { rep_ = v; }
};
-// We have neither MemoryBarrier(), nor <cstdatomic>
+// We have neither MemoryBarrier(), nor <atomic>
#else
#error Please implement AtomicPointer for this platform.
#else
#define PLATFORM_IS_LITTLE_ENDIAN false
#endif
-#elif defined(OS_FREEBSD)
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) ||\
+ defined(OS_NETBSD) || defined(OS_DRAGONFLYBSD)
#include <sys/types.h>
#include <sys/endian.h>
#define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
-#elif defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
- defined(OS_DRAGONFLYBSD)
- #include <sys/types.h>
- #include <sys/endian.h>
#elif defined(OS_HPUX)
#define PLATFORM_IS_LITTLE_ENDIAN false
#elif defined(OS_ANDROID)
#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
- defined(OS_ANDROID) || defined(OS_HPUX)
+ defined(OS_ANDROID) || defined(OS_HPUX) || defined(CYGWIN)
// Use fread/fwrite/fflush on platforms without _unlocked variants
#define fread_unlocked fread
#define fwrite_unlocked fwrite
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
+#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
+#define STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
// Some environments provide custom macros to aid in static thread-safety
// analysis. Provide empty definitions of such macros unless they are already
#define NO_THREAD_SAFETY_ANALYSIS
#endif
-#endif // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
+#endif // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
-// "*value_length", respectively. Will not derefence past "limit".
+// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns NULL. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
// Reset the contents as if the BlockBuilder was just constructed.
void Reset();
- // REQUIRES: Finish() has not been callled since the last call to Reset().
+ // REQUIRES: Finish() has not been called since the last call to Reset().
// REQUIRES: key is larger than any previously added key
void Add(const Slice& key, const Slice& value);
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
uint32_t h = Hash(key.data(), key.size(), 1);
- for (int i = 0; i + 4 <= filter.size(); i += 4) {
+ for (size_t i = 0; i + 4 <= filter.size(); i += 4) {
if (h == DecodeFixed32(filter.data() + i)) {
return true;
}
const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
(static_cast<uint64_t>(magic_lo)));
if (magic != kTableMagicNumber) {
- return Status::InvalidArgument("not an sstable (bad magic number)");
+ return Status::Corruption("not an sstable (bad magic number)");
}
Status result = metaindex_handle_.DecodeFrom(input);
Table** table) {
*table = NULL;
if (size < Footer::kEncodedLength) {
- return Status::InvalidArgument("file is too short to be an sstable");
+ return Status::Corruption("file is too short to be an sstable");
}
char footer_space[Footer::kEncodedLength];
BlockContents contents;
Block* index_block = NULL;
if (s.ok()) {
- s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents);
+ ReadOptions opt;
+ if (options.paranoid_checks) {
+ opt.verify_checksums = true;
+ }
+ s = ReadBlock(file, opt, footer.index_handle(), &contents);
if (s.ok()) {
index_block = new Block(contents);
}
// TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
// it is an empty block.
ReadOptions opt;
+ if (rep_->options.paranoid_checks) {
+ opt.verify_checksums = true;
+ }
BlockContents contents;
if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
// Do not propagate errors since meta info is not needed for operation
// We might want to unify with ReadBlock() if we start
// requiring checksum verification in Table::Open.
ReadOptions opt;
+ if (rep_->options.paranoid_checks) {
+ opt.verify_checksums = true;
+ }
BlockContents block;
if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
return;
}
char* Arena::AllocateAligned(size_t bytes) {
- const int align = sizeof(void*); // We'll align to pointer size
+ const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8;
assert((align & (align-1)) == 0); // Pointer size should be a power of 2
size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
size_t slop = (current_mod == 0 ? 0 : align - current_mod);
#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
#define STORAGE_LEVELDB_UTIL_ARENA_H_
-#include <cstddef>
#include <vector>
#include <assert.h>
+#include <stddef.h>
#include <stdint.h>
namespace leveldb {
r = arena.Allocate(s);
}
- for (int b = 0; b < s; b++) {
+ for (size_t b = 0; b < s; b++) {
// Fill the "i"th allocation with a known bit pattern
r[b] = i % 256;
}
ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
}
}
- for (int i = 0; i < allocated.size(); i++) {
+ for (size_t i = 0; i < allocated.size(); i++) {
size_t num_bytes = allocated[i].first;
const char* p = allocated[i].second;
- for (int b = 0; b < num_bytes; b++) {
+ for (size_t b = 0; b < num_bytes; b++) {
// Check the "i"th allocation for the known bit pattern
ASSERT_EQ(int(p[b]) & 0xff, i % 256);
}
}
virtual const char* Name() const {
- return "leveldb.BuiltinBloomFilter";
+ return "leveldb.BuiltinBloomFilter2";
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
}
Build();
- ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
+ ASSERT_LE(FilterSize(), static_cast<size_t>((length * 10 / 8) + 40))
+ << length;
// All added keys must match
for (int i = 0; i < length; i++) {
}
std::string s;
- for (int i = 0; i < values.size(); i++) {
+ for (size_t i = 0; i < values.size(); i++) {
PutVarint64(&s, values[i]);
}
const char* p = s.data();
const char* limit = p + s.size();
- for (int i = 0; i < values.size(); i++) {
+ for (size_t i = 0; i < values.size(); i++) {
ASSERT_TRUE(p < limit);
uint64_t actual;
const char* start = p;
std::string s;
PutVarint32(&s, large_value);
uint32_t result;
- for (int len = 0; len < s.size() - 1; len++) {
+ for (size_t len = 0; len < s.size() - 1; len++) {
ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
}
ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
std::string s;
PutVarint64(&s, large_value);
uint64_t result;
- for (int len = 0; len < s.size() - 1; len++) {
+ for (size_t len = 0; len < s.size() - 1; len++) {
ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
}
ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#if !defined(LEVELDB_PLATFORM_WINDOWS)
-#include <deque>
-#include <set>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
-#if defined(LEVELDB_PLATFORM_ANDROID)
-#include <sys/stat.h>
-#endif
+#include <deque>
+#include <set>
#include "leveldb/env.h"
#include "leveldb/slice.h"
#include "port/port.h"
}
};
-// We preallocate up to an extra megabyte and use memcpy to append new
-// data to the file. This is safe since we either properly close the
-// file before reading from it, or for log files, the reading code
-// knows enough to skip zero suffixes.
-class PosixMmapFile : public WritableFile {
+class PosixWritableFile : public WritableFile {
private:
std::string filename_;
- int fd_;
- size_t page_size_;
- size_t map_size_; // How much extra memory to map at a time
- char* base_; // The mapped region
- char* limit_; // Limit of the mapped region
- char* dst_; // Where to write next (in range [base_,limit_])
- char* last_sync_; // Where have we synced up to
- uint64_t file_offset_; // Offset of base_ in file
-
- // Have we done an munmap of unsynced data?
- bool pending_sync_;
-
- // Roundup x to a multiple of y
- static size_t Roundup(size_t x, size_t y) {
- return ((x + y - 1) / y) * y;
- }
+ FILE* file_;
- size_t TruncateToPageBoundary(size_t s) {
- s -= (s & (page_size_ - 1));
- assert((s % page_size_) == 0);
- return s;
- }
+ public:
+ PosixWritableFile(const std::string& fname, FILE* f)
+ : filename_(fname), file_(f) { }
- bool UnmapCurrentRegion() {
- bool result = true;
- if (base_ != NULL) {
- if (last_sync_ < limit_) {
- // Defer syncing this data until next Sync() call, if any
- pending_sync_ = true;
- }
- if (munmap(base_, limit_ - base_) != 0) {
- result = false;
- }
- file_offset_ += limit_ - base_;
- base_ = NULL;
- limit_ = NULL;
- last_sync_ = NULL;
- dst_ = NULL;
-
- // Increase the amount we map the next time, but capped at 1MB
- if (map_size_ < (1<<20)) {
- map_size_ *= 2;
- }
+ ~PosixWritableFile() {
+ if (file_ != NULL) {
+ // Ignoring any potential errors
+ fclose(file_);
}
- return result;
}
- bool MapNewRegion() {
- assert(base_ == NULL);
- if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
- return false;
- }
- void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
- fd_, file_offset_);
- if (ptr == MAP_FAILED) {
- return false;
+ virtual Status Append(const Slice& data) {
+ size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_);
+ if (r != data.size()) {
+ return IOError(filename_, errno);
}
- base_ = reinterpret_cast<char*>(ptr);
- limit_ = base_ + map_size_;
- dst_ = base_;
- last_sync_ = base_;
- return true;
+ return Status::OK();
}
- public:
- PosixMmapFile(const std::string& fname, int fd, size_t page_size)
- : filename_(fname),
- fd_(fd),
- page_size_(page_size),
- map_size_(Roundup(65536, page_size)),
- base_(NULL),
- limit_(NULL),
- dst_(NULL),
- last_sync_(NULL),
- file_offset_(0),
- pending_sync_(false) {
- assert((page_size & (page_size - 1)) == 0);
- }
-
-
- ~PosixMmapFile() {
- if (fd_ >= 0) {
- PosixMmapFile::Close();
+ virtual Status Close() {
+ Status result;
+ if (fclose(file_) != 0) {
+ result = IOError(filename_, errno);
}
+ file_ = NULL;
+ return result;
}
- virtual Status Append(const Slice& data) {
- const char* src = data.data();
- size_t left = data.size();
- while (left > 0) {
- assert(base_ <= dst_);
- assert(dst_ <= limit_);
- size_t avail = limit_ - dst_;
- if (avail == 0) {
- if (!UnmapCurrentRegion() ||
- !MapNewRegion()) {
- return IOError(filename_, errno);
- }
- }
-
- size_t n = (left <= avail) ? left : avail;
- memcpy(dst_, src, n);
- dst_ += n;
- src += n;
- left -= n;
+ virtual Status Flush() {
+ if (fflush_unlocked(file_) != 0) {
+ return IOError(filename_, errno);
}
return Status::OK();
}
- virtual Status Close() {
- Status s;
- size_t unused = limit_ - dst_;
- if (!UnmapCurrentRegion()) {
- s = IOError(filename_, errno);
- } else if (unused > 0) {
- // Trim the extra space at the end of the file
- if (ftruncate(fd_, file_offset_ - unused) < 0) {
- s = IOError(filename_, errno);
- }
+ Status SyncDirIfManifest() {
+ const char* f = filename_.c_str();
+ const char* sep = strrchr(f, '/');
+ Slice basename;
+ std::string dir;
+ if (sep == NULL) {
+ dir = ".";
+ basename = f;
+ } else {
+ dir = std::string(f, sep - f);
+ basename = sep + 1;
}
-
- if (close(fd_) < 0) {
- if (s.ok()) {
- s = IOError(filename_, errno);
+ Status s;
+ if (basename.starts_with("MANIFEST")) {
+ int fd = open(dir.c_str(), O_RDONLY);
+ if (fd < 0) {
+ s = IOError(dir, errno);
+ } else {
+ if (fsync(fd) < 0) {
+ s = IOError(dir, errno);
+ }
+ close(fd);
}
}
-
- fd_ = -1;
- base_ = NULL;
- limit_ = NULL;
return s;
}
- virtual Status Flush() {
- return Status::OK();
- }
-
virtual Status Sync() {
- Status s;
-
- if (pending_sync_) {
- // Some unmapped data was not synced
- pending_sync_ = false;
- if (fdatasync(fd_) < 0) {
- s = IOError(filename_, errno);
- }
+ // Ensure new files referred to by the manifest are in the filesystem.
+ Status s = SyncDirIfManifest();
+ if (!s.ok()) {
+ return s;
}
-
- if (dst_ > last_sync_) {
- // Find the beginnings of the pages that contain the first and last
- // bytes to be synced.
- size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
- size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
- last_sync_ = dst_;
- if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
- s = IOError(filename_, errno);
- }
+ if (fflush_unlocked(file_) != 0 ||
+ fdatasync(fileno(file_)) != 0) {
+ s = Status::IOError(filename_, strerror(errno));
}
-
return s;
}
};
public:
PosixEnv();
virtual ~PosixEnv() {
- fprintf(stderr, "Destroying Env::Default()\n");
+ char msg[] = "Destroying Env::Default()\n";
+ fwrite(msg, 1, sizeof(msg), stderr);
abort();
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
Status s;
- const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
- if (fd < 0) {
+ FILE* f = fopen(fname.c_str(), "w");
+ if (f == NULL) {
*result = NULL;
s = IOError(fname, errno);
} else {
- *result = new PosixMmapFile(fname, fd, page_size_);
+ *result = new PosixWritableFile(fname, f);
}
return s;
}
return NULL;
}
- size_t page_size_;
pthread_mutex_t mu_;
pthread_cond_t bgsignal_;
pthread_t bgthread_;
MmapLimiter mmap_limit_;
};
-PosixEnv::PosixEnv() : page_size_(getpagesize()),
- started_bgthread_(false) {
+PosixEnv::PosixEnv() : started_bgthread_(false) {
PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
}
// Pick up remaining bytes
switch (limit - data) {
case 3:
- h += data[2] << 16;
+ h += static_cast<unsigned char>(data[2]) << 16;
FALLTHROUGH_INTENDED;
case 2:
- h += data[1] << 8;
+ h += static_cast<unsigned char>(data[1]) << 8;
FALLTHROUGH_INTENDED;
case 1:
- h += data[0];
+ h += static_cast<unsigned char>(data[0]);
h *= m;
h ^= (h >> r);
break;
--- /dev/null
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/hash.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class HASH { };
+
+TEST(HASH, SignedUnsignedIssue) {
+ const unsigned char data1[1] = {0x62};
+ const unsigned char data2[2] = {0xc3, 0x97};
+ const unsigned char data3[3] = {0xe2, 0x99, 0xa5};
+ const unsigned char data4[4] = {0xe1, 0x80, 0xb9, 0x32};
+ const unsigned char data5[48] = {
+ 0x01, 0xc0, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x14, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x18,
+ 0x28, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ };
+
+ ASSERT_EQ(Hash(0, 0, 0xbc9f1d34), 0xbc9f1d34);
+ ASSERT_EQ(
+ Hash(reinterpret_cast<const char*>(data1), sizeof(data1), 0xbc9f1d34),
+ 0xef1345c4);
+ ASSERT_EQ(
+ Hash(reinterpret_cast<const char*>(data2), sizeof(data2), 0xbc9f1d34),
+ 0x5b663814);
+ ASSERT_EQ(
+ Hash(reinterpret_cast<const char*>(data3), sizeof(data3), 0xbc9f1d34),
+ 0x323c078f);
+ ASSERT_EQ(
+ Hash(reinterpret_cast<const char*>(data4), sizeof(data4), 0xbc9f1d34),
+ 0xed21633a);
+ ASSERT_EQ(
+ Hash(reinterpret_cast<const char*>(data5), sizeof(data5), 0x12345678),
+ 0xf333dabb);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
return r;
}
-bool ConsumeChar(Slice* in, char c) {
- if (!in->empty() && (*in)[0] == c) {
- in->remove_prefix(1);
- return true;
- } else {
- return false;
- }
-}
-
bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
uint64_t v = 0;
int digits = 0;
// Escapes any non-printable characters found in "value".
extern std::string EscapeString(const Slice& value);
-// If *in starts with "c", advances *in past the first character and
-// returns true. Otherwise, returns false.
-extern bool ConsumeChar(Slice* in, char c);
-
// Parse a human-readable number from "*in" into *value. On success,
// advances "*in" past the consumed number and sets "*val" to the
// numeric value. Otherwise, returns false and leaves *in in an
private:
uint32_t seed_;
public:
- explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
+ explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
+ // Avoid bad seeds.
+ if (seed_ == 0 || seed_ == 2147483647L) {
+ seed_ = 1;
+ }
+ }
uint32_t Next() {
static const uint32_t M = 2147483647L; // 2^31-1
static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0
int num = 0;
if (tests != NULL) {
- for (int i = 0; i < tests->size(); i++) {
+ for (size_t i = 0; i < tests->size(); i++) {
const Test& t = (*tests)[i];
if (matcher != NULL) {
std::string name = t.base;
extern Slice CompressibleString(Random* rnd, double compressed_fraction,
- int len, std::string* dst) {
+ size_t len, std::string* dst) {
int raw = static_cast<int>(len * compressed_fraction);
if (raw < 1) raw = 1;
std::string raw_data;
// "N*compressed_fraction" bytes and return a Slice that references
// the generated data.
extern Slice CompressibleString(Random* rnd, double compressed_fraction,
- int len, std::string* dst);
+ size_t len, std::string* dst);
// A wrapper that allows injection of errors.
class ErrorEnv : public EnvWrapper {