Skip to content

Commit 3197186

Browse files
authored
Switch to a custom pebble merger for codesearch (#9512)
Previously, we relied on the default pebble merger, which appended merged values together, to store posting lists. When unmarshalling posting lists, we would just repeatedly read from the buffer until we exhausted it. This was simple and worked, but over the long term this will cause the index to grow in proportion to the number of updates - as new postings are added to existing lists, we'll write out an entirely new roaring bitmap, rather than adding the new posting to the existing roaring bitmap. The merger included in this PR merges posting lists together (using `Or`), which will result in a single serialize roaring bitmap per entry, rather than N concatenated bitmaps. More on pebble mergers: https://pkg.go.dev/github.com/cockroachdb/[email protected]/internal/base#Merger and https://pkg.go.dev/github.com/cockroachdb/[email protected]/internal/base#ValueMerger. Important note: the merger name is stored in the pebble db. Trying to open an existing db with a different merger is an error. So, when this change goes in, we'll need to blow away the existing code search dbs and re-index the repos. This PR will stay in draft until we have a plan for that.
1 parent 5bfa50a commit 3197186

File tree

11 files changed

+147
-219
lines changed

11 files changed

+147
-219
lines changed

codesearch/cmd/cli/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ go_library(
1515
"//codesearch/types",
1616
"//server/util/git",
1717
"//server/util/log",
18-
"@com_github_cockroachdb_pebble//:pebble",
1918
],
2019
)
2120

codesearch/cmd/cli/cli.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
"github.com/buildbuddy-io/buildbuddy/codesearch/types"
2222
"github.com/buildbuddy-io/buildbuddy/server/util/git"
2323
"github.com/buildbuddy-io/buildbuddy/server/util/log"
24-
"github.com/cockroachdb/pebble"
2524
)
2625

2726
var (
@@ -145,7 +144,7 @@ func handleIndex(args []string) {
145144
if *reset {
146145
os.RemoveAll(indexDir)
147146
}
148-
db, err := pebble.Open(indexDir, &pebble.Options{})
147+
db, err := index.OpenPebbleDB(indexDir)
149148
if err != nil {
150149
log.Fatal(err.Error())
151150
}
@@ -197,7 +196,7 @@ func handleIndex(args []string) {
197196
func handleSearch(ctx context.Context, args []string) {
198197
pat := args[0]
199198

200-
db, err := pebble.Open(indexDir, &pebble.Options{})
199+
db, err := index.OpenPebbleDB(indexDir)
201200
if err != nil {
202201
log.Fatal(err.Error())
203202
}
@@ -266,7 +265,7 @@ func handleSearch(ctx context.Context, args []string) {
266265
func handleSquery(ctx context.Context, args []string) {
267266
pat := args[0]
268267

269-
db, err := pebble.Open(indexDir, &pebble.Options{})
268+
db, err := index.OpenPebbleDB(indexDir)
270269
if err != nil {
271270
log.Fatal(err.Error())
272271
}

codesearch/github/github.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func lastIndexedDocKey(repoURL *git.RepoURL) []byte {
4949

5050
func makeFileId(repoURL *git.RepoURL, name string) []byte {
5151
uniqueID := xxhash.Sum64String(repoURL.Owner + repoURL.Repo + name)
52-
idBytes := []byte(fmt.Sprintf("%d", uniqueID))
52+
idBytes := fmt.Appendf(nil, "%d", uniqueID)
5353
return idBytes
5454
}
5555

@@ -73,7 +73,6 @@ func makeLastIndexedDoc(repoURL *git.RepoURL, commitSHA string) types.Document {
7373
func AddFileToIndex(w types.IndexWriter, repoURL *git.RepoURL, commitSHA, filename string, fileContent []byte) error {
7474
err := validateFile(fileContent)
7575
if err != nil {
76-
log.Infof("File %s can't be indexed, skipping: %v", filename, err)
7776
return err
7877
}
7978

codesearch/github/github_test.go

Lines changed: 19 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,17 @@ var (
2323
altRepoURL = &git.RepoURL{Host: "github.com", Owner: "buildbuddy-io", Repo: "buildbuddy-internal"}
2424
)
2525

26+
func mustOpenDB(t *testing.T, indexDir string) *pebble.DB {
27+
t.Helper()
28+
db, err := index.OpenPebbleDB(indexDir)
29+
require.NoError(t, err)
30+
t.Cleanup(func() { db.Close() })
31+
return db
32+
}
33+
2634
func TestLastIndexedCommit(t *testing.T) {
2735
ctx := context.Background()
28-
indexDir := testfs.MakeTempDir(t)
29-
db, err := pebble.Open(indexDir, nil)
30-
if err != nil {
31-
t.Fatal(err)
32-
}
33-
defer db.Close()
36+
db := mustOpenDB(t, testfs.MakeTempDir(t))
3437

3538
commitSHA := "abc123"
3639

@@ -48,12 +51,7 @@ func TestLastIndexedCommit(t *testing.T) {
4851

4952
func TestLastIndexedCommitUpdated(t *testing.T) {
5053
ctx := context.Background()
51-
indexDir := testfs.MakeTempDir(t)
52-
db, err := pebble.Open(indexDir, nil)
53-
if err != nil {
54-
t.Fatal(err)
55-
}
56-
defer db.Close()
54+
db := mustOpenDB(t, testfs.MakeTempDir(t))
5755

5856
commitSHA := "abc123"
5957
repoURL, err := git.ParseGitHubRepoURL("github.com/buildbuddy-io/buildbuddy")
@@ -86,12 +84,7 @@ func TestLastIndexedCommitUpdated(t *testing.T) {
8684

8785
func TestLastIndexedCommitUnset(t *testing.T) {
8886
ctx := context.Background()
89-
indexDir := testfs.MakeTempDir(t)
90-
db, err := pebble.Open(indexDir, nil)
91-
if err != nil {
92-
t.Fatal(err)
93-
}
94-
defer db.Close()
87+
db := mustOpenDB(t, testfs.MakeTempDir(t))
9588

9689
commitSHA := "abc123"
9790

@@ -115,12 +108,7 @@ func TestLastIndexedCommitUnset(t *testing.T) {
115108

116109
func TestAddFileToIndex(t *testing.T) {
117110
ctx := context.Background()
118-
indexDir := testfs.MakeTempDir(t)
119-
db, err := pebble.Open(indexDir, nil)
120-
if err != nil {
121-
t.Fatal(err)
122-
}
123-
defer db.Close()
111+
db := mustOpenDB(t, testfs.MakeTempDir(t))
124112

125113
w, err := index.NewWriter(db, "testing-namespace")
126114
if err != nil {
@@ -148,12 +136,7 @@ func TestAddFileToIndex(t *testing.T) {
148136

149137
func TestAddFileToIndexWrongMimeType(t *testing.T) {
150138
ctx := context.Background()
151-
indexDir := testfs.MakeTempDir(t)
152-
db, err := pebble.Open(indexDir, nil)
153-
if err != nil {
154-
t.Fatal(err)
155-
}
156-
defer db.Close()
139+
db := mustOpenDB(t, testfs.MakeTempDir(t))
157140

158141
w, err := index.NewWriter(db, "testing-namespace")
159142
if err != nil {
@@ -175,12 +158,7 @@ func TestAddFileToIndexWrongMimeType(t *testing.T) {
175158

176159
func TestAddFileToIndexInvalidUTF8(t *testing.T) {
177160
ctx := context.Background()
178-
indexDir := testfs.MakeTempDir(t)
179-
db, err := pebble.Open(indexDir, nil)
180-
if err != nil {
181-
t.Fatal(err)
182-
}
183-
defer db.Close()
161+
db := mustOpenDB(t, testfs.MakeTempDir(t))
184162

185163
w, err := index.NewWriter(db, "testing-namespace")
186164
if err != nil {
@@ -211,10 +189,7 @@ func mustApplyCommit(t *testing.T, db *pebble.DB, commit *inpb.Commit) {
211189

212190
func TestProcessCommit_AddsOnly(t *testing.T) {
213191
ctx := context.Background()
214-
indexDir := testfs.MakeTempDir(t)
215-
db, err := pebble.Open(indexDir, nil)
216-
require.NoError(t, err)
217-
defer db.Close()
192+
db := mustOpenDB(t, testfs.MakeTempDir(t))
218193

219194
mustApplyCommit(t, db, &inpb.Commit{
220195
Sha: "abc123",
@@ -231,10 +206,7 @@ func TestProcessCommit_AddsOnly(t *testing.T) {
231206

232207
func TestProcessCommit_DeletesOnly(t *testing.T) {
233208
ctx := context.Background()
234-
indexDir := testfs.MakeTempDir(t)
235-
db, err := pebble.Open(indexDir, nil)
236-
require.NoError(t, err)
237-
defer db.Close()
209+
db := mustOpenDB(t, testfs.MakeTempDir(t))
238210

239211
// Add a file first
240212
mustApplyCommit(t, db, &inpb.Commit{
@@ -260,10 +232,7 @@ func TestProcessCommit_DeletesOnly(t *testing.T) {
260232

261233
func TestProcessCommit_DeleteThenReAdd(t *testing.T) {
262234
ctx := context.Background()
263-
indexDir := testfs.MakeTempDir(t)
264-
db, err := pebble.Open(indexDir, nil)
265-
require.NoError(t, err)
266-
defer db.Close()
235+
db := mustOpenDB(t, testfs.MakeTempDir(t))
267236

268237
filename := "foo/bar/baz.go"
269238

@@ -303,10 +272,7 @@ func TestProcessCommit_DeleteThenReAdd(t *testing.T) {
303272

304273
func TestProcessCommit_NonOverlappingAddsAndDeletes(t *testing.T) {
305274
ctx := context.Background()
306-
indexDir := testfs.MakeTempDir(t)
307-
db, err := pebble.Open(indexDir, nil)
308-
require.NoError(t, err)
309-
defer db.Close()
275+
db := mustOpenDB(t, testfs.MakeTempDir(t))
310276

311277
mustApplyCommit(t, db, &inpb.Commit{
312278
Sha: "abc123",
@@ -323,10 +289,7 @@ func TestProcessCommit_NonOverlappingAddsAndDeletes(t *testing.T) {
323289

324290
func TestProcessCommit_OverlappingAddsAndDeletes(t *testing.T) {
325291
ctx := context.Background()
326-
indexDir := testfs.MakeTempDir(t)
327-
db, err := pebble.Open(indexDir, nil)
328-
require.NoError(t, err)
329-
defer db.Close()
292+
db := mustOpenDB(t, testfs.MakeTempDir(t))
330293

331294
filename := "foo/bar/baz.go"
332295

codesearch/index/BUILD

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
22

33
go_library(
44
name = "index",
5-
srcs = ["index.go"],
5+
srcs = [
6+
"index.go",
7+
"merger.go",
8+
],
69
importpath = "github.com/buildbuddy-io/buildbuddy/codesearch/index",
710
visibility = ["//visibility:public"],
811
deps = [

0 commit comments

Comments
 (0)