mirror of https://github.com/go-gitea/gitea.git
Support elastic search for code search (#10273)
* Support elastic search for code search * Finished elastic search implementation and add some tests * Enable test on drone and added docs * Add new fields to elastic search * Fix bug * remove unused changes * Use indexer alias to keep the gitea indexer version * Improve codes * Some code improvements * The real indexer name changed to xxx.v1 Co-authored-by: zeripath <art27@cantab.net>pull/12648/head^2
parent
d257485bc0
commit
9bc69ff26e
@ -0,0 +1,385 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"code.gitea.io/gitea/models"
|
||||
"code.gitea.io/gitea/modules/analyze"
|
||||
"code.gitea.io/gitea/modules/base"
|
||||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/timeutil"
|
||||
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
"github.com/olivere/elastic/v7"
|
||||
)
|
||||
|
||||
// esRepoIndexerLatestVersion is the version number that realIndexerName
// appends to the alias name to build the name of the concrete index.
const esRepoIndexerLatestVersion = 1
|
||||
|
||||
var (
|
||||
_ Indexer = &ElasticSearchIndexer{}
|
||||
)
|
||||
|
||||
// ElasticSearchIndexer implements Indexer interface
|
||||
type ElasticSearchIndexer struct {
|
||||
client *elastic.Client
|
||||
indexerAliasName string
|
||||
}
|
||||
|
||||
type elasticLogger struct {
|
||||
*log.Logger
|
||||
}
|
||||
|
||||
func (l elasticLogger) Printf(format string, args ...interface{}) {
|
||||
_ = l.Logger.Log(2, l.Logger.GetLevel(), format, args...)
|
||||
}
|
||||
|
||||
// NewElasticSearchIndexer creates a new elasticsearch indexer
|
||||
func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
|
||||
opts := []elastic.ClientOptionFunc{
|
||||
elastic.SetURL(url),
|
||||
elastic.SetSniff(false),
|
||||
elastic.SetHealthcheckInterval(10 * time.Second),
|
||||
elastic.SetGzip(false),
|
||||
}
|
||||
|
||||
logger := elasticLogger{log.GetLogger(log.DEFAULT)}
|
||||
|
||||
if logger.GetLevel() == log.TRACE || logger.GetLevel() == log.DEBUG {
|
||||
opts = append(opts, elastic.SetTraceLog(logger))
|
||||
} else if logger.GetLevel() == log.ERROR || logger.GetLevel() == log.CRITICAL || logger.GetLevel() == log.FATAL {
|
||||
opts = append(opts, elastic.SetErrorLog(logger))
|
||||
} else if logger.GetLevel() == log.INFO || logger.GetLevel() == log.WARN {
|
||||
opts = append(opts, elastic.SetInfoLog(logger))
|
||||
}
|
||||
|
||||
client, err := elastic.NewClient(opts...)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
|
||||
indexer := &ElasticSearchIndexer{
|
||||
client: client,
|
||||
indexerAliasName: indexerName,
|
||||
}
|
||||
exists, err := indexer.init()
|
||||
|
||||
return indexer, !exists, err
|
||||
}
|
||||
|
||||
// defaultMapping is the request body used when the index is created. It
// declares the five document fields written by addUpdate: repo_id and
// updated_at as long, content as text, commit_id and language as keyword.
const defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
|
||||
|
||||
func (b *ElasticSearchIndexer) realIndexerName() string {
|
||||
return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
|
||||
}
|
||||
|
||||
// Init will initialize the indexer
|
||||
func (b *ElasticSearchIndexer) init() (bool, error) {
|
||||
ctx := context.Background()
|
||||
exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !exists {
|
||||
var mapping = defaultMapping
|
||||
|
||||
createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !createIndex.Acknowledged {
|
||||
return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
|
||||
}
|
||||
}
|
||||
|
||||
// check version
|
||||
r, err := b.client.Aliases().Do(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
|
||||
if len(realIndexerNames) < 1 {
|
||||
res, err := b.client.Alias().
|
||||
Add(b.realIndexerName(), b.indexerAliasName).
|
||||
Do(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !res.Acknowledged {
|
||||
return false, fmt.Errorf("")
|
||||
}
|
||||
} else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
|
||||
log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeed.",
|
||||
realIndexerNames[0], b.realIndexerName())
|
||||
res, err := b.client.Alias().
|
||||
Remove(realIndexerNames[0], b.indexerAliasName).
|
||||
Add(b.realIndexerName(), b.indexerAliasName).
|
||||
Do(ctx)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !res.Acknowledged {
|
||||
return false, fmt.Errorf("")
|
||||
}
|
||||
}
|
||||
|
||||
return exists, nil
|
||||
}
|
||||
|
||||
func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) {
|
||||
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
|
||||
RunInDir(repo.RepoPath())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
|
||||
return nil, fmt.Errorf("Misformatted git cat-file output: %v", err)
|
||||
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
|
||||
return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
|
||||
}
|
||||
|
||||
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
|
||||
RunInDirBytes(repo.RepoPath())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if !base.IsTextFile(fileContents) {
|
||||
// FIXME: UTF-16 files will probably fail here
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
id := filenameIndexerID(repo.ID, update.Filename)
|
||||
|
||||
return []elastic.BulkableRequest{
|
||||
elastic.NewBulkIndexRequest().
|
||||
Index(b.indexerAliasName).
|
||||
Id(id).
|
||||
Doc(map[string]interface{}{
|
||||
"repo_id": repo.ID,
|
||||
"content": string(charset.ToUTF8DropErrors(fileContents)),
|
||||
"commit_id": sha,
|
||||
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
"updated_at": timeutil.TimeStampNow(),
|
||||
}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (b *ElasticSearchIndexer) addDelete(filename string, repo *models.Repository) elastic.BulkableRequest {
|
||||
id := filenameIndexerID(repo.ID, filename)
|
||||
return elastic.NewBulkDeleteRequest().
|
||||
Index(b.indexerAliasName).
|
||||
Id(id)
|
||||
}
|
||||
|
||||
// Index will save the index data
|
||||
func (b *ElasticSearchIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
|
||||
reqs := make([]elastic.BulkableRequest, 0)
|
||||
for _, update := range changes.Updates {
|
||||
updateReqs, err := b.addUpdate(sha, update, repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(updateReqs) > 0 {
|
||||
reqs = append(reqs, updateReqs...)
|
||||
}
|
||||
}
|
||||
|
||||
for _, filename := range changes.RemovedFilenames {
|
||||
reqs = append(reqs, b.addDelete(filename, repo))
|
||||
}
|
||||
|
||||
if len(reqs) > 0 {
|
||||
_, err := b.client.Bulk().
|
||||
Index(b.indexerAliasName).
|
||||
Add(reqs...).
|
||||
Do(context.Background())
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete deletes indexes by ids
|
||||
func (b *ElasticSearchIndexer) Delete(repoID int64) error {
|
||||
_, err := b.client.DeleteByQuery(b.indexerAliasName).
|
||||
Query(elastic.NewTermsQuery("repo_id", repoID)).
|
||||
Do(context.Background())
|
||||
return err
|
||||
}
|
||||
|
||||
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
|
||||
hits := make([]*SearchResult, 0, pageSize)
|
||||
for _, hit := range searchResult.Hits.Hits {
|
||||
// FIXME: There is no way to get the position the keyword on the content currently on the same request.
|
||||
// So we get it from content, this may made the query slower. See
|
||||
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
|
||||
var startIndex, endIndex int = -1, -1
|
||||
c, ok := hit.Highlight["content"]
|
||||
if ok && len(c) > 0 {
|
||||
var subStr = make([]rune, 0, len(kw))
|
||||
startIndex = strings.IndexFunc(c[0], func(r rune) bool {
|
||||
if len(subStr) >= len(kw) {
|
||||
subStr = subStr[1:]
|
||||
}
|
||||
subStr = append(subStr, r)
|
||||
return strings.EqualFold(kw, string(subStr))
|
||||
})
|
||||
if startIndex > -1 {
|
||||
endIndex = startIndex + len(kw)
|
||||
} else {
|
||||
panic(fmt.Sprintf("1===%#v", hit.Highlight))
|
||||
}
|
||||
} else {
|
||||
panic(fmt.Sprintf("2===%#v", hit.Highlight))
|
||||
}
|
||||
|
||||
repoID, fileName := parseIndexerID(hit.Id)
|
||||
var res = make(map[string]interface{})
|
||||
if err := json.Unmarshal(hit.Source, &res); err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
language := res["language"].(string)
|
||||
|
||||
hits = append(hits, &SearchResult{
|
||||
RepoID: repoID,
|
||||
Filename: fileName,
|
||||
CommitID: res["commit_id"].(string),
|
||||
Content: res["content"].(string),
|
||||
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
|
||||
Language: language,
|
||||
StartIndex: startIndex,
|
||||
EndIndex: endIndex,
|
||||
Color: enry.GetColor(language),
|
||||
})
|
||||
}
|
||||
|
||||
return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
|
||||
}
|
||||
|
||||
func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
|
||||
var searchResultLanguages []*SearchResultLanguages
|
||||
agg, found := searchResult.Aggregations.Terms("language")
|
||||
if found {
|
||||
searchResultLanguages = make([]*SearchResultLanguages, 0, 10)
|
||||
|
||||
for _, bucket := range agg.Buckets {
|
||||
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
|
||||
Language: bucket.Key.(string),
|
||||
Color: enry.GetColor(bucket.Key.(string)),
|
||||
Count: int(bucket.DocCount),
|
||||
})
|
||||
}
|
||||
}
|
||||
return searchResultLanguages
|
||||
}
|
||||
|
||||
// Search searches for codes and language stats by given conditions.
|
||||
func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
|
||||
kwQuery := elastic.NewMultiMatchQuery(keyword, "content")
|
||||
query := elastic.NewBoolQuery()
|
||||
query = query.Must(kwQuery)
|
||||
if len(repoIDs) > 0 {
|
||||
var repoStrs = make([]interface{}, 0, len(repoIDs))
|
||||
for _, repoID := range repoIDs {
|
||||
repoStrs = append(repoStrs, repoID)
|
||||
}
|
||||
repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
|
||||
query = query.Must(repoQuery)
|
||||
}
|
||||
|
||||
var (
|
||||
start int
|
||||
kw = "<em>" + keyword + "</em>"
|
||||
aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
|
||||
)
|
||||
|
||||
if page > 0 {
|
||||
start = (page - 1) * pageSize
|
||||
}
|
||||
|
||||
if len(language) == 0 {
|
||||
searchResult, err := b.client.Search().
|
||||
Index(b.indexerAliasName).
|
||||
Aggregation("language", aggregation).
|
||||
Query(query).
|
||||
Highlight(elastic.NewHighlight().Field("content")).
|
||||
Sort("repo_id", true).
|
||||
From(start).Size(pageSize).
|
||||
Do(context.Background())
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
return convertResult(searchResult, kw, pageSize)
|
||||
}
|
||||
|
||||
langQuery := elastic.NewMatchQuery("language", language)
|
||||
countResult, err := b.client.Search().
|
||||
Index(b.indexerAliasName).
|
||||
Aggregation("language", aggregation).
|
||||
Query(query).
|
||||
Size(0). // We only needs stats information
|
||||
Do(context.Background())
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
query = query.Must(langQuery)
|
||||
searchResult, err := b.client.Search().
|
||||
Index(b.indexerAliasName).
|
||||
Query(query).
|
||||
Highlight(elastic.NewHighlight().Field("content")).
|
||||
Sort("repo_id", true).
|
||||
From(start).Size(pageSize).
|
||||
Do(context.Background())
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
total, hits, _, err := convertResult(searchResult, kw, pageSize)
|
||||
|
||||
return total, hits, extractAggs(countResult), err
|
||||
}
|
||||
|
||||
// Close implements indexer
|
||||
func (b *ElasticSearchIndexer) Close() {}
|
@ -0,0 +1,36 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"code.gitea.io/gitea/models"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestESIndexAndSearch(t *testing.T) {
|
||||
models.PrepareTestEnv(t)
|
||||
|
||||
u := os.Getenv("TEST_INDEXER_CODE_ES_URL")
|
||||
if u == "" {
|
||||
t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
indexer, _, err := NewElasticSearchIndexer(u, "gitea_codes")
|
||||
if err != nil {
|
||||
assert.Fail(t, "Unable to create ES indexer Error: %v", err)
|
||||
if indexer != nil {
|
||||
indexer.Close()
|
||||
}
|
||||
return
|
||||
}
|
||||
defer indexer.Close()
|
||||
|
||||
testIndexer("elastic_search", t, indexer)
|
||||
}
|
@ -0,0 +1,83 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"code.gitea.io/gitea/models"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
models.MainTest(m, filepath.Join("..", "..", ".."))
|
||||
}
|
||||
|
||||
func testIndexer(name string, t *testing.T, indexer Indexer) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
var repoID int64 = 1
|
||||
err := index(indexer, repoID)
|
||||
assert.NoError(t, err)
|
||||
var (
|
||||
keywords = []struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
IDs []int64
|
||||
Langs int
|
||||
}{
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "Description",
|
||||
IDs: []int64{repoID},
|
||||
Langs: 1,
|
||||
},
|
||||
{
|
||||
RepoIDs: []int64{2},
|
||||
Keyword: "Description",
|
||||
IDs: []int64{},
|
||||
Langs: 0,
|
||||
},
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "repo1",
|
||||
IDs: []int64{repoID},
|
||||
Langs: 1,
|
||||
},
|
||||
{
|
||||
RepoIDs: []int64{2},
|
||||
Keyword: "repo1",
|
||||
IDs: []int64{},
|
||||
Langs: 0,
|
||||
},
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "non-exist",
|
||||
IDs: []int64{},
|
||||
Langs: 0,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
for _, kw := range keywords {
|
||||
t.Run(kw.Keyword, func(t *testing.T) {
|
||||
total, res, langs, err := indexer.Search(kw.RepoIDs, "", kw.Keyword, 1, 10)
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, len(kw.IDs), total)
|
||||
assert.EqualValues(t, kw.Langs, len(langs))
|
||||
|
||||
var ids = make([]int64, 0, len(res))
|
||||
for _, hit := range res {
|
||||
ids = append(ids, hit.RepoID)
|
||||
assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
|
||||
}
|
||||
assert.EqualValues(t, kw.IDs, ids)
|
||||
})
|
||||
}
|
||||
|
||||
assert.NoError(t, indexer.Delete(repoID))
|
||||
})
|
||||
}
|
Loading…
Reference in New Issue