// Copyright 2022 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT package gitdiff import ( "bytes" "html/template" "strings" "github.com/sergi/go-diff/diffmatchpatch" ) // token is a html tag or entity, eg: "", "", "<" func extractHTMLToken(s string) (before, token, after string, valid bool) { for pos1 := 0; pos1 < len(s); pos1++ { if s[pos1] == '<' { pos2 := strings.IndexByte(s[pos1:], '>') if pos2 == -1 { return "", "", s, false } return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true } else if s[pos1] == '&' { pos2 := strings.IndexByte(s[pos1:], ';') if pos2 == -1 { return "", "", s, false } return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true } } return "", "", s, true } // highlightCodeDiff is used to do diff with highlighted HTML code. // It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. // The HTML tags and entities will be replaced by Unicode placeholders: "{TEXT}" => "\uE000{TEXT}\uE001" // These Unicode placeholders are friendly to the diff. // Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities. // It's guaranteed that the tags in final diff result are paired correctly. type highlightCodeDiff struct { placeholderBegin rune placeholderMaxCount int placeholderIndex int placeholderTokenMap map[rune]string tokenPlaceholderMap map[string]rune placeholderOverflowCount int lineWrapperTags []string } func newHighlightCodeDiff() *highlightCodeDiff { return &highlightCodeDiff{ placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD) placeholderMaxCount: 64000, placeholderTokenMap: map[rune]string{}, tokenPlaceholderMap: map[string]rune{}, } } // nextPlaceholder returns 0 if no more placeholder can be used // the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line // so the placeholderMaxCount is impossible to be exhausted in real cases. func (hcd *highlightCodeDiff) nextPlaceholder() rune { for hcd.placeholderIndex < hcd.placeholderMaxCount { r := hcd.placeholderBegin + rune(hcd.placeholderIndex) hcd.placeholderIndex++ // only use non-existing (not used by code) rune as placeholders if _, ok := hcd.placeholderTokenMap[r]; !ok { return r } } return 0 // no more available placeholder } func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool { return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount) } func (hcd *highlightCodeDiff) collectUsedRunes(code template.HTML) { for _, r := range code { if hcd.isInPlaceholderRange(r) { // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore. hcd.placeholderTokenMap[r] = "" } } } func (hcd *highlightCodeDiff) diffLineWithHighlight(lineType DiffLineType, codeA, codeB template.HTML) template.HTML { return hcd.diffLineWithHighlightWrapper(nil, lineType, codeA, codeB) } func (hcd *highlightCodeDiff) diffLineWithHighlightWrapper(lineWrapperTags []string, lineType DiffLineType, codeA, codeB template.HTML) template.HTML { hcd.collectUsedRunes(codeA) hcd.collectUsedRunes(codeB) convertedCodeA := hcd.convertToPlaceholders(codeA) convertedCodeB := hcd.convertToPlaceholders(codeB) dmp := defaultDiffMatchPatch() diffs := dmp.DiffMain(convertedCodeA, convertedCodeB, true) diffs = dmp.DiffCleanupSemantic(diffs) buf := bytes.NewBuffer(nil) // restore the line wrapper tags and , if necessary for _, tag := range lineWrapperTags { buf.WriteString(tag) } addedCodePrefix := hcd.registerTokenAsPlaceholder(``) removedCodePrefix := hcd.registerTokenAsPlaceholder(``) codeTagSuffix := hcd.registerTokenAsPlaceholder(``) if codeTagSuffix != 0 { for _, diff := range diffs { switch { case diff.Type == diffmatchpatch.DiffEqual: buf.WriteString(diff.Text) case diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd: buf.WriteRune(addedCodePrefix) buf.WriteString(diff.Text) buf.WriteRune(codeTagSuffix) case diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel: buf.WriteRune(removedCodePrefix) buf.WriteString(diff.Text) buf.WriteRune(codeTagSuffix) } } } else { // placeholder map space is exhausted for _, diff := range diffs { take := diff.Type == diffmatchpatch.DiffEqual || (diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd) || (diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel) if take { buf.WriteString(diff.Text) } } } for range lineWrapperTags { buf.WriteString("") } return hcd.recoverOneDiff(buf.String()) } func (hcd *highlightCodeDiff) registerTokenAsPlaceholder(token string) rune { placeholder, ok := hcd.tokenPlaceholderMap[token] if !ok { placeholder = hcd.nextPlaceholder() if placeholder != 0 { hcd.tokenPlaceholderMap[token] = placeholder hcd.placeholderTokenMap[placeholder] = token } } return placeholder } // convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. func (hcd *highlightCodeDiff) convertToPlaceholders(htmlContent template.HTML) string { var tagStack []string res := strings.Builder{} firstRunForLineTags := hcd.lineWrapperTags == nil var beforeToken, token string var valid bool htmlCode := string(htmlContent) // the standard chroma highlight HTML is " ... " for { beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode) if !valid || token == "" { break } // write the content before the token into result string, and consume the token in the string res.WriteString(beforeToken) // the line wrapper tags should be removed before diff if strings.HasPrefix(token, `") continue } var tokenInMap string if strings.HasSuffix(token, "" for "" tokenInMap = token + "" tagStack = tagStack[:len(tagStack)-1] } else if token[0] == '<' { // for opening tag tokenInMap = token tagStack = append(tagStack, token) } else if token[0] == '&' { // for html entity tokenInMap = token } // else: impossible // remember the placeholder and token in the map placeholder := hcd.registerTokenAsPlaceholder(tokenInMap) if placeholder != 0 { res.WriteRune(placeholder) // use the placeholder to replace the token } else { // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma. hcd.placeholderOverflowCount++ if strings.HasPrefix(token, "&") { // when the token is a html entity, something must be outputted even if there is no placeholder. res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully? res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result. } } } // write the remaining string res.WriteString(htmlCode) return res.String() } func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML { sb := strings.Builder{} var tagStack []string for _, r := range str { token, ok := hcd.placeholderTokenMap[r] if !ok || token == "" { sb.WriteRune(r) // if the rune is not a placeholder, write it as it is continue } var tokenToRecover string if strings.HasPrefix(token, "')+1] if len(tagStack) == 0 { continue // if no opening tag in stack yet, skip the closing tag } tagStack = tagStack[:len(tagStack)-1] } else if token[0] == '<' { // for opening tag tokenToRecover = token tagStack = append(tagStack, token) } else if token[0] == '&' { // for html entity tokenToRecover = token } // else: impossible sb.WriteString(tokenToRecover) } if len(tagStack) > 0 { // close all opening tags for i := len(tagStack) - 1; i >= 0; i-- { tagToClose := tagStack[i] // get the closing tag "" from "" or "" pos := strings.IndexAny(tagToClose, " >") if pos != -1 { sb.WriteString("") } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag } } return template.HTML(sb.String()) }