Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Switched content detection from base to typesniffer.
  • Loading branch information
KN4CK3R committed Feb 28, 2021
commit bc73503a23ff36fd2f4e3c34e967b6e7b46342a5
68 changes: 0 additions & 68 deletions modules/base/tool.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@ import (
"encoding/hex"
"errors"
"fmt"
"net/http"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
Expand All @@ -29,15 +27,6 @@ import (
"github.com/dustin/go-humanize"
)

// Use at most this many bytes to determine Content Type.
const sniffLen = 512

// SVGMimeType MIME type of SVG images.
const SVGMimeType = "image/svg+xml"

var svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!--.*?-->|<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg[\s>\/]`)
var svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!--.*?-->|<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg[\s>\/]`)

// EncodeMD5 encodes string to md5 hex value.
func EncodeMD5(str string) string {
m := md5.New()
Expand Down Expand Up @@ -275,63 +264,6 @@ func IsLetter(ch rune) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
}

// DetectContentType extends http.DetectContentType with more content types.
func DetectContentType(data []byte) string {
ct := http.DetectContentType(data)

if len(data) > sniffLen {
data = data[:sniffLen]
}

if setting.UI.SVG.Enabled &&
((strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")) && svgTagRegex.Match(data) ||
strings.Contains(ct, "text/xml") && svgTagInXMLRegex.Match(data)) {

// SVG is unsupported. https://github.com/golang/go/issues/15888
return SVGMimeType
}
return ct
}

// IsRepresentableAsText returns true if file content can be represented as
// plain text or is empty.
func IsRepresentableAsText(data []byte) bool {
return IsTextFile(data) || IsSVGImageFile(data)
}

// IsTextFile returns true if file content format is plain text or empty.
func IsTextFile(data []byte) bool {
if len(data) == 0 {
return true
}
return strings.Contains(DetectContentType(data), "text/")
}

// IsImageFile detects if data is an image format
func IsImageFile(data []byte) bool {
return strings.Contains(DetectContentType(data), "image/")
}

// IsSVGImageFile detects if data is an SVG image format
func IsSVGImageFile(data []byte) bool {
return strings.Contains(DetectContentType(data), SVGMimeType)
}

// IsPDFFile detects if data is a pdf format
func IsPDFFile(data []byte) bool {
return strings.Contains(DetectContentType(data), "application/pdf")
}

// IsVideoFile detects if data is an video format
func IsVideoFile(data []byte) bool {
return strings.Contains(DetectContentType(data), "video/")
}

// IsAudioFile detects if data is an video format
func IsAudioFile(data []byte) bool {
return strings.Contains(DetectContentType(data), "audio/")
}

// EntryIcon returns the octicon class for displaying files/directories
func EntryIcon(entry *git.TreeEntry) string {
switch {
Expand Down
92 changes: 0 additions & 92 deletions modules/base/tool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
package base

import (
"encoding/base64"
"os"
"testing"
"time"
Expand Down Expand Up @@ -238,97 +237,6 @@ func TestIsLetter(t *testing.T) {
assert.False(t, IsLetter(0x93))
}

func TestDetectContentTypeLongerThanSniffLen(t *testing.T) {
// Pre-condition: Shorter than sniffLen detects SVG.
assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)))
// Longer than sniffLen detects something else.
assert.Equal(t, "text/plain; charset=utf-8", DetectContentType([]byte(`<!--
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment Comment Comment Comment Comment Comment Comment Comment
Comment Comment Comment --><svg></svg>`)))
}

// IsRepresentableAsText

func TestIsTextFile(t *testing.T) {
assert.True(t, IsTextFile([]byte{}))
assert.True(t, IsTextFile([]byte("lorem ipsum")))
}

func TestIsImageFile(t *testing.T) {
png, _ := base64.StdEncoding.DecodeString("iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAG0lEQVQYlWN4+vTpf3SMDTAMBYXYBLFpHgoKAeiOf0SGE9kbAAAAAElFTkSuQmCC")
assert.True(t, IsImageFile(png))
assert.False(t, IsImageFile([]byte("plain text")))
}

func TestIsSVGImageFile(t *testing.T) {
assert.True(t, IsSVGImageFile([]byte("<svg></svg>")))
assert.True(t, IsSVGImageFile([]byte(" <svg></svg>")))
assert.True(t, IsSVGImageFile([]byte(`<svg width="100"></svg>`)))
assert.True(t, IsSVGImageFile([]byte("<svg/>")))
assert.True(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<!-- Comment -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<!-- Multiple -->
<!-- Comments -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<!-- Multiline
Comment -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Comment -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Multiple -->
<!-- Comments -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Multline
Comment -->
<svg></svg>`)))
assert.True(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Multline
Comment -->
<svg></svg>`)))
assert.False(t, IsSVGImageFile([]byte{}))
assert.False(t, IsSVGImageFile([]byte("svg")))
assert.False(t, IsSVGImageFile([]byte("<svgfoo></svgfoo>")))
assert.False(t, IsSVGImageFile([]byte("text<svg></svg>")))
assert.False(t, IsSVGImageFile([]byte("<html><body><svg></svg></body></html>")))
assert.False(t, IsSVGImageFile([]byte(`<script>"<svg></svg>"</script>`)))
assert.False(t, IsSVGImageFile([]byte(`<!-- <svg></svg> inside comment -->
<foo></foo>`)))
assert.False(t, IsSVGImageFile([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- <svg></svg> inside comment -->
<foo></foo>`)))
}

func TestIsPDFFile(t *testing.T) {
pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe")
assert.True(t, IsPDFFile(pdf))
assert.False(t, IsPDFFile([]byte("plain text")))
}

func TestIsVideoFile(t *testing.T) {
mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA")
assert.True(t, IsVideoFile(mp4))
assert.False(t, IsVideoFile([]byte("plain text")))
}

func TestIsAudioFile(t *testing.T) {
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
assert.True(t, IsAudioFile(mp3))
assert.False(t, IsAudioFile([]byte("plain text")))
}

// TODO: Test EntryIcon

func TestSetupGiteaRoot(t *testing.T) {
Expand Down
4 changes: 2 additions & 2 deletions modules/indexer/code/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ import (

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
"code.gitea.io/gitea/modules/util"

"github.com/blevesearch/bleve/v2"
Expand Down Expand Up @@ -200,7 +200,7 @@ func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *mode
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions modules/indexer/code/elastic_search.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ import (

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"

"github.com/go-enry/go-enry/v2"
"github.com/olivere/elastic/v7"
Expand Down Expand Up @@ -199,7 +199,7 @@ func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *mo
RunInDirBytes(repo.RepoPath())
if err != nil {
return nil, err
} else if !base.IsTextFile(fileContents) {
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
return nil, nil
}
Expand Down
4 changes: 2 additions & 2 deletions modules/lfs/pointers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ import (
"strings"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/typesniffer"
)

// ReadPointerFile will return a partially filled LFSMetaObject if the provided reader is a pointer file
Expand All @@ -25,7 +25,7 @@ func ReadPointerFile(reader io.Reader) (*models.LFSMetaObject, *[]byte) {
n, _ := reader.Read(buf)
buf = buf[:n]

if isTextFile := base.IsTextFile(buf); !isTextFile {
if !typesniffer.DetectContentType(buf).IsText() {
return nil, nil
}

Expand Down
5 changes: 3 additions & 2 deletions modules/typesniffer/typesniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const SvgMimeType = "image/svg+xml"
var svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!--.*?-->|<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg[\s>\/]`)
var svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!--.*?-->|<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg[\s>\/]`)

// SniffedType contains informations about a blobs type.
type SniffedType struct {
contentType string
}
Expand All @@ -38,8 +39,8 @@ func (ct SniffedType) IsSvgImage() bool {
return strings.Contains(ct.contentType, SvgMimeType)
}

// IsPdf detects if data is a pdf format
func (ct SniffedType) IsPdf() bool {
// IsPDF detects if data is a PDF format
func (ct SniffedType) IsPDF() bool {
return strings.Contains(ct.contentType, "application/pdf")
}

Expand Down
20 changes: 18 additions & 2 deletions modules/typesniffer/typesniffer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package typesniffer

import (
"encoding/base64"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -75,5 +76,20 @@ func TestIsSvgImage(t *testing.T) {
<foo></foo>`)).IsSvgImage())
}

// TODO: IsImageFile(), currently no idea how to test
// TODO: IsPDFFile(), currently no idea how to test
func TestIsPDFFile(t *testing.T) {
pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe")
assert.True(t, DetectContentType(pdf).IsPDF())
assert.False(t, DetectContentType([]byte("plain text")).IsPDF())
}

func TestIsVideoFile(t *testing.T) {
mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA")
assert.True(t, DetectContentType(mp4).IsVideo())
assert.False(t, DetectContentType([]byte("plain text")).IsVideo())
}

func TestIsAudioFile(t *testing.T) {
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
assert.True(t, DetectContentType(mp3).IsAudio())
assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
}
27 changes: 16 additions & 11 deletions routers/repo/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ import (
"path"
"strings"

"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/typesniffer"
)

// ServeData download file from io.Reader
Expand All @@ -41,24 +42,28 @@ func ServeData(ctx *context.Context, name string, size int64, reader io.Reader)
// Google Chrome dislike commas in filenames, so let's change it to a space
name = strings.ReplaceAll(name, ",", " ")

if base.IsTextFile(buf) || ctx.QueryBool("render") {
st := typesniffer.DetectContentType(buf)

if st.IsText() || ctx.QueryBool("render") {
cs, err := charset.DetectEncoding(buf)
if err != nil {
log.Error("Detect raw file %s charset failed: %v, using by default utf-8", name, err)
cs = "utf-8"
}
ctx.Resp.Header().Set("Content-Type", "text/plain; charset="+strings.ToLower(cs))
} else if base.IsImageFile(buf) || base.IsPDFFile(buf) {
ctx.Resp.Header().Set("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, name))
ctx.Resp.Header().Set("Access-Control-Expose-Headers", "Content-Disposition")
if base.IsSVGImageFile(buf) {
ctx.Resp.Header().Set("Content-Security-Policy", "default-src 'none'; style-src 'unsafe-inline'; sandbox")
ctx.Resp.Header().Set("X-Content-Type-Options", "nosniff")
ctx.Resp.Header().Set("Content-Type", base.SVGMimeType)
}
} else {
ctx.Resp.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, name))
ctx.Resp.Header().Set("Access-Control-Expose-Headers", "Content-Disposition")

if (st.IsImage() || st.IsPDF()) && (setting.UI.SVG.Enabled || !st.IsSvgImage()) {
ctx.Resp.Header().Set("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, name))
if st.IsSvgImage() {
ctx.Resp.Header().Set("Content-Security-Policy", "default-src 'none'; style-src 'unsafe-inline'; sandbox")
ctx.Resp.Header().Set("X-Content-Type-Options", "nosniff")
ctx.Resp.Header().Set("Content-Type", typesniffer.SvgMimeType)
}
} else {
ctx.Resp.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, name))
}
}

_, err = ctx.Resp.Write(buf)
Expand Down
5 changes: 3 additions & 2 deletions routers/repo/editor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"code.gitea.io/gitea/modules/repofiles"
repo_module "code.gitea.io/gitea/modules/repository"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/typesniffer"
"code.gitea.io/gitea/modules/upload"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/modules/web"
Expand Down Expand Up @@ -115,8 +116,8 @@ func editFile(ctx *context.Context, isNewFile bool) {
buf = buf[:n]

// Only some file types are editable online as text.
if !base.IsRepresentableAsText(buf) {
ctx.NotFound("base.IsRepresentableAsText", nil)
if !typesniffer.DetectContentType(buf).IsRepresentableAsText() {
ctx.NotFound("typesniffer.IsRepresentableAsText", nil)
return
}

Expand Down
Loading