gomuks/ui/messages/html/parser.go

546 lines
15 KiB
Go
Raw Normal View History

// gomuks - A terminal Matrix client written in Go.
// Copyright (C) 2020 Tulir Asokan
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

package html
2018-04-13 23:34:25 +02:00
import (
"fmt"
"regexp"
2019-01-17 13:13:25 +01:00
"strconv"
2018-04-13 23:34:25 +02:00
"strings"
2019-04-07 19:13:23 +02:00
"github.com/alecthomas/chroma"
"github.com/alecthomas/chroma/lexers"
"github.com/alecthomas/chroma/styles"
"github.com/lucasb-eyer/go-colorful"
2018-06-01 23:44:21 +02:00
"golang.org/x/net/html"
"mvdan.cc/xurls/v2"
2019-01-17 13:13:25 +01:00
2022-04-15 11:53:09 +02:00
"go.mau.fi/tcell"
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/id"
2020-08-15 06:59:02 +02:00
2020-08-18 17:01:19 +02:00
"maunium.net/go/gomuks/config"
"maunium.net/go/gomuks/matrix/muksevt"
"maunium.net/go/gomuks/matrix/rooms"
"maunium.net/go/gomuks/ui/widget"
2018-04-13 23:34:25 +02:00
)
2018-05-31 15:59:40 +02:00
type htmlParser struct {
2020-08-17 21:03:28 +02:00
prefs *config.UserPreferences
room *rooms.Room
evt *muksevt.Event
preserveWhitespace bool
linkIDCounter int
2018-05-31 15:59:40 +02:00
}
func AdjustStyleBold(style tcell.Style) tcell.Style {
2018-05-31 15:59:40 +02:00
return style.Bold(true)
}
func AdjustStyleItalic(style tcell.Style) tcell.Style {
2018-05-31 15:59:40 +02:00
return style.Italic(true)
}
func AdjustStyleUnderline(style tcell.Style) tcell.Style {
2018-05-31 15:59:40 +02:00
return style.Underline(true)
}
func AdjustStyleStrikethrough(style tcell.Style) tcell.Style {
2022-04-15 11:53:09 +02:00
return style.StrikeThrough(true)
2018-04-13 23:34:25 +02:00
}
// AdjustStyleTextColor returns an AdjustStyleFunc that sets the foreground
// of a style to the given color.
func AdjustStyleTextColor(color tcell.Color) AdjustStyleFunc {
	setForeground := func(style tcell.Style) tcell.Style {
		return style.Foreground(color)
	}
	return setForeground
}
func AdjustStyleBackgroundColor(color tcell.Color) AdjustStyleFunc {
2019-03-26 21:09:10 +01:00
return func(style tcell.Style) tcell.Style {
return style.Background(color)
}
}
// AdjustStyleLink returns an AdjustStyleFunc that attaches a hyperlink with
// the given URL and link ID to a style.
func AdjustStyleLink(url, id string) AdjustStyleFunc {
	setHyperlink := func(style tcell.Style) tcell.Style {
		return style.Hyperlink(url, id)
	}
	return setHyperlink
}
func (parser *htmlParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) {
2018-06-01 23:28:21 +02:00
for _, attr := range node.Attr {
if attr.Key == attribute {
return attr.Val, true
2018-06-01 23:28:21 +02:00
}
}
return "", false
}
func (parser *htmlParser) getAttribute(node *html.Node, attribute string) string {
val, _ := parser.maybeGetAttribute(node, attribute)
return val
2018-06-01 23:28:21 +02:00
}
// hasAttribute reports whether node carries the given attribute, regardless
// of its value.
func (parser *htmlParser) hasAttribute(node *html.Node, attribute string) bool {
	_, present := parser.maybeGetAttribute(node, attribute)
	return present
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) listToEntity(node *html.Node) Entity {
children := parser.nodeToEntities(node.FirstChild)
2018-05-31 15:59:40 +02:00
ordered := node.Data == "ol"
start := 1
2018-05-31 15:59:40 +02:00
if ordered {
if startRaw := parser.getAttribute(node, "start"); len(startRaw) > 0 {
var err error
start, err = strconv.Atoi(startRaw)
if err != nil {
start = 1
}
2018-06-01 23:28:21 +02:00
}
}
listItems := children[:0]
for _, child := range children {
if child.GetTag() == "li" {
listItems = append(listItems, child)
2018-05-31 15:59:40 +02:00
}
2019-04-07 02:22:51 +02:00
}
2019-04-09 17:42:49 +02:00
return NewListEntity(ordered, start, listItems)
2018-04-13 23:34:25 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) basicFormatToEntity(node *html.Node) Entity {
entity := &ContainerEntity{
BaseEntity: &BaseEntity{
Tag: node.Data,
},
Children: parser.nodeToEntities(node.FirstChild),
2019-04-07 02:22:51 +02:00
}
2018-05-31 15:59:40 +02:00
switch node.Data {
case "b", "strong":
entity.AdjustStyle(AdjustStyleBold, AdjustStyleReasonNormal)
2018-05-31 15:59:40 +02:00
case "i", "em":
entity.AdjustStyle(AdjustStyleItalic, AdjustStyleReasonNormal)
case "s", "del", "strike":
entity.AdjustStyle(AdjustStyleStrikethrough, AdjustStyleReasonNormal)
2018-05-31 15:59:40 +02:00
case "u", "ins":
entity.AdjustStyle(AdjustStyleUnderline, AdjustStyleReasonNormal)
case "code":
bgColor := tcell.ColorDarkSlateGray
fgColor := tcell.ColorWhite
entity.AdjustStyle(AdjustStyleBackgroundColor(bgColor), AdjustStyleReasonNormal)
entity.AdjustStyle(AdjustStyleTextColor(fgColor), AdjustStyleReasonNormal)
case "font", "span":
2019-04-07 02:22:51 +02:00
fgColor, ok := parser.parseColor(node, "data-mx-color", "color")
if ok {
entity.AdjustStyle(AdjustStyleTextColor(fgColor), AdjustStyleReasonNormal)
2019-04-07 02:22:51 +02:00
}
bgColor, ok := parser.parseColor(node, "data-mx-bg-color", "background-color")
if ok {
entity.AdjustStyle(AdjustStyleBackgroundColor(bgColor), AdjustStyleReasonNormal)
}
spoilerReason, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler")
if isSpoiler {
return NewSpoilerEntity(entity, spoilerReason)
2019-04-07 02:22:51 +02:00
}
}
2019-04-07 02:22:51 +02:00
return entity
}
2019-03-26 21:09:10 +01:00
func (parser *htmlParser) parseColor(node *html.Node, mainName, altName string) (color tcell.Color, ok bool) {
hex := parser.getAttribute(node, mainName)
if len(hex) == 0 {
2019-03-26 21:09:10 +01:00
hex = parser.getAttribute(node, altName)
if len(hex) == 0 {
2019-03-26 21:09:10 +01:00
return
}
}
2019-03-26 21:09:10 +01:00
cful, err := colorful.Hex(hex)
if err != nil {
2019-04-09 17:42:49 +02:00
color2, found := colorMap[strings.ToLower(hex)]
2019-03-26 21:09:10 +01:00
if !found {
return
}
2019-03-26 21:09:10 +01:00
cful, _ = colorful.MakeColor(color2)
}
2019-03-26 21:09:10 +01:00
r, g, b := cful.RGB255()
return tcell.NewRGBColor(int32(r), int32(g), int32(b)), true
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) headerToEntity(node *html.Node) Entity {
return (&ContainerEntity{
BaseEntity: &BaseEntity{
Tag: node.Data,
},
Children: append(
[]Entity{NewTextEntity(strings.Repeat("#", int(node.Data[1]-'0')) + " ")},
2020-08-15 06:59:02 +02:00
parser.nodeToEntities(node.FirstChild)...,
),
}).AdjustStyle(AdjustStyleBold, AdjustStyleReasonNormal)
2018-05-31 15:59:40 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) blockquoteToEntity(node *html.Node) Entity {
return NewBlockquoteEntity(parser.nodeToEntities(node.FirstChild))
2018-05-31 15:59:40 +02:00
}
2019-04-14 23:34:48 +02:00
func (parser *htmlParser) linkToEntity(node *html.Node) Entity {
sameURL := false
2020-08-15 06:59:02 +02:00
href := parser.getAttribute(node, "href")
2019-04-14 23:34:48 +02:00
entity := &ContainerEntity{
BaseEntity: &BaseEntity{
Tag: "a",
},
Children: parser.nodeToEntities(node.FirstChild),
2019-04-07 02:22:51 +02:00
}
2020-08-15 06:59:02 +02:00
if len(href) == 0 {
2019-04-14 23:34:48 +02:00
return entity
2018-05-31 15:59:40 +02:00
}
2020-08-15 06:59:02 +02:00
if len(entity.Children) == 1 {
entity, ok := entity.Children[0].(*TextEntity)
if ok && entity.Text == href {
sameURL = true
}
}
2020-08-15 06:59:02 +02:00
matrixURI, _ := id.ParseMatrixURIOrMatrixToURL(href)
if matrixURI != nil && (matrixURI.Sigil1 == '@' || matrixURI.Sigil1 == '#') && matrixURI.Sigil2 == 0 {
text := NewTextEntity(matrixURI.PrimaryIdentifier())
if matrixURI.Sigil1 == '@' {
if member := parser.room.GetMember(matrixURI.UserID()); member != nil {
2019-04-14 23:34:48 +02:00
text.Text = member.Displayname
text.Style = text.Style.Foreground(widget.GetHashColor(matrixURI.UserID()))
2018-05-31 15:59:40 +02:00
}
2019-04-14 23:34:48 +02:00
entity.Children = []Entity{text}
} else if matrixURI.Sigil1 == '#' {
2019-04-14 23:34:48 +02:00
entity.Children = []Entity{text}
2018-05-31 15:59:40 +02:00
}
} else if parser.prefs.EnableInlineURLs() {
linkID := fmt.Sprintf("%s-%d", parser.evt.ID, parser.linkIDCounter)
parser.linkIDCounter++
entity.AdjustStyle(AdjustStyleLink(href, linkID), AdjustStyleReasonNormal)
} else if !sameURL && !parser.prefs.DisableShowURLs && !parser.hasAttribute(node, "data-mautrix-exclude-plaintext") {
entity.Children = append(entity.Children, NewTextEntity(fmt.Sprintf(" (%s)", href)))
}
2019-04-14 23:34:48 +02:00
return entity
2018-04-13 23:34:25 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) imageToEntity(node *html.Node) Entity {
2019-04-07 19:13:23 +02:00
alt := parser.getAttribute(node, "alt")
if len(alt) == 0 {
alt = parser.getAttribute(node, "title")
if len(alt) == 0 {
alt = "[inline image]"
}
}
entity := &TextEntity{
BaseEntity: &BaseEntity{
Tag: "img",
},
2019-04-07 19:13:23 +02:00
Text: alt,
}
// TODO add click action and underline on hover for inline images
return entity
}
// colourToColor converts a chroma colour into a tcell color. Unset colours
// map to tcell.ColorDefault.
func colourToColor(colour chroma.Colour) tcell.Color {
	if colour.IsSet() {
		red, green, blue := colour.Red(), colour.Green(), colour.Blue()
		return tcell.NewRGBColor(int32(red), int32(green), int32(blue))
	}
	return tcell.ColorDefault
}
// styleEntryToStyle converts a chroma style entry into a tcell style, carrying
// over the bold, italic and underline flags and the foreground and background
// colours.
func styleEntryToStyle(se chroma.StyleEntry) tcell.Style {
	style := tcell.StyleDefault
	style = style.Bold(se.Bold == chroma.Yes)
	style = style.Italic(se.Italic == chroma.Yes)
	style = style.Underline(se.Underline == chroma.Yes)
	style = style.Foreground(colourToColor(se.Colour))
	style = style.Background(colourToColor(se.Background))
	return style
}
// tokenToTextEntity wraps a chroma token in a text entity. The entity is
// tagged with the token type's name and styled with the entry that the given
// chroma style defines for that token type.
func tokenToTextEntity(style *chroma.Style, token *chroma.Token) *TextEntity {
	base := &BaseEntity{
		Tag:           token.Type.String(),
		Style:         styleEntryToStyle(style.Get(token.Type)),
		DefaultHeight: 1,
	}
	return &TextEntity{
		BaseEntity: base,
		Text:       token.Value,
	}
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) syntaxHighlight(text, language string) Entity {
lexer := lexers.Get(strings.ToLower(language))
2019-04-07 19:13:23 +02:00
if lexer == nil {
lexer = lexers.Get("plaintext")
2019-04-07 19:13:23 +02:00
}
iter, err := lexer.Tokenise(nil, text)
if err != nil {
return nil
}
// TODO allow changing theme
2019-04-07 19:13:23 +02:00
style := styles.SolarizedDark
2019-04-07 19:13:23 +02:00
tokens := iter.Tokens()
var children []Entity
for _, token := range tokens {
lines := strings.SplitAfter(token.Value, "\n")
for _, line := range lines {
line_len := len(line)
if line_len == 0 {
continue
2019-04-07 19:13:23 +02:00
}
t := token.Clone()
if line[line_len-1:] == "\n" {
t.Value = line[:line_len-1]
children = append(children, tokenToTextEntity(style, &t), NewBreakEntity())
} else {
t.Value = line
children = append(children, tokenToTextEntity(style, &t))
}
2019-04-07 19:13:23 +02:00
}
}
2019-04-09 17:42:49 +02:00
return NewCodeBlockEntity(children, styleEntryToStyle(style.Get(chroma.Background)))
2019-04-07 19:13:23 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) codeblockToEntity(node *html.Node) Entity {
lang := "plaintext"
2019-04-07 19:13:23 +02:00
// TODO allow disabling syntax highlighting
if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
node = node.FirstChild
attr := parser.getAttribute(node, "class")
2019-04-07 19:13:23 +02:00
for _, class := range strings.Split(attr, " ") {
if strings.HasPrefix(class, "language-") {
lang = class[len("language-"):]
break
}
}
2019-04-07 02:22:51 +02:00
}
parser.preserveWhitespace = true
text := (&ContainerEntity{
Children: parser.nodeToEntities(node.FirstChild),
}).PlainText()
parser.preserveWhitespace = false
return parser.syntaxHighlight(text, lang)
2019-04-07 02:22:51 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) tagNodeToEntity(node *html.Node) Entity {
2018-05-31 15:59:40 +02:00
switch node.Data {
case "blockquote":
return parser.blockquoteToEntity(node)
2018-05-31 15:59:40 +02:00
case "ol", "ul":
return parser.listToEntity(node)
case "h1", "h2", "h3", "h4", "h5", "h6":
return parser.headerToEntity(node)
2018-05-31 15:59:40 +02:00
case "br":
2019-04-09 17:42:49 +02:00
return NewBreakEntity()
case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "font", "span", "code":
return parser.basicFormatToEntity(node)
case "a":
return parser.linkToEntity(node)
2019-04-07 19:13:23 +02:00
case "img":
return parser.imageToEntity(node)
2018-05-31 15:59:40 +02:00
case "pre":
2019-04-07 02:22:51 +02:00
return parser.codeblockToEntity(node)
case "hr":
return NewHorizontalLineEntity()
case "mx-reply":
return nil
2018-05-31 15:59:40 +02:00
default:
return &ContainerEntity{
BaseEntity: &BaseEntity{
Tag: node.Data,
Block: parser.isBlockTag(node.Data),
},
Children: parser.nodeToEntities(node.FirstChild),
2019-04-07 02:22:51 +02:00
}
}
2018-04-13 23:34:25 +02:00
}
// spaces matches one or more consecutive whitespace characters.
var spaces = regexp.MustCompile("\\s+")
2022-10-19 13:30:16 +02:00
// textToHTMLEntity turns plain text into an Entity, keeping line breaks.
//
// Text without any newline becomes a single text entity. Otherwise a "span"
// container is returned in which every newline-terminated line is a text
// entity followed by a break entity.
func textToHTMLEntity(text string) Entity {
	segments := strings.SplitAfter(text, "\n")
	if len(segments) == 1 {
		return NewTextEntity(text)
	}

	container := &ContainerEntity{
		BaseEntity: &BaseEntity{Tag: "span"},
	}
	for _, segment := range segments {
		if segment == "" {
			continue
		}
		trimmed := strings.TrimSuffix(segment, "\n")
		container.Children = append(container.Children, NewTextEntity(trimmed))
		// A shorter trimmed string means the segment ended with a newline.
		if len(trimmed) != len(segment) {
			container.Children = append(container.Children, NewBreakEntity())
		}
	}
	return container
}
func TextToEntity(text string, eventID id.EventID, linkify bool) Entity {
if len(text) == 0 {
return nil
}
if !linkify {
2022-10-19 13:30:16 +02:00
return textToHTMLEntity(text)
}
indices := xurls.Strict().FindAllStringIndex(text, -1)
if len(indices) == 0 {
2022-10-19 13:30:16 +02:00
return textToHTMLEntity(text)
}
ent := &ContainerEntity{
BaseEntity: &BaseEntity{Tag: "span"},
}
var lastEnd int
for i, item := range indices {
start, end := item[0], item[1]
if start > lastEnd {
2022-10-19 13:30:16 +02:00
ent.Children = append(ent.Children, textToHTMLEntity(text[lastEnd:start]))
}
link := text[start:end]
linkID := fmt.Sprintf("%s-%d", eventID, i)
ent.Children = append(ent.Children, NewTextEntity(link).AdjustStyle(AdjustStyleLink(link, linkID), AdjustStyleReasonNormal))
lastEnd = end
}
if lastEnd < len(text) {
2022-10-19 13:30:16 +02:00
ent.Children = append(ent.Children, textToHTMLEntity(text[lastEnd:]))
}
return ent
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) singleNodeToEntity(node *html.Node) Entity {
2018-05-31 15:59:40 +02:00
switch node.Type {
case html.TextNode:
if !parser.preserveWhitespace {
node.Data = strings.ReplaceAll(node.Data, "\n", "")
node.Data = spaces.ReplaceAllLiteralString(node.Data, " ")
2018-05-31 15:59:40 +02:00
}
return TextToEntity(node.Data, parser.evt.ID, parser.prefs.EnableInlineURLs())
2018-05-31 15:59:40 +02:00
case html.ElementNode:
parsed := parser.tagNodeToEntity(node)
if parsed != nil && !parsed.IsBlock() && parsed.IsEmpty() {
return nil
}
return parsed
2018-05-31 15:59:40 +02:00
case html.DocumentNode:
2019-04-07 17:21:38 +02:00
if node.FirstChild.Data == "html" && node.FirstChild.NextSibling == nil {
return parser.singleNodeToEntity(node.FirstChild)
2019-04-07 17:21:38 +02:00
}
return &ContainerEntity{
BaseEntity: &BaseEntity{
Tag: "html",
Block: true,
},
Children: parser.nodeToEntities(node.FirstChild),
2019-04-07 02:22:51 +02:00
}
2018-05-31 15:59:40 +02:00
default:
2019-04-07 02:22:51 +02:00
return nil
}
2018-04-13 23:34:25 +02:00
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) nodeToEntities(node *html.Node) (entities []Entity) {
2018-05-31 15:59:40 +02:00
for ; node != nil; node = node.NextSibling {
if entity := parser.singleNodeToEntity(node); entity != nil {
2019-04-07 02:22:51 +02:00
entities = append(entities, entity)
}
}
2018-05-31 15:59:40 +02:00
return
}
// BlockTags lists the HTML tag names that isBlockTag reports as block tags.
var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "blockquote", "div", "hr", "table"}
2018-05-31 15:59:40 +02:00
func (parser *htmlParser) isBlockTag(tag string) bool {
for _, blockTag := range BlockTags {
if tag == blockTag {
return true
}
2018-05-31 15:59:40 +02:00
}
return false
}
2019-04-09 17:42:49 +02:00
func (parser *htmlParser) Parse(htmlData string) Entity {
2018-05-31 15:59:40 +02:00
node, _ := html.Parse(strings.NewReader(htmlData))
2019-04-10 16:08:39 +02:00
bodyNode := node.FirstChild.FirstChild
for bodyNode != nil && (bodyNode.Type != html.ElementNode || bodyNode.Data != "body") {
bodyNode = bodyNode.NextSibling
}
if bodyNode != nil {
return parser.singleNodeToEntity(bodyNode)
}
return parser.singleNodeToEntity(node)
2018-04-13 23:34:25 +02:00
}
2019-04-09 17:42:49 +02:00
// TabLength is the number of spaces that Parse substitutes for each tab
// character in the message body.
const TabLength = 4
// Parse parses a HTML-formatted Matrix event into a UIMessage.
func Parse(prefs *config.UserPreferences, room *rooms.Room, content *event.MessageEventContent, evt *muksevt.Event, senderDisplayname string) Entity {
2020-04-19 14:00:49 +02:00
htmlData := content.FormattedBody
2020-04-19 14:00:49 +02:00
if content.Format != event.FormatHTML {
htmlData = strings.Replace(html.EscapeString(content.Body), "\n", "<br/>", -1)
2019-04-09 17:42:49 +02:00
}
htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", TabLength), -1)
2018-04-13 23:34:25 +02:00
parser := htmlParser{room: room, prefs: prefs, evt: evt}
2019-04-07 02:22:51 +02:00
root := parser.Parse(htmlData)
2022-04-19 11:01:56 +02:00
if root == nil {
return nil
}
beRoot, ok := root.(*ContainerEntity)
if ok {
beRoot.Block = false
if len(beRoot.Children) > 0 {
beChild, ok := beRoot.Children[0].(*ContainerEntity)
if ok && beChild.Tag == "p" {
// Hacky fix for m.emote
beChild.Block = false
}
2019-04-10 16:08:39 +02:00
}
}
2018-04-13 23:34:25 +02:00
2020-04-19 14:00:49 +02:00
if content.MsgType == event.MsgEmote {
root = &ContainerEntity{
BaseEntity: &BaseEntity{
Tag: "emote",
},
2019-04-09 17:42:49 +02:00
Children: []Entity{
NewTextEntity("* "),
NewTextEntity(senderDisplayname).AdjustStyle(AdjustStyleTextColor(widget.GetHashColor(evt.Sender)), AdjustStyleReasonNormal),
2019-04-09 17:42:49 +02:00
NewTextEntity(" "),
2019-04-07 02:22:51 +02:00
root,
},
}
2018-05-31 15:59:40 +02:00
}
2019-04-07 02:22:51 +02:00
return root
2018-04-13 23:34:25 +02:00
}