HTML Agility Pack：去除不在白名单中的标签

我想创建一个函数，删除不在白名单中的HTML标签和属性。我有以下HTML：

<b>first text </b> <b>second text here <a>some text here</a> <a>some text here</a> </b> <a>some twxt here</a>

我正在使用HTML Agility Pack，目前的代码如下：

 // Default tag whitelist: element names that are kept; any other element is unwrapped.
 static List<string> WhiteNodeList = new List<string> { "b" };

 // Default attribute whitelist: attribute names that are kept (none by default).
 static List<string> WhiteAttrList = new List<string> { };

 static HtmlNode htmlNode;

 /// <summary>
 /// Sanitizes every descendant of <paramref name="pNode"/> in place: elements whose
 /// name is not in <paramref name="pWhiteList"/> are removed while their content is
 /// kept, and attributes whose name is not in <paramref name="attrWhiteList"/> are
 /// removed from the elements that remain. <paramref name="pNode"/> itself is never
 /// removed or stripped.
 /// </summary>
 /// <param name="_output">Receives <c>pNode.OuterHtml</c> after sanitizing.</param>
 /// <param name="pNode">Root of the subtree to sanitize.</param>
 /// <param name="pWhiteList">Allowed element names (compared case-sensitively).</param>
 /// <param name="attrWhiteList">Allowed attribute names (compared case-sensitively).</param>
 public static void RemoveNotInWhiteList(out string _output, HtmlNode pNode, List<string> pWhiteList, List<string> attrWhiteList)
 {
     StripDescendants(pNode, pWhiteList, attrWhiteList);
     _output = pNode.OuterHtml;
 }

 // Recursively sanitizes the children of parent using the caller's whitelists.
 private static void StripDescendants(HtmlNode parent, List<string> tagWhiteList, List<string> attrWhiteList)
 {
     // Iterate over a snapshot: unwrapping a child changes parent.ChildNodes.
     foreach (HtmlNode child in parent.ChildNodes.ToList())
     {
         // Text, comment and other non-element nodes are left untouched.
         if (child.NodeType != HtmlNodeType.Element)
         {
             continue;
         }

         // Clean the subtree first so that anything promoted to this level
         // by the unwrap below has already been sanitized.
         StripDescendants(child, tagWhiteList, attrWhiteList);

         if (!tagWhiteList.Contains(child.Name))
         {
             // keepGrandChildren = true: drop the tag, keep its content.
             parent.RemoveChild(child, true);
             continue;
         }

         // Snapshot again: removing an attribute mutates child.Attributes.
         foreach (HtmlAttribute attribute in child.Attributes.Where(a => !attrWhiteList.Contains(a.Name)).ToList())
         {
             RemoveAttribute(attribute);
         }
     }
 }

 // Detaches the attribute from its element. The previous version rewrote the
 // value right before discarding it, which had no effect on the output.
 private static void RemoveAttribute(HtmlAttribute u)
 {
     u.Remove();
 }

 /// <summary>
 /// Parses an HTML fragment into a node.
 /// </summary>
 /// <param name="html">The HTML fragment to parse.</param>
 /// <returns>
 /// The single top-level node when the fragment has exactly one; otherwise the
 /// document node that wraps all top-level nodes.
 /// </returns>
 public static HtmlNode ConvertHtmlToNode(string html)
 {
     HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
     doc.LoadHtml(html);

     if (doc.DocumentNode.ChildNodes.Count == 1)
     {
         return doc.DocumentNode.ChildNodes[0];
     }

     return doc.DocumentNode;
 }

我想要得到的输出是：

 <b>first text </b> <b>second text here some text here some text here </b> some twxt here

也就是说，我只想保留<b>标签。
我这样做的原因是，有些用户会从MS Word复制内容并粘贴到我的所见即所得（WYSIWYG）HTML编辑器中。

谢谢。！

呃，看来我几乎已经找到答案了，就在某人写的一篇博客文章里……

 using System.Collections.Generic;
 using System.Linq;
 using HtmlAgilityPack;

 namespace Wayloop.Blog.Core.Markup
 {
     /// <summary>
     /// Whitelist-based HTML sanitizer. Elements that are not whitelisted are
     /// removed together with their content; on whitelisted elements only the
     /// attributes listed for that element are kept.
     /// </summary>
     public static class HtmlSanitizer
     {
         // Maps an allowed element name to its allowed attribute names.
         // A null array means the element may not carry any attribute.
         private static readonly IDictionary<string, string[]> Whitelist;

         static HtmlSanitizer()
         {
             Whitelist = new Dictionary<string, string[]>
             {
                 { "a", new[] { "href" } },
                 { "strong", null },
                 { "em", null },
                 { "blockquote", null },
             };
         }

         /// <summary>
         /// Parses <paramref name="input"/>, sanitizes it and returns the resulting markup.
         /// </summary>
         /// <param name="input">The HTML to sanitize.</param>
         /// <returns>The sanitized HTML with leading and trailing whitespace trimmed.</returns>
         public static string Sanitize(string input)
         {
             var htmlDocument = new HtmlDocument();
             htmlDocument.LoadHtml(input);
             SanitizeNode(htmlDocument.DocumentNode);
             return htmlDocument.DocumentNode.WriteTo().Trim();
         }

         // Sanitizes the children last-to-first so that removing a child does
         // not shift the indices still to be visited.
         private static void SanitizeChildren(HtmlNode parentNode)
         {
             for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
             {
                 SanitizeNode(parentNode.ChildNodes[i]);
             }
         }

         // Removes the node if it is a non-whitelisted element, otherwise strips
         // its disallowed attributes and continues with its children.
         private static void SanitizeNode(HtmlNode node)
         {
             if (node.NodeType == HtmlNodeType.Element)
             {
                 string[] allowedAttributes;
                 if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
                 {
                     node.ParentNode.RemoveChild(node);
                     return;
                 }

                 if (node.HasAttributes)
                 {
                     for (int i = node.Attributes.Count - 1; i >= 0; i--)
                     {
                         HtmlAttribute currentAttribute = node.Attributes[i];

                         // A null entry allows no attributes at all; calling Contains
                         // on it directly would throw, so check for null first.
                         if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                         {
                             node.Attributes.Remove(currentAttribute);
                         }
                     }
                 }
             }

             if (node.HasChildNodes)
             {
                 SanitizeChildren(node);
             }
         }
     }
 }

我是从这里得到HtmlSanitizer的。显然它并不是只剥离标签，而是把整个元素完全删除。

好的，下面是解决方案，供以后有需要的人参考。

 /// <summary>
 /// Whitelist-based HTML sanitizer. Elements that are not whitelisted lose their
 /// tag but keep their content; on whitelisted elements only the attributes
 /// listed for that element are kept.
 /// </summary>
 public static class HtmlSanitizer
 {
     // Every disallowed element is renamed to this placeholder before removal, so
     // the XPath below never has to contain an arbitrary (e.g. namespaced) tag name.
     private const string PlaceholderName = "removeablenode";

     // Selects every placeholder element in the document.
     private const string PlaceholderXPath = "//" + PlaceholderName;

     // Maps an allowed element name to its allowed attribute names.
     // A null array means the element may not carry any attribute.
     private static readonly IDictionary<string, string[]> Whitelist;

     static HtmlSanitizer()
     {
         Whitelist = new Dictionary<string, string[]>
         {
             { "a", new[] { "href" } },
             { "strong", null },
             { "em", null },
             { "blockquote", null },
             { "b", null },
             { "p", null },
             { "ul", null },
             { "ol", null },
             { "li", null },
             { "div", new[] { "align" } },
             { "strike", null },
             { "u", null },
             { "sub", null },
             { "sup", null },
             { "table", null },
             { "tr", null },
             { "td", null },
             { "th", null }
         };
     }

     /// <summary>
     /// Sanitizes <paramref name="input"/> against the whitelist.
     /// </summary>
     /// <param name="input">The HTML to sanitize; may be null or blank.</param>
     /// <returns>
     /// The sanitized HTML, or an empty string when <paramref name="input"/> is
     /// null, empty or whitespace only.
     /// </returns>
     public static string Sanitize(string input)
     {
         if (input == null || input.Trim().Length < 1)
         {
             return string.Empty;
         }

         var htmlDocument = new HtmlDocument();
         htmlDocument.LoadHtml(input);

         // Pass 1: strip attributes and rename disallowed elements to the placeholder.
         SanitizeNode(htmlDocument.DocumentNode);

         // Pass 2: re-parse the serialized markup and unwrap every placeholder.
         return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), PlaceholderXPath);
     }

     // Sanitizes the children of parentNode, last to first.
     private static void SanitizeChildren(HtmlNode parentNode)
     {
         for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
         {
             SanitizeNode(parentNode.ChildNodes[i]);
         }
     }

     // Renames a disallowed element to the placeholder, or strips the disallowed
     // attributes of an allowed one, then continues with the node's children.
     private static void SanitizeNode(HtmlNode node)
     {
         if (node.NodeType == HtmlNodeType.Element)
         {
             string[] allowedAttributes;
             if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
             {
                 // Attributes are left alone here: the whole tag is dropped in pass 2.
                 node.Name = PlaceholderName;
             }
             else if (node.HasAttributes)
             {
                 for (int i = node.Attributes.Count - 1; i >= 0; i--)
                 {
                     HtmlAttribute currentAttribute = node.Attributes[i];
                     if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                     {
                         node.Attributes.Remove(currentAttribute);
                     }
                 }
             }
         }

         if (node.HasChildNodes)
         {
             SanitizeChildren(node);
         }
     }

     // Parses html and removes every element matched by xPath while keeping the
     // matched element's children in its place. Returns the remaining markup.
     private static string StripHtml(string html, string xPath)
     {
         HtmlDocument htmlDoc = new HtmlDocument();
         htmlDoc.LoadHtml(html);

         if (xPath.Length > 0)
         {
             HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);

             // Guard against a null result (no placeholder elements in this document).
             if (invalidNodes != null)
             {
                 foreach (HtmlNode node in invalidNodes)
                 {
                     node.ParentNode.RemoveChild(node, true);
                 }
             }
         }

         return htmlDoc.DocumentNode.WriteContentTo();
     }
 }

我对节点进行了重命名，因为如果要处理带XML命名空间的节点，XPath解析时会崩溃。

感谢你的代码——非常棒！

我做了一些优化：

 /// <summary>
 /// Strips non-whitelisted tags (keeping their content) and non-whitelisted
 /// attributes from an HtmlAgilityPack node tree, in place.
 /// The whitelist is read from Config.TagsWhiteList, which is defined outside this
 /// snippet; presumably it maps a tag name to its allowed attribute names, with
 /// null meaning "no attributes allowed" (TODO confirm against Config).
 /// </summary>
 class TagSanitizer
 {
     // Elements found during the walk whose tag must be removed afterwards.
     List<HtmlNode> _deleteNodes = new List<HtmlNode>();

     /// <summary>Sanitizes <paramref name="node"/> and all of its descendants.</summary>
     /// <param name="node">Root of the tree to sanitize.</param>
     public static void Sanitize(HtmlNode node)
     {
         TagSanitizer sanitizer = new TagSanitizer();
         sanitizer.Clean(node);
     }

     // Collects the offending elements first, then removes them starting with the
     // most recently collected one, keeping each removed element's children.
     void Clean(HtmlNode node)
     {
         CleanRecursive(node);

         int index = _deleteNodes.Count;
         while (index-- > 0)
         {
             HtmlNode doomed = _deleteNodes[index];
             doomed.ParentNode.RemoveChild(doomed, true);
         }
     }

     // Walks the tree: records non-whitelisted elements for later removal and
     // strips disallowed attributes from whitelisted ones.
     void CleanRecursive(HtmlNode node)
     {
         if (node.NodeType == HtmlNodeType.Element)
         {
             if (!Config.TagsWhiteList.ContainsKey(node.Name))
             {
                 _deleteNodes.Add(node);
             }
             else if (node.HasAttributes)
             {
                 StripDisallowedAttributes(node);
             }
         }

         if (node.HasChildNodes)
         {
             // Iterate over a copy of the child list.
             foreach (HtmlNode child in node.ChildNodes.ToList())
             {
                 CleanRecursive(child);
             }
         }
     }

     // Removes, last to first, every attribute of node that its whitelist entry
     // does not name; a null entry removes all attributes.
     void StripDisallowedAttributes(HtmlNode node)
     {
         for (int i = node.Attributes.Count - 1; i >= 0; i--)
         {
             HtmlAttribute attribute = node.Attributes[i];
             string[] permitted = Config.TagsWhiteList[node.Name];

             bool keep = permitted != null && permitted.Contains(attribute.Name);
             if (!keep)
             {
                 node.Attributes.Remove(attribute);
             }
         }
     }
 }

HTML Agility Pack：去除不在白名单中的标签

在ggplot2中添加x和y轴标签

我如何添加提示或工具提示到C＃Winforms中的标签？

ASP.NET控件与HTML标签等效

如何在SVN中删除错误标记的目录？

Sublime Text 2：用标签包裹所选内容

在代码中设置WPF标签的样式属性？

如何在Python中检查文本是否为空（空格，制表符，换行符）？

Chrome扩展程序：如何在新标签页中打开链接？

Git提交没有分支的标签

什么是<XMP>标签用于？