Bloom.XmlHtmlConverter.GetXmlDomFromHtml C# (CSharp) Method

GetXmlDomFromHtml() public static method

Throws an exception if HTML Tidy reports errors while cleaning the content, or if the cleaned-up result cannot be loaded as XML.
public static GetXmlDomFromHtml ( string content, bool includeXmlDeclaration = false ) : XmlDocument
content string
includeXmlDeclaration bool
return System.Xml.XmlDocument
        /// <summary>
        /// Runs the given HTML through HTML Tidy (via a temporary file), applies a series of textual
        /// clean-ups to tidy's output, and loads the result into an <see cref="XmlDocument"/>.
        /// </summary>
        /// <param name="content">The HTML text to convert.</param>
        /// <param name="includeXmlDeclaration">Passed straight to tidy's AddXmlDeclaration setting.</param>
        /// <returns>The loaded document (whitespace preserved), after RemoveAllContentTypesMetas has been applied to it.</returns>
        /// <exception cref="ApplicationException">tidy's CleanAndRepair() returned a non-empty error string.</exception>
        /// <exception cref="Exception">
        /// Post-processing or XML loading of tidy's output failed. The message is the original message
        /// followed by the tidied contents; the original exception is available as InnerException.
        /// </exception>
        public static XmlDocument GetXmlDomFromHtml(string content, bool includeXmlDeclaration = false)
        {
            var dom = new XmlDocument();
            content = AddFillerToKeepTidyFromRemovingEmptyElements(content);

            // In BL-2250, we found that in previous versions, this method would turn, for example,
            // "<u> </u>" into "REMOVEWHITESPACE". That is fixed now, but this is still needed to clean up existing books.
            content = content.Replace(@"REMOVEWHITESPACE", "");

            // Tidy also likes to insert newlines before <b>, <u>, and <i>, and convert any existing whitespace
            // there to a space. Mark those places so the inserted whitespace can be stripped again below.
            content = Regex.Replace(content, @"<([ubi]|em|strong)>", "REMOVEWHITESPACE<$1>");

            // fix for <br></br> tag doubling
            content = content.Replace("<br></br>", "<br />");

            // Not a using block: disposal failures must be swallowed (see the finally clause), but the
            // temp file still has to be cleaned up when anything below throws.
            var temp = new TempFile();
            try
            {
                RobustFile.WriteAllText(temp.Path, content, Encoding.UTF8);
                using (var tidy = RobustIO.DocumentFromFile(temp.Path))
                {
                    tidy.ShowWarnings = false;
                    tidy.Quiet = true;
                    tidy.WrapAt = 0; // prevents textarea wrapping.
                    tidy.AddTidyMetaElement = false;
                    tidy.OutputXml = true;
                    tidy.CharacterEncoding = EncodingType.Utf8;
                    tidy.InputCharacterEncoding = EncodingType.Utf8;
                    tidy.OutputCharacterEncoding = EncodingType.Utf8;
                    tidy.DocType = DocTypeMode.Omit; //when it supports html5, then we will let it out it
                    //maybe try this? tidy.Markup = true;

                    tidy.AddXmlDeclaration = includeXmlDeclaration;

                    //NB: this does not prevent tidy from deleting <span data-libray='somethingImportant'></span>
                    tidy.MergeSpans = AutoBool.No;
                    tidy.DropEmptyParagraphs = false;
                    tidy.MergeDivs = AutoBool.No;

                    var errors = tidy.CleanAndRepair();
                    if (!string.IsNullOrEmpty(errors))
                    {
                        throw new ApplicationException(errors);
                    }
                    var newContents = tidy.Save();
                    try
                    {
                        newContents = RemoveFillerInEmptyElements(newContents);

                        newContents = newContents.Replace("&nbsp;", "&#160;");
                        //REVIEW: 1) are there others? &amp; and such are fine.  2) should we convert back to &nbsp; on save?

                        // The regex here is mainly for the \s as a convenient way to remove whatever whitespace TIDY
                        // has inserted. It's a fringe benefit that we can use the [biu] to deal with several elements in one replace.
                        newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE\s*<([biu]|em|strong)>", "<$1>");

                        //In BL2250, we still had REMOVEWHITESPACE sticking around sometimes. The way we reproduced it was
                        //with <u> </u>. That is, we started with
                        //"REMOVEWHITESPACE <u> </u>", then libtidy (properly) removed the <u></u>, leaving us with only
                        //"REMOVEWHITESPACE".
                        newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE", "");

                        // remove blank lines at the end of style blocks
                        newContents = Regex.Replace(newContents, @"\s+<\/style>", "</style>");

                        // remove <br> elements immediately preceding </p> close tag (BL-2557)
                        // These are apparently inserted by ckeditor as far as we can tell.  They don't show up on
                        // fields that have never had a ckeditor activated, and always show up on fields that have
                        // received focus and activated an inline ckeditor.  The ideal ckeditor use case appears
                        // to be for data entry as part of a web page that get stored separately, with the data
                        // obtained something like the following in javascript:
                        //        ckedit.on('blur', function(evt) {
                        //            var editor = evt['editor'];
                        //            var data = editor.getData();
                        //            <at this point, the data looks okay, with any <br> element before the </p> tag.>
                        //            <store the data somewhere: the following lines have no effect, and may be silly.>
                        //            var div = mapCkeditDiv[editor.id];
                        //            div.innerHTML = data;
                        //        });
                        // Examining the initial value of div.innerHTML shows the unwanted <br> element, but it is
                        // not in the data returned by editor.getData().  Since assigning to div.innerHTML doesn't
                        // affect what gets written to the file, this hack was implemented instead.
                        newContents = Regex.Replace(newContents, @"(<br></br>|<br ?/>)[\r\n]*</p>", "</p>");

                        // Don't let spaces between <strong>, <em>, or <u> elements be removed. (BL-2484)
                        dom.PreserveWhitespace = true;
                        dom.LoadXml(newContents);
                    }
                    catch (Exception e)
                    {
                        // Same type and message as before, but keep the original exception (and its
                        // stack trace) reachable through InnerException.
                        throw new Exception(string.Format("{0}{2}{2}{1}",
                            e.Message, newContents, Environment.NewLine), e);
                    }
                }
            }
            finally
            {
                try
                {
                    //It's a mystery but http://jira.palaso.org/issues/browse/BL-46 was reported by several people on Win XP, even though a look at html tidy dispose indicates that it does dispose (and thus close) the stream.
                    // Therefore, the dispose is an explicit call so that we can catch the error and ignore it, leaving an extra file in Temp.
                    temp.Dispose();
                    //enhance... could make a version of this which collects up any failed deletes and re-attempts them with each call to this
                }
                catch (Exception)
                {
                    //swallow
                    Debug.Fail("Repro of http://jira.palaso.org/issues/browse/BL-46 ");
                }
            }

            //this is a hack... each time we write the content, we add a new <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
            //so for now, we remove it when we read it in. It'll get added again when we write it out
            RemoveAllContentTypesMetas(dom);

            return dom;
        }

Usage Example

Example 1
0
        /// <summary>
        /// What's going on here: the browser is just editing/displaying a copy of one page of the document.
        /// So we need to copy any changes back to the real DOM (_pageDom): the body content, plus the rules
        /// of the browser's "customBookStyles" stylesheet if there is one.
        /// Problems are reported to the user through ErrorReport.NotifyUserOfProblem rather than thrown.
        /// </summary>
        private void LoadPageDomFromBrowser()
        {
            if (_pageDom == null)
            {
                return;
            }

#if DEBUG
            if (_pageDom.SelectNodes("//textarea").Count > 0)
            {
                Debug.Fail("Oh, a chance to test bluring textarea's!");
            }
#endif
            //as of august 2012 textareas only occur in the Calendar
            //		if (_pageDom.SelectNodes("//textarea").Count >0)
            {
                //This approach was to force an onblur so that we can get at the actual user-edited value.
                //This caused problems, with Bloom itself (the Shell) not knowing that it is active.
                //_browser.WebBrowserFocus.Deactivate();
                //_browser.WebBrowserFocus.Activate();

                //now, we just do the blur directly.
                var activeElement = _browser.Window.Document.ActiveElement;
                if (activeElement != null)
                {
                    activeElement.Blur();
                }
            }

            var body = _browser.Document.GetElementsByTagName("body");
            if (body.Count == 0)                //review: this does happen... onValidating comes along, but there is no body. Assuming it is a timing issue.
            {
                return;
            }

            var content = body[0].InnerHtml;
            XmlDocument dom;

            // Any exception from converting or copying is reported to the user below, and the copy is abandoned.
            try
            {
                dom = XmlHtmlConverter.GetXmlDomFromHtml(content, false);
                var bodyDom = dom.SelectSingleNode("//body");

                if (_pageDom == null)
                {
                    return;
                }

                var destinationDomPage = _pageDom.SelectSingleNode("//body/div[contains(@class,'bloom-page')]");
                if (destinationDomPage == null)
                {
                    return;
                }
                // Read the id *attribute*. The XmlNode string indexer (node["id"]) looks up a child
                // element named "id" instead, which made this check compare null with null.
                var expectedIdAttribute = destinationDomPage.Attributes["id"];
                var expectedPageId = expectedIdAttribute == null ? null : expectedIdAttribute.Value;

                var browserPage = bodyDom.SelectSingleNode("//body/div[contains(@class,'bloom-page')]");
                if (browserPage == null)
                {
                    return;                    //why? but I've seen it happen
                }
                var browserIdAttribute = browserPage.Attributes["id"];
                var thisPageId = browserIdAttribute == null ? null : browserIdAttribute.Value;
                if (expectedPageId != thisPageId)
                {
                    Palaso.Reporting.ErrorReport.NotifyUserOfProblem("Bloom encountered an error saving that page (unexpected page id)");
                    return;
                }
                _pageDom.GetElementsByTagName("body")[0].InnerXml = bodyDom.InnerXml;

                var customStyleSheet = _browser.Document.StyleSheets.Where(s =>
                {
                    var idNode = s.OwnerNode.Attributes["id"];
                    if (idNode == null)
                    {
                        return(false);
                    }
                    return(idNode.NodeValue == "customBookStyles");
                }).FirstOrDefault();

                if (customStyleSheet != null)
                {
                    /* why are we bothering to walk through the rules instead of just copying the html of the style tag? Because that doesn't
                     * actually get updated when the javascript edits the stylesheets of the page. Well, the <style> tag gets created, but
                     * rules don't show up inside of it. So
                     * this won't work: _pageDom.GetElementsByTagName("head")[0].InnerText = customStyleSheet.OwnerNode.OuterHtml;
                     */
                    var styles = new StringBuilder();
                    styles.AppendLine("<style id='customStyles' type='text/css'>");
                    foreach (var cssRule in customStyleSheet.CssRules)
                    {
                        styles.AppendLine(cssRule.CssText);
                    }
                    styles.AppendLine("</style>");
                    Debug.WriteLine("*CustomStylesheet in browser:" + styles);
                    // NOTE(review): this replaces the entire content of <head> with the style element — confirm that is intended.
                    _pageDom.GetElementsByTagName("head")[0].InnerXml = styles.ToString();
                }

                //enhance: we have jscript for this: cleanup()... but running jscript in this method was leading the browser to show blank screen
//				foreach (XmlElement j in _pageDom.SafeSelectNodes("//div[contains(@class, 'ui-tooltip')]"))
//				{
//					j.ParentNode.RemoveChild(j);
//				}
//				foreach (XmlAttribute j in _pageDom.SafeSelectNodes("//@ariasecondary-describedby | //@aria-describedby"))
//				{
//					j.OwnerElement.RemoveAttributeNode(j);
//				}
            }
            catch (Exception e)
            {
                Palaso.Reporting.ErrorReport.NotifyUserOfProblem(e, "Sorry, Bloom choked on something on this page (invalid incoming html).\r\n\r\n+{0}", e);
                return;
            }



            // Finally, validate the updated page DOM and tell the user if it has errors.
            try
            {
                XmlHtmlConverter.ThrowIfHtmlHasErrors(_pageDom.OuterXml);
            }
            catch (Exception e)
            {
                Palaso.Reporting.ErrorReport.NotifyUserOfProblem(e, "Sorry, Bloom choked on something on this page (validating page).\r\n\r\n+{0}", e.Message);
            }
        }
All Usage Examples Of Bloom.XmlHtmlConverter::GetXmlDomFromHtml