Bloom.RobustIO.DocumentFromFile C# (CSharp) Method

DocumentFromFile() public static method

public static DocumentFromFile ( string filePath ) : TidyManaged.Document
filePath string
return TidyManaged.Document
        /// <summary>
        /// Opens the file at <paramref name="filePath"/> as a Tidy <see cref="Document"/>,
        /// routing the call through <c>RetryUtility.Retry</c>.
        /// </summary>
        /// <param name="filePath">Path of the file handed to <c>Document.FromFile</c>.</param>
        /// <returns>The document produced by <c>Document.FromFile</c>; the caller is responsible for disposing it.</returns>
        public static Document DocumentFromFile(string filePath)
        {
            // NOTE(review): presumably Retry re-invokes the delegate on transient I/O failures -- confirm against RetryUtility.
            var loadedDocument = RetryUtility.Retry(() => Document.FromFile(filePath));
            return loadedDocument;
        }

Usage Example

        /// <summary>
        /// Runs the given HTML through HTML Tidy (configured for XML output) and loads the result into an
        /// <see cref="XmlDocument"/>. Several workarounds are applied before and after tidying so that Tidy
        /// does not drop empty elements, disturb whitespace around inline elements, or mangle a
        /// CDATA-protected section; SVG content is taken out before tidying and put back afterwards.
        /// </summary>
        /// <param name="content">The HTML markup to convert.</param>
        /// <param name="includeXmlDeclaration">Forwarded to Tidy's <c>AddXmlDeclaration</c> setting.</param>
        /// <exception cref="ApplicationException">Tidy's <c>CleanAndRepair</c> reported errors; the message is Tidy's error text.</exception>
        /// <exception cref="Exception">
        /// Post-processing or XML loading of the tidied markup failed. The message is the original message
        /// followed by the markup being processed; the original exception is available as <c>InnerException</c>.
        /// </exception>
        /// <returns>The loaded DOM (whitespace preserved), after <c>RemoveAllContentTypesMetas</c> has been applied.</returns>
        public static XmlDocument GetXmlDomFromHtml(string content, bool includeXmlDeclaration = false)
        {
            var dom = new XmlDocument();

            content = AddFillerToKeepTidyFromRemovingEmptyElements(content);

            // In BL-2250, we found that in previous versions, this method would turn, for example, "<u> </u>"
            // into REMOVEWHITESPACE. That is fixed now, but this is needed to clean up existing books.
            content = content.Replace(@"REMOVEWHITESPACE", "");

            // Tidy likes to insert newlines before <b>, <u>, <i>, and these other elements and convert any existing
            // whitespace there to a space.  (span was found by pursuing BL-7558.)  Mark each such tag so the inserted
            // whitespace can be stripped again after tidying.
            content = Regex.Replace(content, @"<([ubi]|em|strong|sup|sub|span[^>]*)>", "REMOVEWHITESPACE<$1>");

            // fix for <br></br> tag doubling
            content = content.Replace("<br></br>", "<br />");

            // Fix for > and similar in <style> element protected by CDATA.
            // At present we only need to account for this occurring once.
            // See Browser.SaveCustomizedCssRules.
            const string restoreCdataHere = "/****RestoreCDATAHere*****/";
            var savedCdata = "";
            var startOfCdata = content.IndexOf(Browser.CdataPrefix, StringComparison.InvariantCulture);
            if (startOfCdata >= 0)
            {
                // Look for the suffix only from the prefix onwards, so that suffix-like text occurring earlier
                // in the document cannot prevent the CDATA section from being protected.
                var endOfCdata = content.IndexOf(Browser.CdataSuffix, startOfCdata, StringComparison.InvariantCulture);
                if (endOfCdata >= 0)
                {
                    endOfCdata += Browser.CdataSuffix.Length;
                    savedCdata = content.Substring(startOfCdata, endOfCdata - startOfCdata);
                    content = content.Substring(0, startOfCdata) + restoreCdataHere + content.Substring(endOfCdata);
                }
            }

            var removedSvgs = new List<string>();
            content = RemoveSvgs(content, removedSvgs);

            // Not a using-block: disposing the temp file must be allowed to fail quietly (see the finally below).
            var temp = new TempFile();
            try
            {
                RobustFile.WriteAllText(temp.Path, content, Encoding.UTF8);
                using (var tidy = RobustIO.DocumentFromFile(temp.Path))
                {
                    tidy.ShowWarnings            = false;
                    tidy.Quiet                   = true;
                    tidy.WrapAt                  = 0;    // prevents textarea wrapping.
                    tidy.AddTidyMetaElement      = false;
                    tidy.OutputXml               = true;
                    tidy.CharacterEncoding       = EncodingType.Utf8;
                    tidy.InputCharacterEncoding  = EncodingType.Utf8;
                    tidy.OutputCharacterEncoding = EncodingType.Utf8;
                    tidy.DocType                 = DocTypeMode.Omit;     // when it supports html5, then we will let it output it
                    //maybe try this? tidy.Markup = true;

                    tidy.AddXmlDeclaration = includeXmlDeclaration;

                    //NB: this does not prevent tidy from deleting <span data-library='somethingImportant'></span>
                    tidy.MergeSpans          = AutoBool.No;
                    tidy.DropEmptyParagraphs = false;
                    tidy.MergeDivs           = AutoBool.No;

                    var errors = tidy.CleanAndRepair();
                    if (!string.IsNullOrEmpty(errors))
                    {
                        throw new ApplicationException(errors);
                    }

                    var newContents = tidy.Save();
                    try
                    {
                        newContents = RestoreSvgs(newContents, removedSvgs);
                        newContents = RemoveFillerInEmptyElements(newContents);

                        newContents = newContents.Replace("&nbsp;", "&#160;");
                        //REVIEW: 1) are there others? &amp; and such are fine.  2) should we convert back to &nbsp; on save?

                        // The regex here is mainly for the \s* as a convenient way to remove whatever whitespace TIDY
                        // has inserted. It's a fringe benefit that we can use the [biu]|... to deal with all these
                        // elements in one replace.
                        newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE\s*<([biu]|em|strong|sup|sub|span[^>]*)>", "<$1>");

                        // In BL-2250, we still had REMOVEWHITESPACE sticking around sometimes. The way we reproduced it
                        // was with <u> </u>. That is, we started with "REMOVEWHITESPACE <u> </u>", then libtidy
                        // (properly) removed the <u></u>, leaving us with only "REMOVEWHITESPACE".
                        newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE", "");

                        // remove blank lines at the end of style blocks
                        newContents = Regex.Replace(newContents, @"\s+<\/style>", "</style>");

                        // Remove <br> elements immediately preceding a </p> close tag (BL-2557).
                        // These are apparently inserted by ckeditor as far as we can tell.  They don't show up on
                        // fields that have never had a ckeditor activated, and always show up on fields that have
                        // received focus and activated an inline ckeditor.  The ideal ckeditor use case appears
                        // to be for data entry as part of a web page that gets stored separately, with the data
                        // obtained something like the following in javascript:
                        //        ckedit.on('blur', function(evt) {
                        //            var editor = evt['editor'];
                        //            var data = editor.getData();
                        //            <at this point, the data looks okay, with any <br> element before the </p> tag.>
                        //            <store the data somewhere: the following lines have no effect, and may be silly.>
                        //            var div = mapCkeditDiv[editor.id];
                        //            div.innerHTML = data;
                        //        });
                        // Examining the initial value of div.innerHTML shows the unwanted <br> element, but it is
                        // not in the data returned by editor.getData().  Since assigning to div.innerHTML doesn't
                        // affect what gets written to the file, this hack was implemented instead.
                        newContents = Regex.Replace(newContents, @"(<br></br>|<br ?/>)[\r\n]*</p>", "</p>");

                        newContents = newContents.Replace(restoreCdataHere, savedCdata);

                        // Don't let spaces between <strong>, <em>, or <u> elements be removed. (BL-2484)
                        dom.PreserveWhitespace = true;
                        dom.LoadXml(newContents);
                    }
                    catch (Exception e)
                    {
                        // Attach the markup to the message to aid diagnosis, and keep the original exception
                        // (type and stack trace) reachable through InnerException.
                        throw new Exception(string.Format("{0}{2}{2}{1}",
                                                          e.Message, newContents, Environment.NewLine), e);
                    }
                }
            }
            finally
            {
                try
                {
                    // It's a mystery, but http://jira.palaso.org/issues/browse/BL-46 was reported by several people
                    // on Win XP, even though a look at html tidy dispose indicates that it does dispose (and thus
                    // close) the stream. Therefore the dispose is an explicit call so that the error can be caught
                    // and ignored, leaving an extra file in Temp. Doing it in a finally block also cleans up the
                    // temp file when tidying or loading fails.
                    temp.Dispose();
                    //enhance... could make a version of this which collects up any failed deletes and re-attempts them with each call to this
                }
                catch (Exception)
                {
                    // Deliberately swallowed: failing to delete a temp file must not fail the conversion.
                    Debug.Fail("Repro of http://jira.palaso.org/issues/browse/BL-46 ");
                }
            }

            // This is a hack... each time we write the content, we add a new
            // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">,
            // so for now, we remove it when we read it in. It'll get added again when we write it out.
            RemoveAllContentTypesMetas(dom);

            return dom;
        }