/// <summary>
/// Converts HTML text into an <see cref="XmlDocument"/>: the text is written to a temporary
/// file, run through tidy with XML output enabled, post-processed to undo the whitespace and
/// filler workarounds applied before tidying, and finally loaded with whitespace preserved.
/// </summary>
/// <param name="content">The HTML text to convert.</param>
/// <param name="includeXmlDeclaration">Forwarded to tidy's <c>AddXmlDeclaration</c> setting.</param>
/// <returns>
/// The loaded document, with <c>PreserveWhitespace</c> set to true and after
/// <c>RemoveAllContentTypesMetas</c> has been applied to it.
/// </returns>
/// <exception cref="ApplicationException">
/// Thrown when tidy's <c>CleanAndRepair</c> returns a non-empty error string.
/// </exception>
/// <exception cref="Exception">
/// Thrown when post-processing or loading the tidied markup fails. The message is the original
/// message followed by the markup that was being processed; the original exception is attached
/// as <see cref="Exception.InnerException"/>.
/// </exception>
public static XmlDocument GetXmlDomFromHtml(string content, bool includeXmlDeclaration = false)
{
    var dom = new XmlDocument();
    content = AddFillerToKeepTidyFromRemovingEmptyElements(content);

    // In BL-2250, we found that in previous versions, this method would turn, for example,
    // "<u> </u>" into REMOVEWHITESPACE. That is fixed now, but this is needed to clean up
    // existing books.
    content = content.Replace(@"REMOVEWHITESPACE", "");

    // Tidy also likes to insert newlines before <b>, <u>, and <i>, and convert any existing
    // whitespace there to a space. Mark those spots so the inserted whitespace can be stripped
    // again after tidying.
    content = Regex.Replace(content, @"<([ubi]|em|strong)>", "REMOVEWHITESPACE<$1>");

    // fix for <br></br> tag doubling
    content = content.Replace("<br></br>", "<br />");

    // The temp file is deliberately not wrapped in a using statement: see the finally block.
    var temp = new TempFile();
    try
    {
        RobustFile.WriteAllText(temp.Path, content, Encoding.UTF8);
        using (var tidy = RobustIO.DocumentFromFile(temp.Path))
        {
            tidy.ShowWarnings = false;
            tidy.Quiet = true;
            tidy.WrapAt = 0; // prevents textarea wrapping.
            tidy.AddTidyMetaElement = false;
            tidy.OutputXml = true;
            tidy.CharacterEncoding = EncodingType.Utf8;
            tidy.InputCharacterEncoding = EncodingType.Utf8;
            tidy.OutputCharacterEncoding = EncodingType.Utf8;
            tidy.DocType = DocTypeMode.Omit; // when it supports html5, then we will let it output it
            //maybe try this? tidy.Markup = true;
            tidy.AddXmlDeclaration = includeXmlDeclaration;
            // NB: this does not prevent tidy from deleting <span data-libray='somethingImportant'></span>
            tidy.MergeSpans = AutoBool.No;
            tidy.DropEmptyParagraphs = false;
            tidy.MergeDivs = AutoBool.No;

            var errors = tidy.CleanAndRepair();
            if (!string.IsNullOrEmpty(errors))
            {
                throw new ApplicationException(errors);
            }

            var newContents = tidy.Save();
            try
            {
                newContents = RemoveFillerInEmptyElements(newContents);

                // XML predefines only &amp;, &lt;, &gt;, &quot; and &apos;, so a leftover &nbsp;
                // would make LoadXml fail; rewrite it as a numeric character reference.
                // NOTE(review): confirm against version history that "&nbsp;" -> "&#160;" is the
                // intended pair for this replacement.
                // REVIEW: 1) are there others? &amp; and such are fine.
                //         2) should we convert back to &nbsp; on save?
                newContents = newContents.Replace("&nbsp;", "&#160;");

                // The regex here is mainly for the \s as a convenient way to remove whatever
                // whitespace tidy has inserted. It's a fringe benefit that one pattern deals with
                // all of the affected elements in a single replace.
                newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE\s*<([biu]|em|strong)>", "<$1>");

                // In BL-2250, we still had REMOVEWHITESPACE sticking around sometimes. The way we
                // reproduced it was with <u> </u>. That is, we started with
                // "REMOVEWHITESPACE <u> </u>", then libtidy (properly) removed the <u></u>,
                // leaving us with only "REMOVEWHITESPACE".
                newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE", "");

                // remove blank lines at the end of style blocks
                newContents = Regex.Replace(newContents, @"\s+<\/style>", "</style>");

                // Remove <br> elements immediately preceding a </p> close tag (BL-2557).
                // These are apparently inserted by ckeditor as far as we can tell. They don't show
                // up on fields that have never had a ckeditor activated, and always show up on
                // fields that have received focus and activated an inline ckeditor. The ideal
                // ckeditor use case appears to be for data entry as part of a web page that gets
                // stored separately, with the data obtained something like the following in
                // javascript:
                //     ckedit.on('blur', function(evt) {
                //         var editor = evt['editor'];
                //         var data = editor.getData();
                //         <at this point, the data looks okay, without any <br> element before the </p> tag.>
                //         <store the data somewhere: the following lines have no effect, and may be silly.>
                //         var div = mapCkeditDiv[editor.id];
                //         div.innerHTML = data;
                //     });
                // Examining the initial value of div.innerHTML shows the unwanted <br> element,
                // but it is not in the data returned by editor.getData(). Since assigning to
                // div.innerHTML doesn't affect what gets written to the file, this hack was
                // implemented instead.
                newContents = Regex.Replace(newContents, @"(<br></br>|<br ?/>)[\r\n]*</p>", "</p>");

                // Don't let spaces between <strong>, <em>, or <u> elements be removed. (BL-2484)
                dom.PreserveWhitespace = true;
                dom.LoadXml(newContents);
            }
            catch (Exception e)
            {
                // Append the markup to the message so the failure can be diagnosed, and keep the
                // original exception (type and stack trace) as the inner exception.
                throw new Exception(
                    string.Format("{0}{2}{2}{1}", e.Message, newContents, Environment.NewLine),
                    e);
            }
        }
    }
    finally
    {
        try
        {
            // It's a mystery, but http://jira.palaso.org/issues/browse/BL-46 was reported by
            // several people on Win XP, even though a look at html tidy's dispose indicates that
            // it does dispose (and thus close) the stream. Therefore the dispose is an explicit
            // call so that the error can be caught and ignored, leaving an extra file in Temp.
            // Doing it in a finally block also cleans up the temp file when tidying or loading
            // fails, without masking that failure.
            temp.Dispose();
            // enhance... could make a version of this which collects up any failed deletes and
            // re-attempts them with each call to this
        }
        catch (Exception)
        {
            // swallow
            Debug.Fail("Repro of http://jira.palaso.org/issues/browse/BL-46 ");
        }
    }

    // This is a hack... each time we write the content, we add a new
    // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    // so for now, we remove it when we read it in. It'll get added again when we write it out.
    RemoveAllContentTypesMetas(dom);
    return dom;
}