/// <summary>
/// Crawls the blog's paged post list, downloads every article linked from each page,
/// converts the article body to Markdown with a front-matter header, and writes one
/// file per article into the "output" folder under the application startup path.
/// </summary>
/// <param name="pageStart">First list page to crawl (inclusive).</param>
/// <param name="pageEnd">Last list page to crawl (inclusive).</param>
/// <param name="isSaveImage">When true, the article body is passed through <c>ProcessArticleImage</c> before conversion.</param>
/// <param name="imagePrefixUrl">Image URL prefix forwarded to <c>ProcessArticleImage</c>.</param>
/// <param name="isAddMoreSeparateLine">When true, a <c>&lt;!--more--&gt;</c> separator line is inserted into the generated Markdown.</param>
/// <param name="separateLineLocation">
/// Character index in the generated Markdown at which the separator is inserted.
/// The separator is skipped when the index is negative or not inside the text.
/// </param>
/// <returns>Always <c>true</c>; failures surface as exceptions from the helpers or from file I/O.</returns>
public static bool ExportToMarkdown(int pageStart, int pageEnd, bool isSaveImage, string imagePrefixUrl = "", bool isAddMoreSeparateLine = false, int separateLineLocation = 300)
{
    const RegexOptions options = RegexOptions.Singleline | RegexOptions.Multiline;

    // The patterns do not depend on the page or the article, so build them once
    // instead of on every loop iteration.
    // Extracts every article link from a list page.
    var regexPostLink = new Regex(@"class=""postTitle"">\s+<a.*?href=""(?<href>.*?)"">", options);
    // Extracts title, body and publication date from an article page.
    var regexArticle =
        new Regex(
            @"<div\s+id=""topics"">.*?id=""cb_post_title_url"".*?>(?<title>.*?)</a>.*?<div\s+id=""cnblogs_post_body"">(?<articlecontent>.*?)</div><div\s+(?:id=""MySignature""></div>)?\s+<div\s+class=""clear""></div>.*?id=""post-date"">(?<date>.*?)</span>",
            options);
    var regexAppName = new Regex("currentBlogApp\\s+=\\s+'(?<appName>.*?)'", options);
    var regexId = new Regex(@"cb_blogId=(?<blogid>\d+),cb_entryId=(?<entryid>\d+)", options);

    // Make sure the target folder exists; StreamWriter does not create missing directories.
    var outputDirectory = Application.StartupPath + "\\output\\";
    Directory.CreateDirectory(outputDirectory);

    for (var page = pageStart; page <= pageEnd; page++)
    {
        var pagesUrl = string.Format("http://www.cnblogs.com/parry/default.html?page={0}", page);
        // Collect all article links on this list page, then fetch and store each article.
        var matches = regexPostLink.Matches(NetworkHelper.GetHtmlFromGet(pagesUrl, Encoding.UTF8));
        foreach (Match match in matches)
        {
            var articleUrl = match.Groups["href"].ToString();
            var content = NetworkHelper.GetHtmlFromGet(articleUrl, Encoding.UTF8);

            var matchArticle = regexArticle.Match(content);
            if (!matchArticle.Success)
            {
                // Page layout not recognised; skip this article.
                continue;
            }

            var appName = string.Empty;
            var matchAppName = regexAppName.Match(content);
            if (matchAppName.Success)
            {
                appName = matchAppName.Groups["appName"].ToString();
            }

            var title = matchArticle.Groups["title"].ToString().Trim();
            var date = matchArticle.Groups["date"].ToString().Trim();
            var articleContent = matchArticle.Groups["articlecontent"].ToString();
            if (isSaveImage)
            {
                // Save the images referenced by the article. This step is optional: with your own
                // image host, saving them and swapping in the host prefix is enough.
                articleContent = ProcessArticleImage(articleContent, imagePrefixUrl);
            }
            articleContent = ProcessArticleCode(articleContent);
            articleContent =
                articleContent.Replace("<div id=\"parrycontent\">", string.Empty)
                    .Replace("</div>", string.Empty);

            // Blog and entry ids stay 0 when the page does not expose them.
            int blogId = 0, postId = 0;
            var matchId = regexId.Match(content);
            if (matchId.Success)
            {
                int.TryParse(matchId.Groups["blogid"].ToString(), out blogId);
                int.TryParse(matchId.Groups["entryid"].ToString(), out postId);
            }
            var categoryTags = GetArticleCategory(appName, blogId, postId);
            var fileName = GetFileName(articleUrl);
            var filePath = outputDirectory + fileName;
            var mdContent = string.Format("---\r\ntitle: {0}\r\ndate: {1}\r\n{2}\r\n---\r\n{3}", title, date,
                categoryTags, articleContent);
            var converter = new Converter();
            var markdown = converter.Convert(mdContent);

            // Insert the <!--more--> separator at the requested character offset; blogs use it to
            // extract the excerpt and render a "read more" link. Insert keeps every original
            // character, and the range check avoids an exception on articles shorter than the offset.
            if (isAddMoreSeparateLine && separateLineLocation >= 0 && separateLineLocation < markdown.Length)
            {
                markdown = markdown.Insert(separateLineLocation, "\r\n<!--more-->\r\n");
            }

            // Save the file; the using block releases the handle even if the write throws.
            using (var streamWriter = new StreamWriter(filePath))
            {
                streamWriter.Write(markdown);
            }
        }
    }
    return true;
}