BetterWaywo.Scraper.GetThreadPosts C# (CSharp) Метод

GetThreadPosts() публичный статический Метод

public static GetThreadPosts ( int pageCount ) : List<Post>
pageCount int
Результат List<Post>
        /// <summary>
        /// Scrapes pages 1 through <paramref name="pageCount"/> of the thread identified by
        /// <c>Program.ThreadId</c> in parallel and collects the rating results of each post found.
        /// </summary>
        /// <param name="pageCount">Number of thread pages to scrape; pages are numbered from 1.</param>
        /// <returns>
        /// One <c>Post</c> per successfully parsed rating element. Pages are processed concurrently,
        /// so the order of the returned list is not deterministic. Pages and posts that cannot be
        /// parsed are reported on the console and skipped.
        /// </returns>
        public static List<Post> GetThreadPosts(int pageCount)
        {
            var posts = new List<Post>();

            Parallel.For(1, pageCount + 1, page =>
            {
                Console.WriteLine("Scraping page {0}", page);

                try
                {
                    var html = GetHtmlDocument(string.Format(ThreadPageString, Program.ThreadId, page));
                    var postElements = html.DocumentNode.SelectNodes("//ol[@id='posts']//span[@class='rating_results']");

                    if (postElements == null)
                    {
                        // SelectNodes yields null (not an empty collection) when nothing matches.
                        Console.WriteLine("Failed to scrape page {0}, ignoring (no rating elements found)", page);
                    }
                    else
                    {
                        // The very first rating element of the first page is deliberately left out.
                        var skipFirst = page <= 1;

                        foreach (var p in postElements)
                        {
                            if (skipFirst)
                            {
                                skipFirst = false;
                                continue;
                            }

                            try
                            {
                                var idAttribute = p.Attributes["id"];
                                var idMatch = Regex.Match(idAttribute == null ? string.Empty : idAttribute.Value, "rating_(\\d+)");
                                int id;

                                if (!idMatch.Success || !int.TryParse(idMatch.Groups[1].Value, out id))
                                {
                                    Console.WriteLine("Failed to parse a post on page {0}, ignoring (unrecognised rating id)", page);
                                    continue;
                                }

                                var ratings = new Dictionary<string, int>();

                                foreach (var r in p.Elements("span"))
                                {
                                    var ratingType = r.Element("img").Attributes["alt"].Value;
                                    var ratingValue = int.Parse(r.Element("strong").InnerText);
                                    ratings.Add(ratingType, ratingValue);
                                }

                                // The list is shared by every worker of the parallel loop.
                                lock (posts)
                                {
                                    posts.Add(new Post(id, ratings));
                                }
                            }
                            catch (Exception postError)
                            {
                                // One malformed post must not discard the remaining posts of the page.
                                Console.WriteLine("Failed to parse a post on page {0}, ignoring ({1})", page, postError.Message);
                            }
                        }
                    }
                }
                catch (Exception pageError)
                {
                    Console.WriteLine("Failed to scrape page {0}, ignoring ({1})", page, pageError.Message);
                }

                if (pageCount >= 50)
                {
                    Thread.Sleep(2500); // be nice
                }
            });

            return posts;
        }

Usage Example

Пример #1
0
        /// <summary>
        /// Program entry point. Parses the command line, obtains the thread's posts (from the cache
        /// file when caching is enabled and the file is readable, otherwise by scraping), selects the
        /// highlights and writes their messages to the output file.
        /// </summary>
        /// <param name="args">Command-line arguments; run with <c>--help</c> for the option list.</param>
        /// <remarks>
        /// Every failure is reported on standard output with the same one-line message as before;
        /// the underlying exception message is additionally written to standard error.
        /// </remarks>
        public static void Main(string[] args)
        {
            bool help      = false;
            bool cache     = false;
            int  postCount = 20;

            // Defaults used when no --config file is supplied.
            Config = new Config {
                Authentication = new AuthenticationConfig {
                    UserAgent = "BetterWAYWO highlights generator",
                    Cookies   = new Dictionary<string, string>()
                },

                Weights = new WeightsConfig {
                    RatingsDefault = 0f,
                    Ratings        = new RatingsConfig[0],

                    ContentDefault = 0f,
                    Content        = new ContentConfig[0]
                }
            };

            #region Option Parsing
            var options = new OptionSet()
            {
                { "thread=",
                  "thread ID to generate highlights for", v =>
                  {
                      if (!int.TryParse(v, out ThreadId))
                      {
                          throw new OptionException("thread must be given an integer", "thread");
                      }
                  } },

                { "out=",
                  "file to output to",
                  v => OutputFile = v },

                { "posts=",
                  "number of posts to output (default 20)", v =>
                  {
                      if (!int.TryParse(v, out postCount))
                      {
                          throw new OptionException("posts must be given an integer value", "posts");
                      }
                  } },

                { "config=",
                  "specify a json file to load configuration options from",
                  LoadConfig },

                { "cache",
                  "enable caching of thread data",
                  v => cache = v != null },

                { "h|help",
                  "show this help message",
                  v => help = v != null }
            };

            try
            {
                options.Parse(args);
            }
            catch (OptionException e)
            {
                Console.WriteLine("error: " + e.Message);
                Console.WriteLine("Try `betterwaywo --help' for more information.");
                return;
            }

            // An empty output path can never be opened, so treat it like a missing one and show
            // the usage text now instead of failing after the whole thread has been scraped.
            if (ThreadId == default(int) || string.IsNullOrEmpty(OutputFile))
            {
                help = true;
            }

            if (help)
            {
                Console.WriteLine("Usage: betterwaywo -thread=<ThreadID> -out=<OutputFile> [options]");
                Console.WriteLine("Generates highlights for Facepunch threads.");
                Console.WriteLine();
                Console.WriteLine("Options:");
                options.WriteOptionDescriptions(Console.Out);
                return;
            }
            #endregion

            postCount = Math.Max(postCount, 1);

            var cacheFile = string.Format("posts_{0}.json", ThreadId);
            var hasCache  = cache && File.Exists(cacheFile);

            if (hasCache)
            {
                try
                {
                    _posts = JsonConvert.DeserializeObject<List<Post>>(File.ReadAllText(cacheFile));

                    // Deserialization can succeed and still yield null (e.g. a file containing
                    // "null"); only claim a cache hit when there really is a list to use.
                    if (_posts != null)
                    {
                        Console.WriteLine("Using cached posts");
                    }
                    else
                    {
                        Console.WriteLine("Failed to read cache, ignoring");
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Failed to read cache, ignoring");
                    Console.Error.WriteLine(ex.Message);
                }
            }

            // No usable cache: scrape the thread.
            if (_posts == null)
            {
                int pageCount;
                try
                {
                    pageCount = Scraper.GetPageCount();
                    Console.WriteLine("Thread has {0} pages", pageCount);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Invalid ThreadId (couldn't get page count)");
                    Console.Error.WriteLine(ex.Message);
                    return;
                }

                try
                {
                    _posts = Scraper.GetThreadPosts(pageCount);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Failed to scrape thread");
                    Console.Error.WriteLine(ex.Message);
                    return;
                }
            }

            // Only twice the requested number of top-rated posts are examined further;
            // guard the doubling against integer overflow for very large --posts values.
            var candidateCount = postCount > int.MaxValue / 2 ? int.MaxValue : postCount * 2;

            List<Post> highlights;

            try
            {
                highlights = _posts
                             .OrderByDescending(p => p.RatingsValue)
                             .Take(candidateCount)             // lets not read every posts' contents
                             .Where(p => p.ContentValue > 0)
                             .Where(p => !p.IsVotePost)
                             .OrderByDescending(p => p.RatingsValue * p.ContentMultiplier)
                             .GroupBy(p => p.Username)         // at most one highlight per user
                             .Select(g => g.First())
                             .Take(postCount)
                             .ToList();
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed to read posts");
                Console.Error.WriteLine(ex.Message);
                return;
            }

            if (cache)
            {
                try
                {
                    var postsJson = JsonConvert.SerializeObject(_posts);
                    File.WriteAllText(cacheFile, postsJson);
                    Console.WriteLine("Wrote posts to cache");
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Failed to write cache, ignoring");
                    Console.Error.WriteLine(ex.Message);
                }
            }

            try
            {
                using (var writer = new StreamWriter(OutputFile, false))
                {
                    foreach (var p in highlights)
                    {
                        if (p.Message.Length == 0)
                        {
                            // NOTE: the output file keeps whatever was written before this post.
                            Console.WriteLine("Failed to read post contents (length is 0)");
                            return;
                        }

                        // Each message is followed by one blank line.
                        writer.Write(p.Message);
                        writer.WriteLine();
                        writer.WriteLine();
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed to write output file");
                Console.Error.WriteLine(ex.Message);
                return;
            }

            Console.WriteLine("Done!");
        }