AmazonScrape.Parser.GetPageResultItemHtml C# (CSharp) Method

GetPageResultItemHtml() public static Method

Returns a list of individual html product results from an html page
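public static GetPageResultItemHtml ( string pageHtml, int resultCount ) : List<string>
pageHtml string The string containing a single page of Amazon search results
resultCount int The number of results visible to the user on the page; only that many items are returned
Returns List<string>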
        /// <summary>
        /// Splits the html of one Amazon search result page into the html
        /// segments of its individual product results.
        /// </summary>
        /// <param name="pageHtml">The string containing a single page of
        /// Amazon search results</param>
        /// <param name="resultCount">Number of results to return; only the
        /// leading segments up to this count are kept</param>
        /// <returns>The first <paramref name="resultCount"/> product html
        /// segments, or an empty list when the count is not positive or
        /// fewer segments than requested were found</returns>
        public static List<string> GetPageResultItemHtml(string pageHtml, int resultCount)
        {
            // NOTE:
            // Amazon injects additional search results (commented out in
            // javascript) at the bottom of each search page to cache results.
            // The parameter resultCount is obtained from the top of the page
            // so that only the results that are visible to the user are
            // returned. This was mainly done to fix a bug where duplicate
            // records were being returned, but it's probably good practice
            // to only consider "visible" results in case Amazon changes its
            // caching strategy.

            // Grab the text between each pair of consecutive result markers
            string resultPattern = @"(?<=result_[0-9]?[0-9]).*?(?=result_[0-9]?[0-9])";
            List<string> matches = GetMultipleRegExMatches(pageHtml, resultPattern);

            // Nothing to return for a non-positive count, and a page that
            // yielded fewer segments than requested produces an empty list
            // rather than a partial one.
            if (resultCount <= 0 || matches.Count < resultCount)
            {
                return new List<string>();
            }

            // Keep only the leading (visible) results; the trailing matches
            // are the cached extras described in the note above.
            return matches.GetRange(0, resultCount);
        }

Usage Example

Example #1
0
        /// <summary>
        /// Loads, chops up, parses and validates one page worth of results.
        /// </summary>
        /// <param name="sender">Source of the event (not used by this method)</param>
        /// <param name="e">Event arguments; its Result is set to this
        /// instance so it can be inspected on completion</param>
        public void Work(object sender, DoWorkEventArgs e)
        {
            _status = Status.Working;

            // Name the thread only once; a thread that already has a name is left alone
            if (Thread.CurrentThread.Name == null)
            {
                Thread.CurrentThread.Name = "Page " + _pageNumber.ToString() + " worker";
            }

            // Set the DoWorkEventArgs result so our status can be checked on completion
            e.Result = this;

            // Will hold the page's html broken up by each individual product
            _productHtmlSegments = new List<string>();

            // Gets the entire page's html
            string pageHtml = _pageLoadMethod(_pageNumber,
                                              _searchCriteria.SearchText);

            // Get the number of results on this page
            _pageResultCount = Parser.GetPageResultCount(pageHtml);

            // If there are no results, set the status accordingly and exit
            if (_pageResultCount == 0)
            {
                _status = Status.NoResults;
                return;
            }

            // Break apart the page html by product
            // so they can be parsed individually
            _productHtmlSegments = Parser.GetPageResultItemHtml(pageHtml,
                                                                _pageResultCount);

            // Parse and validate each result, reporting each one as it is produced
            foreach (string productHtml in _productHtmlSegments)
            {
                Result<AmazonItem> result =
                    ParseAndValidateProductHtml(productHtml);

                // Don't worry about reporting the progress percentage here.
                // The SearchManager will look at the total results returned
                // and compare with the results requested and report that
                // percentage to the UI (passing in a dummy zero here)
                ReportProgress(0, result);
            }

            // The RunWorkerComplete method fires when this method completes.
            // This is used as a signal to the SearchManager that we
            // are clear to spawn another thread if necessary.
            _status = Status.Finished;
        }