Crawler.MovieCrawler.Crawl C# (CSharp) Method

MovieCrawler Class Documentation Usage Examples Of Crawler.MovieCrawler::Crawl Exibir arquivo Open project: viren85/moviemirchi

Crawl() public method

Entry point for the crawler. Pass the URL from source file (XML)

public Crawl ( string url ) : MovieEntity
url	string
return	DataStoreLib.Models.MovieEntity

        public MovieEntity Crawl(string url)
        {
            MovieEntity movie = new MovieEntity();
            TableManager tableMgr = new TableManager();
            thumbnailPath = string.Empty;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                if (response.StatusCode == HttpStatusCode.OK)
                {
                    #region Get Movie Page Content
                    Stream receiveStream = response.GetResponseStream();
                    StreamReader readStream = null;
                    if (response.CharacterSet == null)
                        readStream = new StreamReader(receiveStream);
                    else
                        readStream = new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet));

                    moviePageContent = readStream.ReadToEnd();

                    response.Close();
                    readStream.Close();
                    #endregion

                    movie = PopulateMovieDetails(moviePageContent);
                    bool crawlPosters = true;

                    TableManager tblMgr = new TableManager();

                    MovieEntity me = tblMgr.GetMovieByUniqueName(movie.UniqueName);
                    if (me != null && !string.IsNullOrEmpty(me.RowKey))
                    {
                        movie.RowKey = me.RowKey;
                        movie.MovieId = me.MovieId;
                        movie.Popularity = Util.DEFAULT_POPULARITY;
                        movie.Posters = me.Posters;
                        movie.Songs = me.Songs;
                        movie.Trailers = me.Trailers;
                        movie.State = me.State;
                        crawlPosters = false;
                    }

                    PopulateMovieDetails(ref movie, url, crawlPosters);
                    return movie;
                    ////tableMgr.UpdateMovieById(movie);
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine("An error occurred while populating the movie details. Url = " + url + ". Error=" + ex.Message);
            }

            return movie;
        }

Usage Example

Exemplo n.º 1

Exibir arquivo

Arquivo: CrawlerFilesController.cs Projeto: viren85/moviemirchi

        private void CrawlfromXML(string xmlData, string movieName)
        {
            if (string.IsNullOrEmpty(xmlData)) return;

            Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
            JavaScriptSerializer json = new JavaScriptSerializer();

            try
            {
                XmlDocument xdoc = new XmlDocument();

                #region Movie Crawler
                xdoc.LoadXml(xmlData);
                var movies = xdoc.SelectNodes("Movies/Month/Movie");
                if (movies == null) return;

                foreach (XmlNode movie in movies)
                {
                    // Check movie name, we just need to crawl single movie and not all the movies present in XML file for current month
                    if (movie.Attributes["name"].Value.ToLower() != movieName.ToLower())
                    {
                        continue;
                    }

                    if (movie.Attributes["link"] != null && !string.IsNullOrEmpty(movie.Attributes["link"].Value))
                    {
                        try
                        {
                            List<string> critics = new List<string>();
                            #region Crawl Movie
                            MovieEntity mov = movieCrawler.Crawl(movie.Attributes["link"].Value);
                            TableManager tblMgr = new TableManager();
                            // Save the crawled content because in case of new movies, it fails
                            tblMgr.UpdateMovieById(mov);

                            string posterUrl = string.Empty;

                            if (movie.Attributes["santaposterlink"] != null && !string.IsNullOrEmpty(movie.Attributes["santaposterlink"].Value))
                            {
                                XMLMovieProperties prop = new XMLMovieProperties();
                                prop.SantaPosterLink = movie.Attributes["santaposterlink"].Value;
                                prop.MovieName = mov.UniqueName;

                                CrawlPosters(json.Serialize(prop));
                            }

                            // Crawl Songs from Saavn

                            if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId)) continue;

                            tblMgr.UpdateMovieById(mov);
                            #endregion

                            #region Crawl Movie Reviews
                            #region Crawler
                            try
                            {
                                BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                                HindustanTimesReviews ht = new HindustanTimesReviews();
                                FilmfareReviews ff = new FilmfareReviews();
                                CnnIbn cibn = new CnnIbn();
                                BoxOfficeIndia boi = new BoxOfficeIndia();
                                Dna dna = new Dna();
                                FirstPost fp = new FirstPost();
                                IndianExpress ie = new IndianExpress();
                                KomalNahta kn = new KomalNahta();
                                MidDay md = new MidDay();
                                Ndtv ndtv = new Ndtv();
                                Rajasen rs = new Rajasen();
                                Rediff rdf = new Rediff();
                                Telegraph tg = new Telegraph();
                                TheHindu th = new TheHindu();
                                TimesOfIndia toi = new TimesOfIndia();
                                AnupamaChopra ac = new AnupamaChopra();
                                MumbaiMirror mm = new MumbaiMirror();

                                var reviews = movie.SelectNodes("Review");

                                List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                                foreach (XmlNode review in reviews)
                                {
                                    ReviewEntity duplicateRE = reviewList.Find(r => r.Affiliation == review.Attributes["name"].Value);
                                    if (duplicateRE != null)
                                    {
                                        // We found the duplicate, skip this review to crawl
                                        continue;
                                    }

                                    ReviewEntity re = new ReviewEntity();
                                    string reviewLink = review.Attributes["link"].Value;

                                    switch (review.Attributes["name"].Value.Trim())
                                    {
                                        case "BollywoodHungama":
                                        case "Bollywood Hungama":
                                            re = bh.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Hindustan Times":
                                            re = ht.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Filmfare":
                                            re = ff.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "CNN IBN":
                                        case "CNNIBN":
                                            re = cibn.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Box Office India":
                                            re = boi.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "DNA":
                                            re = dna.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "FirstPost":
                                            re = fp.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Indian Express":
                                            re = ie.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Komal Nahta's Blog":
                                            re = kn.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Mid Day":
                                        case "MidDay":
                                            re = md.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "NDTV":
                                            re = ndtv.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "rajasen.com":
                                            re = rs.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Rediff":
                                            re = rdf.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Telegraph":
                                            re = tg.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "The Hindu":
                                            re = th.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Times of India":
                                            re = toi.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "anupamachopra.com":
                                            re = ac.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Mumbai Mirror":
                                            re = mm.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                    }

                                    if (re == null)
                                        continue;

                                    critics.Add(re.ReviewerName);

                                    // update the IDs - Movie Id, Reviewer Id etc.
                                    string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, review.Attributes["name"].Value);
                                    //re.RowKey = re.ReviewId = new Guid().ToString();
                                    re.ReviewerId = reviewerId;
                                    re.MovieId = mov.MovieId;
                                    re.OutLink = reviewLink;
                                    tblMgr.UpdateReviewById(re);
                                }
                            }
                            catch (Exception)
                            {
                            }
                            #endregion
                            #endregion

                            #region Lucene Search Index
                            List<APIRole.UDT.Cast> casts = json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                            List<String> posters = json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                            List<String> actors = new List<string>();

                            if (casts != null)
                            {
                                foreach (var actor in casts)
                                {
                                    // actor, director, music, producer
                                    string role = actor.role.ToLower();
                                    string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                                    // Check if artist is already present in the list for some other role.
                                    // If yes, skip it. Also if the actor name is missing then skip the artist
                                    if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
                                        continue;

                                    // If we want to showcase main artists and not all, keep the following switch... case.
                                    switch (role)
                                    {
                                        case "actor":
                                            actors.Add(actor.name);
                                            break;
                                        case "producer":
                                            // some times producer are listed as line producer etc.
                                            // We are not interested in those artists as of now?! Hence skipping it
                                            if (characterName == role)
                                            {
                                                actors.Add(actor.name);
                                            }
                                            break;
                                        case "music":
                                        case "director":
                                            // Main music director and movie director does not have associated character name.
                                            // Where as other side directors have associated character name as associate director, assitant director.
                                            // Skipping such cases.
                                            if (string.IsNullOrEmpty(characterName))
                                            {
                                                actors.Add(actor.name);
                                            }
                                            break;
                                    }

                                    // If we want to showcase all the technicians
                                    //actors.Add(actor.name);
                                }
                            }

                            if (posters != null && posters.Count > 0)
                            {
                                posterUrl = posters[posters.Count - 1];
                            }

                            // include reviewer & their affiliation in index file
                            MovieSearchData movieSearchIndex = new MovieSearchData();
                            movieSearchIndex.Id = mov.RowKey;
                            movieSearchIndex.Title = mov.Name;
                            movieSearchIndex.Type = mov.Genre;
                            movieSearchIndex.TitleImageURL = posterUrl;
                            movieSearchIndex.UniqueName = mov.UniqueName;
                            movieSearchIndex.Description = json.Serialize(actors);
                            movieSearchIndex.Critics = json.Serialize(critics);
                            movieSearchIndex.Link = mov.UniqueName;
                            LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                            #endregion
                        }
                        catch (Exception)
                        {
                            Debug.WriteLine("Error while crawling movie - " + movie.Attributes["link"].Value);
                        }
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                Debug.WriteLine("Exception: {0}", ex);
                throw;
            }
        }

All Usage Examples Of Crawler.MovieCrawler::Crawl

MovieCrawler

Crawl

CrawlBollywoodHungamaPosters

CrawlCast

CrawlPosters

CrawlSongs

GetBollywoodHungamaPosters

GetCast

GetPosters

GetSongs

PopulateMovieDetails