/// <summary>
/// Crawls a single movie listed in the supplied XML document: stores the crawled movie,
/// optionally crawls its posters, crawls every not-yet-stored review listed under the movie
/// node, and finally adds/updates the movie's Lucene search index entry.
/// </summary>
/// <param name="xmlData">
/// XML text whose movie nodes are selected with the XPath <c>Movies/Month/Movie</c>.
/// Nothing is done when it is null or empty.
/// </param>
/// <param name="movieName">
/// Value matched (ignoring case) against each movie node's <c>name</c> attribute; only matching
/// nodes are crawled. Nothing is done when it is null.
/// </param>
/// <remarks>
/// Failures while processing one movie node are logged and do not stop the loop. Any other
/// exception (for example malformed XML) is logged and rethrown to the caller.
/// </remarks>
private void CrawlfromXML(string xmlData, string movieName)
{
    if (string.IsNullOrEmpty(xmlData) || movieName == null) return;

    Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
    JavaScriptSerializer json = new JavaScriptSerializer();

    try
    {
        XmlDocument xdoc = new XmlDocument();
        xdoc.LoadXml(xmlData);

        var movies = xdoc.SelectNodes("Movies/Month/Movie");
        if (movies == null) return;

        foreach (XmlNode movie in movies)
        {
            // We only crawl the requested movie, not every movie present in the XML for the month.
            // Ordinal, case-insensitive comparison avoids culture-dependent ToLower() results, and
            // nodes without a "name" attribute are skipped instead of aborting the whole crawl.
            XmlAttribute nameAttribute = movie.Attributes["name"];
            if (nameAttribute == null ||
                !string.Equals(nameAttribute.Value, movieName, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlAttribute linkAttribute = movie.Attributes["link"];
            if (linkAttribute == null || string.IsNullOrEmpty(linkAttribute.Value))
            {
                continue;
            }

            string movieLink = linkAttribute.Value;

            try
            {
                List<string> critics = new List<string>();

                MovieEntity mov = movieCrawler.Crawl(movieLink);
                if (mov == null)
                {
                    Debug.WriteLine("Error while crawling movie - " + movieLink);
                    continue;
                }

                TableManager tblMgr = new TableManager();

                // Save the crawled content right away (original note: in case of new movies, it fails otherwise).
                tblMgr.UpdateMovieById(mov);

                XmlAttribute santaPosterAttribute = movie.Attributes["santaposterlink"];
                if (santaPosterAttribute != null && !string.IsNullOrEmpty(santaPosterAttribute.Value))
                {
                    XMLMovieProperties prop = new XMLMovieProperties();
                    prop.SantaPosterLink = santaPosterAttribute.Value;
                    prop.MovieName = mov.UniqueName;
                    CrawlPosters(json.Serialize(prop));
                }

                // Without identifiers neither reviews nor the search index entry can be tied to the movie.
                if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId)) continue;

                tblMgr.UpdateMovieById(mov);

                // Review failures are handled (and logged) inside the helper so that the
                // search index below is still updated.
                CrawlReviewsFromXmlNode(movie, mov, tblMgr, critics);

                // JavaScriptSerializer.Deserialize throws on null input, so treat a missing
                // cast/poster payload as "no data" rather than skipping the index update.
                List<APIRole.UDT.Cast> casts = string.IsNullOrEmpty(mov.Cast)
                    ? null
                    : json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                List<String> posters = string.IsNullOrEmpty(mov.Posters)
                    ? null
                    : json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;

                List<String> actors = SelectMainArtistNames(casts);

                // The last poster in the list is used as the title image.
                string posterUrl = string.Empty;
                if (posters != null && posters.Count > 0)
                {
                    posterUrl = posters[posters.Count - 1];
                }

                // Include the main artists and the reviewers in the index entry.
                MovieSearchData movieSearchIndex = new MovieSearchData();
                movieSearchIndex.Id = mov.RowKey;
                movieSearchIndex.Title = mov.Name;
                movieSearchIndex.Type = mov.Genre;
                movieSearchIndex.TitleImageURL = posterUrl;
                movieSearchIndex.UniqueName = mov.UniqueName;
                movieSearchIndex.Description = json.Serialize(actors);
                movieSearchIndex.Critics = json.Serialize(critics);
                movieSearchIndex.Link = mov.UniqueName;

                LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
            }
            catch (Exception ex)
            {
                // Best effort per movie: log the failure (with details) and move on.
                Debug.WriteLine("Error while crawling movie - " + movieLink + ": " + ex);
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception: {0}", ex);
        throw;
    }
}

/// <summary>
/// Crawls every <c>Review</c> child of <paramref name="movie"/> whose affiliation is not already
/// stored for the movie, fills in the movie/reviewer ids and saves each review.
/// Reviewer names of the crawled reviews are appended to <paramref name="critics"/>.
/// Exceptions are logged and not propagated; a failure stops the remaining reviews of this movie.
/// </summary>
private void CrawlReviewsFromXmlNode(XmlNode movie, MovieEntity mov, TableManager tblMgr, List<string> critics)
{
    try
    {
        BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
        HindustanTimesReviews ht = new HindustanTimesReviews();
        FilmfareReviews ff = new FilmfareReviews();
        CnnIbn cibn = new CnnIbn();
        BoxOfficeIndia boi = new BoxOfficeIndia();
        Dna dna = new Dna();
        FirstPost fp = new FirstPost();
        IndianExpress ie = new IndianExpress();
        KomalNahta kn = new KomalNahta();
        MidDay md = new MidDay();
        Ndtv ndtv = new Ndtv();
        Rajasen rs = new Rajasen();
        Rediff rdf = new Rediff();
        Telegraph tg = new Telegraph();
        TheHindu th = new TheHindu();
        TimesOfIndia toi = new TimesOfIndia();
        AnupamaChopra ac = new AnupamaChopra();
        MumbaiMirror mm = new MumbaiMirror();

        var reviews = movie.SelectNodes("Review");
        if (reviews == null) return;

        // A null result is treated as "no reviews stored yet" so new reviews still get crawled.
        List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);
        if (reviewList == null)
        {
            reviewList = new List<ReviewEntity>();
        }

        foreach (XmlNode review in reviews)
        {
            XmlAttribute affiliationAttribute = review.Attributes["name"];
            XmlAttribute reviewLinkAttribute = review.Attributes["link"];
            if (affiliationAttribute == null || reviewLinkAttribute == null)
            {
                // A malformed node must not prevent the remaining reviews from being crawled.
                Debug.WriteLine("Skipping review without name/link attribute for movie - " + mov.MovieId);
                continue;
            }

            // The raw (untrimmed) value is what gets passed to the crawlers and compared
            // against stored affiliations; only the switch below uses the trimmed form.
            string affiliation = affiliationAttribute.Value;
            string reviewLink = reviewLinkAttribute.Value;

            if (reviewList.Exists(r => r.Affiliation == affiliation))
            {
                // We found the duplicate, skip this review to crawl
                continue;
            }

            // NOTE(review): for an affiliation with no matching case below, this empty entity is
            // what gets saved (the null check after the switch does not catch it) - confirm
            // whether unknown affiliations should be skipped instead.
            ReviewEntity re = new ReviewEntity();

            switch (affiliation.Trim())
            {
                case "BollywoodHungama":
                case "Bollywood Hungama":
                    re = bh.Crawl(reviewLink, affiliation);
                    break;
                case "Hindustan Times":
                    re = ht.Crawl(reviewLink, affiliation);
                    break;
                case "Filmfare":
                    re = ff.Crawl(reviewLink, affiliation);
                    break;
                case "CNN IBN":
                case "CNNIBN":
                    re = cibn.Crawl(reviewLink, affiliation);
                    break;
                case "Box Office India":
                    re = boi.Crawl(reviewLink, affiliation);
                    break;
                case "DNA":
                    re = dna.Crawl(reviewLink, affiliation);
                    break;
                case "FirstPost":
                    re = fp.Crawl(reviewLink, affiliation);
                    break;
                case "Indian Express":
                    re = ie.Crawl(reviewLink, affiliation);
                    break;
                case "Komal Nahta's Blog":
                    re = kn.Crawl(reviewLink, affiliation);
                    break;
                case "Mid Day":
                case "MidDay":
                    re = md.Crawl(reviewLink, affiliation);
                    break;
                case "NDTV":
                    re = ndtv.Crawl(reviewLink, affiliation);
                    break;
                case "rajasen.com":
                    re = rs.Crawl(reviewLink, affiliation);
                    break;
                case "Rediff":
                    re = rdf.Crawl(reviewLink, affiliation);
                    break;
                case "Telegraph":
                    re = tg.Crawl(reviewLink, affiliation);
                    break;
                case "The Hindu":
                    re = th.Crawl(reviewLink, affiliation);
                    break;
                case "Times of India":
                    re = toi.Crawl(reviewLink, affiliation);
                    break;
                case "anupamachopra.com":
                    re = ac.Crawl(reviewLink, affiliation);
                    break;
                case "Mumbai Mirror":
                    re = mm.Crawl(reviewLink, affiliation);
                    break;
            }

            if (re == null)
                continue;

            critics.Add(re.ReviewerName);

            // update the IDs - Movie Id, Reviewer Id etc.
            string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, affiliation);
            re.ReviewerId = reviewerId;
            re.MovieId = mov.MovieId;
            re.OutLink = reviewLink;

            tblMgr.UpdateReviewById(re);
        }
    }
    catch (Exception ex)
    {
        // Review crawling is best effort, but the failure must at least be visible.
        Debug.WriteLine("Error while crawling reviews for movie - " + mov.MovieId + ": " + ex);
    }
}

/// <summary>
/// Returns the distinct names of the main artists in <paramref name="casts"/>: every "actor",
/// a "producer" whose character name equals the role, and a "music"/"director" entry that has
/// no character name. Entries with a missing name (or the literal "null") are ignored.
/// Returns an empty list when <paramref name="casts"/> is null.
/// </summary>
private static List<string> SelectMainArtistNames(List<APIRole.UDT.Cast> casts)
{
    List<string> actors = new List<string>();
    if (casts == null) return actors;

    foreach (var actor in casts)
    {
        if (actor == null) continue;

        // actor, director, music, producer.
        // Invariant lower-casing keeps the switch labels matching under any culture;
        // a missing role simply matches no case.
        string role = actor.role == null ? string.Empty : actor.role.ToLowerInvariant();
        string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

        // Skip artists already listed for some other role, and artists without a name.
        if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
            continue;

        // Only the main artists are showcased, not every technician.
        switch (role)
        {
            case "actor":
                actors.Add(actor.name);
                break;
            case "producer":
                // Sometimes producers are listed as line producer etc.;
                // those are not of interest for now, hence skipped.
                if (characterName == role)
                {
                    actors.Add(actor.name);
                }
                break;
            case "music":
            case "director":
                // The main music director and movie director have no associated character name,
                // whereas associate/assistant directors do. Skip the latter.
                if (string.IsNullOrEmpty(characterName))
                {
                    actors.Add(actor.name);
                }
                break;
        }
    }

    return actors;
}