/// <summary>
/// Console entry point. Creates the Azure Search index, then for every PDF in the
/// "pdf" directory: extracts its images into the "image" directory, runs OCR over
/// the files found there, and uploads the combined text as one search document.
/// Finishes by running a sample query and waiting for the user to press Enter.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var searchPath = "pdf";
    var outPath = "image";

    // Directory.GetFiles throws DirectoryNotFoundException for a missing folder,
    // so report the problem and stop before the index is touched.
    if (!Directory.Exists(searchPath))
    {
        Console.WriteLine("Input directory '{0}' was not found.", searchPath);
        return;
    }

    // Note, this will create a new Azure Search Index for the OCR text
    Console.WriteLine("Creating Azure Search index...");
    AzureSearch.CreateIndex(serviceClient, indexName);

    // Make sure the scratch directory for extracted images exists
    if (!Directory.Exists(outPath))
    {
        Directory.CreateDirectory(outPath);
    }

    foreach (var filename in Directory.GetFiles(searchPath, "*.pdf", SearchOption.TopDirectoryOnly))
    {
        string fileNameOnly = System.IO.Path.GetFileName(filename);

        Console.WriteLine("Extracting images from {0}", fileNameOnly);
        var images = PdfImageExtractor.ExtractImages(filename);
        Console.WriteLine("{0} images found.", images.Count);
        Console.WriteLine();

        foreach (var name in images.Keys)
        {
            // Skip empty names and names that end in '.' (no usable extension);
            // everything else is written to the scratch directory.
            // NOTE(review): if the extracted image objects are IDisposable they
            // should be disposed after saving - confirm against PdfImageExtractor.
            if (name.LastIndexOf('.') + 1 != name.Length)
            {
                images[name].Save(System.IO.Path.Combine(outPath, name));
            }
        }

        // Read in all the images and convert to text, creating one big text string.
        // A StringBuilder avoids re-copying the accumulated text for every image.
        var ocrText = new System.Text.StringBuilder();
        Console.WriteLine("Extracting text from image...");
        foreach (var imagefilename in Directory.GetFiles(outPath))
        {
            OcrResults ocr = vision.RecognizeText(imagefilename);
            ocrText.Append(vision.GetRetrieveText(ocr));

            // Remove the image so it is not picked up again for the next PDF
            File.Delete(imagefilename);
        }

        // Take the resulting ocrText and upload it to the Azure Search index.
        // It is highly recommended that you upload documents in batches rather
        // than individually, as is done here.
        if (ocrText.Length > 0)
        {
            Console.WriteLine("Uploading extracted text to Azure Search...");

            // Document keys may only contain letters, digits, '_', '-' and '=',
            // so map the two offending Base64 characters ('+' and '/') to the
            // URL-safe alphabet.
            string fileId = System.Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(fileNameOnly))
                .Replace('+', '-')
                .Replace('/', '_');

            AzureSearch.UploadDocuments(indexClient, fileId, fileNameOnly, ocrText.ToString());
        }
    }

    // Execute a test search
    Console.WriteLine("Execute Search...");
    AzureSearch.SearchDocuments(indexClient, "Azure Search");

    Console.WriteLine("All done. Press any key to continue.");
    Console.ReadLine();
}
}