public static CreateIndex ( Microsoft.Azure.Search.SearchServiceClient serviceClient, string indexName ) : void | ||
serviceClient | Microsoft.Azure.Search.SearchServiceClient | |
indexName | string | |
Returns | void |
/// <summary>
/// Creates the search index that holds the OCR text. If an index with the same
/// name already exists it is deleted first, so the new index replaces it.
/// </summary>
/// <param name="serviceClient">Search service client used to check for, delete and create the index.</param>
/// <param name="indexName">Name of the index to (re)create.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="serviceClient"/> is null.</exception>
/// <exception cref="System.ArgumentException"><paramref name="indexName"/> is null, empty or whitespace.</exception>
public static void CreateIndex(SearchServiceClient serviceClient, string indexName)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // (or an SDK-side failure) further down.
    if (serviceClient == null)
    {
        throw new System.ArgumentNullException("serviceClient");
    }

    if (string.IsNullOrWhiteSpace(indexName))
    {
        throw new System.ArgumentException("Index name must not be null or empty.", "indexName");
    }

    // Remove an existing index of the same name before creating the new one.
    if (serviceClient.Indexes.Exists(indexName))
    {
        serviceClient.Indexes.Delete(indexName);
    }

    var definition = new Index()
    {
        Name = indexName,
        Fields = new[]
        {
            // Document key.
            new Field("fileId", DataType.String) { IsKey = true },
            // Searchable text fields; filtering, sorting and faceting are switched off.
            new Field("fileName", DataType.String) { IsSearchable = true, IsFilterable = false, IsSortable = false, IsFacetable = false },
            new Field("ocrText", DataType.String) { IsSearchable = true, IsFilterable = false, IsSortable = false, IsFacetable = false }
        }
    };

    serviceClient.Indexes.Create(definition);
}
/// <summary>
/// Sample entry point: recreates the search index, extracts the images embedded in
/// every PDF in the "pdf" folder, runs OCR over those images, uploads the recognised
/// text for each PDF to the index, and finally executes a test search.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var searchPath = "pdf";
    var outPath = "image";

    // Note, this will create a new Azure Search Index for the OCR text
    Console.WriteLine("Creating Azure Search index...");
    AzureSearch.CreateIndex(serviceClient, indexName);

    // Creating an image directory (CreateDirectory does nothing if it already exists)
    Directory.CreateDirectory(outPath);

    foreach (var filename in Directory.GetFiles(searchPath, "*.pdf", SearchOption.TopDirectoryOnly))
    {
        string fileNameOnly = System.IO.Path.GetFileName(filename);

        Console.WriteLine("Extracting images from {0}", fileNameOnly);
        var images = PdfImageExtractor.ExtractImages(filename);
        Console.WriteLine("{0} images found.", images.Count);
        Console.WriteLine();

        foreach (var name in images.Keys)
        {
            // Skip names that are empty or end with '.'
            if (name.LastIndexOf('.') + 1 != name.Length)
            {
                images[name].Save(System.IO.Path.Combine(outPath, name));
            }
        }

        // Accumulate the text of all images in a builder rather than by repeated
        // string concatenation.
        var ocrText = new System.Text.StringBuilder();
        Console.WriteLine("Extracting text from image...");
        foreach (var imagefilename in Directory.GetFiles(outPath))
        {
            OcrResults ocr = vision.RecognizeText(imagefilename);
            ocrText.Append(vision.GetRetrieveText(ocr));
            // Remove the image once processed so it is not picked up again for the next PDF
            File.Delete(imagefilename);
        }

        // Take the resulting ocrText and upload to a new Azure Search Index
        // It is highly recommended that you upload documents in batches rather
        // than individually like is done here
        if (ocrText.Length > 0)
        {
            Console.WriteLine("Uploading extracted text to Azure Search...");

            // Azure Search document keys may only contain letters, digits, '_', '-' and '='.
            // Standard Base64 can emit '+' and '/', so map them to the URL-safe alphabet.
            string fileId = System.Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(fileNameOnly))
                .Replace('+', '-')
                .Replace('/', '_');

            AzureSearch.UploadDocuments(indexClient, fileId, fileNameOnly, ocrText.ToString());
        }
    }

    // Execute a test search
    Console.WriteLine("Execute Search...");
    AzureSearch.SearchDocuments(indexClient, "Azure Search");

    Console.WriteLine("All done. Press any key to continue.");
    Console.ReadLine();
}