Strabo.Core.OCR.WrapperTesseract.ExtractTextToGEOJSON C# (CSharp) Method

ExtractTextToGEOJSON() public method

Extracts text from images using Tesseract and creates a GeoJSON file containing all the recognized features.
public ExtractTextToGEOJSON ( string inputPath, string outputPath, string TesseractResultsJSONFileName ) : void
inputPath string
outputPath string
TesseractResultsJSONFileName string
return void
        /// <summary>
        /// Runs OCR on every "*.png" file found in <paramref name="inputPath"/> and writes the
        /// recognized text regions as polygon features to a JSON file in <paramref name="outputPath"/>.
        /// </summary>
        /// <remarks>
        /// Only files whose name splits into exactly 11 underscore-separated tokens are processed.
        /// The tokens are read positionally: [0] object id, [2] character count, [3]/[4] mass
        /// center X/Y, [6] orientation, [7]..[10] bounding box x, y, width and height.
        /// When two files share the same object id, only the one with the lower accumulated
        /// recognition cost is kept. Nothing is written when the directory contains no PNG files
        /// or when no region survives the filters.
        /// </remarks>
        /// <param name="inputPath">Directory that is searched for "*.png" files.</param>
        /// <param name="outputPath">Directory that receives the result file.</param>
        /// <param name="TesseractResultsJSONFileName">File name of the result file inside <paramref name="outputPath"/>.</param>
        public void ExtractTextToGEOJSON(string inputPath, string outputPath, string TesseractResultsJSONFileName)
        {
            string[] filePaths = Directory.GetFiles(inputPath, "*.png");
            if (filePaths.Length == 0)
                return;
            int noOfFiles = filePaths.Length;
            string[] text = new string[noOfFiles];
            float[] textCost = new float[noOfFiles];

            // -1 marks "no usable region in this slot". Starting from -1 instead of the default 0
            // keeps files that were skipped below from being mistaken for a duplicate of a region
            // whose real id is 0.
            int[] imageIds = new int[noOfFiles];
            for (int i = 0; i < noOfFiles; i++)
                imageIds[i] = -1;

            try
            {
                Console.WriteLine("Tessearct in progress...");
                for (int i = 0; i < noOfFiles; i++)
                {
                    string filename = Path.GetFileName(filePaths[i]);
                    string[] splitTokens = filename.Split('_');
                    if (splitTokens.Length != 11)
                        continue;

                    // Both images are disposed as soon as the recognition of this file is done.
                    using (Image<Bgr, byte> image = new Image<Bgr, byte>(filePaths[i]))
                    using (Image<Gray, byte> gray = image.Convert<Gray, byte>())
                    {
                        _ocr.Recognize(gray);
                        Tesseract.Charactor[] recog_char = _ocr.GetCharactors();

                        // Concatenate the recognized characters and sum up their costs.
                        System.Text.StringBuilder recognized = new System.Text.StringBuilder();
                        textCost[i] = 0;
                        for (int charCount = 0; charCount < recog_char.Length; charCount++)
                        {
                            recognized.Append(recog_char[charCount].Text);
                            textCost[i] += recog_char[charCount].Cost;
                        }
                        text[i] = recognized.ToString();

                        // Duplicate handling: if an earlier file carries the same id, invalidate
                        // whichever of the two has the higher cost (the earlier one on a tie).
                        imageIds[i] = int.Parse(splitTokens[0]);
                        int index = Array.IndexOf(imageIds, imageIds[i]);
                        if (index != -1 && index != i)
                        {
                            if (textCost[index] < textCost[i])
                                imageIds[i] = -1;
                            else
                                imageIds[index] = -1;
                        }
                    }
                }
            }
            catch (Exception exception)
            {
                Log.WriteLine(exception.Message);
                Log.WriteLine(exception.ToString());
                // Rethrow without resetting the stack trace.
                throw;
            }

            try
            {
                // Static header of the feature collection: aliases, spatial reference and field schema.
                FeatureInJSON jsonFeatures = new FeatureInJSON();
                jsonFeatures.displayFieldName = "";
                jsonFeatures.geometryType = "esriGeometryPolygon";
                jsonFeatures.fieldAliases.OBJECTID = "OBJECTID";
                jsonFeatures.fieldAliases.Filename = "Filename";
                jsonFeatures.fieldAliases.Orientation = "Orientation";
                jsonFeatures.fieldAliases.Text = "Text";
                jsonFeatures.fieldAliases.Susp_char_count = "Susp_char_count";
                jsonFeatures.fieldAliases.Susp_text = "Susp_text";
                jsonFeatures.fieldAliases.Mass_centerX = "Mass_centerX";
                jsonFeatures.fieldAliases.Mass_centerY = "Mass_centerY";
                jsonFeatures.fieldAliases.Char_count = "Char_count";
                jsonFeatures.fieldAliases.DetectionCost = "DetectionCost";
                jsonFeatures.spatialReference.latestWkid = 0;
                jsonFeatures.spatialReference.wkid = 0;
                jsonFeatures.fields[0].name = "OBJECTID";
                jsonFeatures.fields[0].type = "esriFieldTypeOID";
                jsonFeatures.fields[0].alias = "OBJECTID";
                jsonFeatures.fields[0].length = 0;
                jsonFeatures.fields[1].name = "Text";
                jsonFeatures.fields[1].type = "esriFieldTypeString";
                jsonFeatures.fields[1].alias = "Text";
                jsonFeatures.fields[1].length = 80;
                jsonFeatures.fields[2].name = "Char_count";
                jsonFeatures.fields[2].type = "esriFieldTypeInteger";
                jsonFeatures.fields[2].alias = "Char_count";
                jsonFeatures.fields[2].length = 0;
                jsonFeatures.fields[3].name = "Orientation";
                jsonFeatures.fields[3].type = "esriFieldTypeDouble";
                jsonFeatures.fields[3].alias = "Orientation";
                jsonFeatures.fields[3].length = 0;
                jsonFeatures.fields[4].name = "Filename";
                jsonFeatures.fields[4].type = "esriFieldTypeString";
                jsonFeatures.fields[4].alias = "Filename";
                jsonFeatures.fields[4].length = 80;
                jsonFeatures.fields[5].name = "Susp_text";
                jsonFeatures.fields[5].type = "esriFieldTypeString";
                jsonFeatures.fields[5].alias = "Susp_text";
                jsonFeatures.fields[5].length = 80;
                jsonFeatures.fields[6].name = "Susp_char_count";
                jsonFeatures.fields[6].type = "esriFieldTypeInteger";
                jsonFeatures.fields[6].alias = "Susp_char_count";
                jsonFeatures.fields[6].length = 0;
                jsonFeatures.fields[7].name = "Mass_centerX";
                jsonFeatures.fields[7].type = "esriFieldTypeDouble";
                jsonFeatures.fields[7].alias = "Mass_centerX";
                jsonFeatures.fields[7].length = 0;
                jsonFeatures.fields[8].name = "Mass_centerY";
                jsonFeatures.fields[8].type = "esriFieldTypeDouble";
                jsonFeatures.fields[8].alias = "Mass_centerY";
                jsonFeatures.fields[8].length = 0;
                jsonFeatures.fields[9].name = "DetectionCost";
                jsonFeatures.fields[9].type = "esriFieldTypeDouble";
                jsonFeatures.fields[9].alias = "DetectionCost";
                jsonFeatures.fields[9].length = 0;

                // Loop invariants, created once instead of once per file.
                char[] separator = new char[] { '_', '.' };
                // Matches runs of four or more consecutive characters that are neither a letter
                // (\p{L}, any language) nor a decimal digit (\p{Nd}).
                Regex nonWords = new Regex("[^\\p{L}\\p{Nd}]{3}[^\\p{L}\\p{Nd}]+");

                Console.WriteLine("Converting to JSON...");
                for (int i = 0; i < noOfFiles; i++)
                {
                    // Skip invalidated duplicates, files without any recognition cost and empty paths.
                    if (imageIds[i] == -1 || textCost[i] == 0 || filePaths[i] == "")
                        continue;
                    string filename = Path.GetFileName(filePaths[i]);
                    string[] token = filename.Split(separator, StringSplitOptions.RemoveEmptyEntries);

                    text[i] = nonWords.Replace(text[i], "");
                    if (text[i].Contains("1 1 1")) // dashed lines
                        continue;
                    if (alphnumericratio(text[i]) == false)
                        continue;

                    // NOTE(review): Convert.ToInt16 throws OverflowException for values above 32767
                    // (the id in the first pass is parsed with int.Parse). Widening is left to a
                    // change that can also see the attribute field types.
                    Features ftr = new Features();
                    ftr.attributes.OBJECTID = Convert.ToInt16(token[0]);
                    ftr.attributes.Text = text[i];
                    ftr.attributes.Char_count = Convert.ToInt16(token[2]);
                    ftr.attributes.Orientation = Convert.ToInt16(token[6]);
                    ftr.attributes.Filename = filename;
                    ftr.attributes.Susp_char_count = 0;
                    ftr.attributes.Susp_text = "";
                    ftr.attributes.Mass_centerX = Convert.ToInt16(token[3]);
                    ftr.attributes.Mass_centerY = Convert.ToInt16(token[4]);
                    ftr.attributes.DetectionCost = textCost[i];

                    // Bounding box as a closed five-point ring; the y axis is negated.
                    int x = Convert.ToInt16(token[7]);
                    int y = Convert.ToInt16(token[8]);
                    int w = Convert.ToInt16(token[9]);
                    int h = Convert.ToInt16(token[10]);
                    ftr.geometry.rings[0, 0, 0] = x;
                    ftr.geometry.rings[0, 0, 1] = -y;
                    ftr.geometry.rings[0, 1, 0] = x + w;
                    ftr.geometry.rings[0, 1, 1] = -y;
                    ftr.geometry.rings[0, 2, 0] = x + w;
                    ftr.geometry.rings[0, 2, 1] = -y - h;
                    ftr.geometry.rings[0, 3, 0] = x;
                    ftr.geometry.rings[0, 3, 1] = -y - h;
                    ftr.geometry.rings[0, 4, 0] = x;
                    ftr.geometry.rings[0, 4, 1] = -y;

                    jsonFeatures.features.Add(ftr);

                    // NOTE(review): the whole collection is written again after every added feature.
                    // Presumably writeJsonFile overwrites the file, in which case a single write
                    // after the loop would be enough - confirm before moving it.
                    GeoJson geoJson = new GeoJson();
                    geoJson.featureInJson = jsonFeatures;
                    geoJson.writeJsonFile(outputPath + "\\" + TesseractResultsJSONFileName);
                }
            }
            catch (Exception exception)
            {
                Log.WriteLine(exception.Message);
                Log.WriteLine(exception.ToString());
                // Rethrow without resetting the stack trace.
                throw;
            }
        }

Usage Example

        /// <summary>
        /// Click handler that runs the text recognition workflow for the raster layer selected
        /// in the layer combo box: it validates the environment and the selected language,
        /// runs text layer extraction and text identification, performs OCR via
        /// <c>WrapperTesseract.ExtractTextToGEOJSON</c> and finally adds the OCR result as a
        /// polygon feature layer. Every early exit shows a message box describing the problem.
        /// </summary>
        protected override void OnClick()
        {
            string straboPath = Environment.GetEnvironmentVariable(ArcStrabo10Extension.EnvironmentVariableSTRABO_HOME, EnvironmentVariableTarget.User);
            string tessPath = Environment.GetEnvironmentVariable(ArcStrabo10Extension.EnvironmentVariableTESS_DATA, EnvironmentVariableTarget.User);

            if (ArcStrabo10Extension.PathSet == false)
            {
                if (String.IsNullOrEmpty(straboPath))
                {
                    MessageBox.Show(ArcStrabo10Extension.ErrorMsgNoStraboHome);
                    return;
                }
                if (String.IsNullOrEmpty(tessPath))
                {
                    MessageBox.Show(ArcStrabo10Extension.ErrorMsgNoTess_Data);
                    return;
                }

                ////Initialize directories
                bool Initialize_straboPath_Correct = ArcStrabo10Extension.initialize_straboPath_directories(straboPath);
                if (Initialize_straboPath_Correct == false)
                {
                    MessageBox.Show(ArcStrabo10Extension.ErrorMsgNoStraboHomeWritePermission);
                    return;
                }
                ArcStrabo10Extension.PathSet = true;
            }

            #region Text Recognition
            ComboBoxLayerSelector layerNameCombo = ComboBoxLayerSelector.GetLayerNameComboBox();

            ////Select correct raster map layer
            RasterLayer rasterlayer = (RasterLayer)layerNameCombo.GetSelectedLayer();
            string input_data_source_directory;
            try
            {
                // Throws when no layer is selected (rasterlayer is null); handled below.
                input_data_source_directory = rasterlayer.FilePath;
            }
            catch (Exception)
            {
                // Handle no input map error
                MessageBox.Show(ArcStrabo10Extension.ErrorMsgNoInputMap, "Input Map Error", MessageBoxButtons.OK);
                return;
            }

            ////Select language from combo box in toolbar
            ComboBoxLanguageSelector languageNameCombo = ComboBoxLanguageSelector.GetLanguageNameComboBox();
            string lng = languageNameCombo.Get_selected_language();
            if (lng == null)
            {
                MessageBox.Show(ArcStrabo10Extension.ErrorMsgNoInputLanguage, "Input Language Error", MessageBoxButtons.OK);
                return;
            }

            ////Set Log Directory Path
            Log.SetLogDir(ArcStrabo10Extension.Log_Path);
            Log.SetOutputDir(ArcStrabo10Extension.Log_Path);

            Log.WriteLine("MakingTextLabelGeoJsonFile Method Start SIMA");
            ArcStraboObject arcStraboObject = new ArcStraboObject();
            arcStraboObject.MakingTextLabelGeoJsonFile(ArcStrabo10Extension.Text_Result_Path);
            Log.WriteLine("MakingTextLabelGeoJsonFile Method Finish");

            ////Run TextExtraction Layer from Strabo.core and load raster Layer
            Log.WriteLine("textLayerExtract Medthod Start SIMA");
            arcStraboObject.textLayerExtract(input_data_source_directory, ArcStrabo10Extension.Text_Result_Path);
            Log.WriteLine("textLayerExtract Method Finish");

            Log.WriteLine("AddRasterLayer Method Start SIMA");
            arcStraboObject.AddRasterLayer(ArcStrabo10Extension.Text_Result_Path, ArcStrabo10Extension.TextLayerPNGFileName);
            Log.WriteLine("AddRasterLayer Method Finish");

            ////Run TextIdentifier Method
            Log.WriteLine("textIndentification Method Start SIMA");
            System.Windows.Forms.Cursor.Current = Cursors.WaitCursor;
            try
            {
                arcStraboObject.textIndentification(ArcStrabo10Extension.Text_Result_Path + "\\", ArcStrabo10Extension.Intermediate_Result_Path + "\\", ArcStrabo10Extension.TextLayerPNGFileName);
            }
            finally
            {
                // Restore the cursor even when the identification step throws.
                System.Windows.Forms.Cursor.Current = Cursors.Default;
            }
            Log.WriteLine("textIndentification Method Finish");

            ////OCR Part
            Log.WriteLine("ExtractTextToGEOJSON Method Start SANJUALI");
            System.Windows.Forms.Cursor.Current = Cursors.AppStarting;
            try
            {
                // lng was validated above and has not changed since, so no second check is needed.
                Strabo.Core.OCR.WrapperTesseract tesseract = new Strabo.Core.OCR.WrapperTesseract(tessPath, lng);
                tesseract.ExtractTextToGEOJSON(ArcStrabo10Extension.Intermediate_Result_Path, ArcStrabo10Extension.Text_Result_Path, ArcStrabo10Extension.TesseractResultsJSONFileName);
                Log.WriteLine("ExtractTextToGEOJSON Method Finish");
            }
            finally
            {
                // Restore the cursor even when OCR throws.
                System.Windows.Forms.Cursor.Current = Cursors.Default;
            }

            ////Add Polygon of OCR Layer
            Log.WriteLine("CreateFeatureClassWithFields Method Start SIMA");
            IWorkspace workspace = arcStraboObject.CreateShapefileWorkspace(ArcStrabo10Extension.Text_Result_Path);
            IFeatureWorkspace featureworkspace = (IFeatureWorkspace)workspace;
            // Same location ExtractTextToGEOJSON wrote its result file to.
            string tesseDataPath = ArcStrabo10Extension.Text_Result_Path + "\\" + ArcStrabo10Extension.TesseractResultsJSONFileName;

            IFeatureClass featureClass = arcStraboObject.CreateFeatureClassWithFields(ArcStrabo10Extension.TextLayerOCRShapefile, featureworkspace, tesseDataPath);
            IFeatureLayer featureLayer = arcStraboObject.CreateFeatureLayer(featureClass);
            Log.WriteLine("CreateFeatureClassWithFields Method Finish");

            Log.WriteLine("AddPolygon Method Start");
            arcStraboObject.AddPolygon(featureLayer, featureworkspace, tesseDataPath);
            Log.WriteLine("AddPolygon Method Finish");

            Log.ArchiveLog();
            MessageBox.Show("Text recognition finished!", "Done", MessageBoxButtons.OK);
            #endregion
        }
All Usage Examples Of Strabo.Core.OCR.WrapperTesseract::ExtractTextToGEOJSON