Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
File encoding and code page recognition
#2
I see that the GuessCoder class is written in C#, and the PowerShell code uses .NET. In that case it is better to use the new program. It is very similar to QM, but its script language is C#, so you would not need to learn the QM language or convert the class. It is also much easier to convert PowerShell to C# than to QM.

C# code:
// script ""
// Reads Test.txt from the desktop in its detected encoding, replaces one phrase,
// and writes the result to Test_ok.txt using that same encoding.
var sourcePath = folders.Desktop + @"Test.txt";
var targetPath = folders.Desktop + @"Test_ok.txt";

// Guess the encoding name first; the code-pages provider is registered before
// GetEncoding so that non-Unicode names returned by the detector can be resolved.
var encodingName = GuessCoder.Detect(sourcePath);
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
var encoding = Encoding.GetEncoding(encodingName);
print.it(encodingName, encoding);

// Decode, substitute, and re-encode with the encoding the file was read in.
var updatedText = File.ReadAllText(sourcePath, encoding).Replace("测试", "正式");
File.WriteAllText(targetPath, updatedText, encoding);

/// <summary>
/// Heuristic detector that distinguishes BOM-marked Unicode files, BOM-less UTF-8,
/// and (as the fallback) GB2312 text.
/// </summary>
public static class GuessCoder
{
    /// <summary>
    /// Guesses the text encoding of a file from its raw bytes.
    /// </summary>
    /// <param name="file">Path of the file to inspect. The whole file is read into memory.</param>
    /// <returns>
    /// <c>"Unicode"</c> for a UTF-16 little-endian BOM, <c>"UTF-16BE"</c> for a UTF-16
    /// big-endian BOM, <c>"UTF-8"</c> for a UTF-8 BOM or for BOM-less content whose bytes
    /// are structurally valid UTF-8 (including empty and pure-ASCII files), and
    /// <c>"GB2312"</c> for everything else.
    /// </returns>
    /// <remarks>
    /// "GB2312" is only the fallback answer: the bytes are never validated against that
    /// code page, so any non-UTF-8 content is reported as GB2312.
    /// </remarks>
    public static string Detect(string file)
    {
        byte[] data = System.IO.File.ReadAllBytes(file);

        // The length checks are ">=" so that a file consisting solely of a BOM
        // (for example an empty document saved as UTF-16) is still recognized.
        if (data.Length >= 2 && data[0] == 0xFF && data[1] == 0xFE)
        {
            return "Unicode";
        }

        if (data.Length >= 2 && data[0] == 0xFE && data[1] == 0xFF)
        {
            return "UTF-16BE";
        }

        if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
        {
            return "UTF-8";
        }

        return IsStructurallyUtf8(data) ? "UTF-8" : "GB2312";
    }

    /// <summary>
    /// Checks that every non-ASCII byte run has the shape of a UTF-8 sequence:
    /// a lead byte announcing 2 to 4 bytes followed by the right number of
    /// continuation bytes (10xxxxxx). Code point ranges are not validated.
    /// </summary>
    private static bool IsStructurallyUtf8(byte[] data)
    {
        // Number of continuation bytes still owed by the current sequence.
        int pending = 0;

        foreach (byte current in data)
        {
            if (pending > 0)
            {
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }

                pending--;
                continue;
            }

            if (current < 0x80)
            {
                // Plain ASCII byte.
                continue;
            }

            // The count of leading 1 bits in a lead byte is the sequence length.
            int sequenceLength = 0;
            for (int mask = 0x80; mask != 0 && (current & mask) != 0; mask >>= 1)
            {
                sequenceLength++;
            }

            // 1 leading bit is a stray continuation byte; more than 4 is not
            // UTF-8 (RFC 3629 limits sequences to 4 bytes).
            if (sequenceLength < 2 || sequenceLength > 4)
            {
                return false;
            }

            pending = sequenceLength - 1;
        }

        // A sequence cut off by the end of the file is not valid UTF-8.
        return pending == 0;
    }
}


Messages In This Thread
RE: File encoding and code page recognition - by Gintaras - 07-21-2022, 06:57 PM

Forum Jump:


Users browsing this thread: 1 Guest(s)