Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
File encoding and code page recognition
#3
The QM code looks more concise and easier to write; my C# programming skills are not very good. :)

I looked up some examples and got the code below

In QM, how can I get the code page of a text's encoding?

I found some C code that detects the code page:

Macro Macro12
Code:
Copy      Help
_s.getfile("$desktop$\Test.txt") ;;load the file; its text is assumed to be GB2312 (code page 936)

;TODO: detect the code page of the text instead of assuming 936

_s.ConvertEncoding(936 65001) ;;convert GB2312 (936) to UTF-8 (65001)
_s.findreplace("测试" "正式") ;;replace every "测试" with "正式"
_s.ConvertEncoding(65001 936) ;;convert UTF-8 (65001) back to GB2312 (936)
_s.setfile("$desktop$\Test_ok.txt") ;;save the result to a new file


C code for code page
Code:
Copy      Help
 
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

bool is_str_utf8(const char* str);
bool is_str_gbk(const char* str);

/*
 * Check whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * Validation follows RFC 3629: a character is encoded in 1 to 4 bytes.
 * The following are rejected:
 *   - lead bytes 0x80..0xC1 and 0xF5..0xFF (stray continuation bytes,
 *     overlong 2-byte forms, the obsolete 5/6-byte forms, 0xFE/0xFF),
 *   - overlong 3- and 4-byte forms (E0 80..9F, F0 80..8F),
 *   - UTF-16 surrogates (ED A0..BF),
 *   - code points above U+10FFFF (F4 90..),
 *   - sequences truncated by the end of the string.
 *
 * str: NUL-terminated byte string to examine; NULL is treated as invalid.
 *
 * Returns true if every byte belongs to a valid sequence. Pure ASCII and
 * the empty string are valid UTF-8 and therefore return true.
 */
bool is_str_utf8(const char* str)
{
    if (str == NULL) {
        return false;
    }

    /* Work on unsigned bytes so comparisons with 0x80.. are well defined
     * regardless of whether plain char is signed. */
    const unsigned char *p = (const unsigned char *)str;

    while (*p != '\0') {
        unsigned char lead = *p++;
        unsigned int trailing;
        /* Allowed range of the FIRST continuation byte; narrowed below for
         * the lead bytes that would otherwise permit overlong forms,
         * surrogates or values beyond U+10FFFF. */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead < 0x80) {
            continue;                       /* 0xxxxxxx: ASCII */
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            trailing = 1;                   /* 110xxxxx (C0/C1 are overlong) */
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;                   /* 1110xxxx */
            if (lead == 0xE0) {
                lo = 0xA0;                  /* exclude overlong forms */
            } else if (lead == 0xED) {
                hi = 0x9F;                  /* exclude U+D800..U+DFFF */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;                   /* 11110xxx */
            if (lead == 0xF0) {
                lo = 0x90;                  /* exclude overlong forms */
            } else if (lead == 0xF4) {
                hi = 0x8F;                  /* exclude > U+10FFFF */
            }
        } else {
            return false;                   /* 80..C1 or F5..FF */
        }

        for (unsigned int k = 0; k < trailing; ++k) {
            /* A terminating NUL fails the range check, so a sequence cut
             * short by the end of the string is rejected here. */
            unsigned char c = *p;
            if (c < lo || c > hi) {
                return false;
            }
            ++p;
            /* Only the first continuation byte has a narrowed range. */
            lo = 0x80;
            hi = 0xBF;
        }
    }

    return true;
}

/*
 * Check whether a NUL-terminated byte string is structurally valid GBK
 * (code page 936, a superset of GB2312).
 *
 * Accepted byte patterns:
 *   - a single byte 0x00..0x7F (ASCII), or
 *   - a lead byte 0x81..0xFE followed by a trail byte 0x40..0x7E or
 *     0x80..0xFE (0x7F is not a valid trail byte).
 *
 * This is a structural check only: it does not verify that each two-byte
 * pair is actually assigned a character, and GB18030 four-byte sequences
 * are rejected.
 *
 * str: NUL-terminated byte string to examine; NULL is treated as invalid.
 *
 * Returns true if the whole string matches the patterns above. Pure ASCII
 * and the empty string return true.
 */
bool is_str_gbk(const char* str)
{
    if (str == NULL) {
        return false;
    }

    /* Unsigned bytes keep the range comparisons independent of the
     * signedness of plain char. */
    const unsigned char *p = (const unsigned char *)str;

    while (*p != '\0') {
        unsigned char lead = *p++;

        if (lead < 0x80) {
            continue;                       /* single-byte ASCII */
        }
        if (lead < 0x81 || lead > 0xFE) {
            return false;                   /* 0x80 and 0xFF are never lead bytes */
        }

        /* A terminating NUL is below 0x40, so a lead byte at the very end
         * of the string (truncated pair) is rejected here. */
        unsigned char trail = *p;
        if (trail < 0x40 || trail > 0xFE || trail == 0x7F) {
            return false;
        }
        ++p;
    }

    return true;
}

/*
 * Read everything remaining in `file` into a freshly allocated,
 * NUL-terminated buffer.
 *
 * file:    open stream to read from.
 * out_len: receives the number of bytes read (terminator not counted).
 *
 * Returns the buffer (caller must free() it), or NULL on allocation
 * failure or read error.
 */
static char *read_whole_stream(FILE *file, size_t *out_len)
{
    size_t cap = 4096;
    size_t len = 0;
    char *buf = malloc(cap);

    if (buf == NULL) {
        return NULL;
    }

    for (;;) {
        /* Always leave one byte free for the terminating NUL. */
        size_t got = fread(buf + len, 1, cap - len - 1, file);
        len += got;
        if (got == 0) {
            break;                          /* end of file or read error */
        }
        if (len + 1 == cap) {
            /* Buffer is full: double it, guarding the size computation. */
            if (cap > (size_t)-1 / 2) {
                free(buf);
                return NULL;
            }
            char *grown = realloc(buf, cap * 2);
            if (grown == NULL) {
                free(buf);
                return NULL;
            }
            buf = grown;
            cap *= 2;
        }
    }

    if (ferror(file)) {
        free(buf);
        return NULL;
    }

    buf[len] = '\0';
    *out_len = len;
    return buf;
}

/*
 * Print the contents of a file, one whitespace-delimited token per line,
 * followed by two lines: the result of is_str_utf8() and the result of
 * is_str_gbk() (1 or 0 each), evaluated over the WHOLE file content.
 *
 * The file is opened in binary mode so that the bytes examined are
 * exactly the bytes stored on disk. The content is read into memory in
 * one piece so that a multibyte character can never be split across
 * buffer boundaries before it is checked.
 *
 * Note: the encoding checks operate on C strings, so if the file contains
 * a NUL byte only the part before it is checked.
 *
 * file_name: path of the file to read. If it cannot be opened or read,
 *            a diagnostic is written to stderr and nothing is printed.
 */
void read_text(const char* file_name)
{
    FILE *file = fopen(file_name, "rb");
    if (!file) {
        perror(file_name);
        return;
    }

    size_t len = 0;
    char *text = read_whole_stream(file, &len);
    fclose(file);
    if (text == NULL) {
        fprintf(stderr, "%s: failed to read file\n", file_name);
        return;
    }

    /* Echo each whitespace-separated token on its own line. */
    size_t pos = 0;
    while (pos < len) {
        while (pos < len && isspace((unsigned char)text[pos])) {
            ++pos;
        }
        size_t start = pos;
        while (pos < len && !isspace((unsigned char)text[pos])) {
            ++pos;
        }
        if (pos > start) {
            fwrite(text + start, 1, pos - start, stdout);
            putchar('\n');
        }
    }

    printf("%d\n", is_str_utf8(text));
    printf("%d\n", is_str_gbk(text));

    free(text);
}

/*
 * Test driver: runs read_text() on "test.txt" in the current working
 * directory and reports success to the caller.
 */
int main()
{
    const char *input_path = "test.txt";

    read_text(input_path);
    return EXIT_SUCCESS;
}


Messages In This Thread
RE: File encoding and code page recognition - by Davider - 07-21-2022, 09:36 PM

Forum Jump:


Users browsing this thread: 1 Guest(s)