Reading text files in Unicode format Reference material

// Description:
//   Converts a NUL-terminated ANSI (CP_ACP) string to a wide-character
//   (Unicode) string.
//
// Parameters:
//   lpcszStr: [in]  Pointer to the NUL-terminated string to be converted.
//   lpwszStr: [out] Pointer to the buffer that receives the converted string.
//   dwSize:   [in]  Size of lpwszStr in wide characters (not bytes).
//
// Return value:
//   TRUE on success. FALSE if an argument is NULL, the conversion fails, or
//   the buffer is too small for the result including its terminating NUL.
//
// Example: MByteToWChar(szA, szW, sizeof(szW) / sizeof(szW[0]));
//
bool CLyjClass::MByteToWChar(LPCSTR lpcszStr, LPWSTR lpwszStr, DWORD dwSize)
{
    if (lpcszStr == NULL || lpwszStr == NULL)
    {
        return FALSE;
    }

    // With a NULL output buffer the call only reports the required size in
    // wide characters (terminator included); 0 means the conversion failed.
    const int nRequired = MultiByteToWideChar(CP_ACP, 0, lpcszStr, -1, NULL, 0);
    if (nRequired <= 0 || dwSize < static_cast<DWORD>(nRequired))
    {
        return FALSE;
    }

    // Perform the actual conversion and report its outcome instead of
    // assuming success.
    return MultiByteToWideChar(CP_ACP, 0, lpcszStr, -1, lpwszStr, nRequired) > 0;
}

// Description:
//   Converts a NUL-terminated wide-character (Unicode) string to a multibyte
//   string in the ANSI code page (CP_ACP), the same code page MByteToWChar
//   converts from, so the two functions round-trip.
//
// Parameters:
//   lpcwszStr: [in]  Pointer to the NUL-terminated wide string to be converted.
//   lpszStr:   [out] Pointer to the buffer that receives the converted string.
//   dwSize:    [in]  Size of lpszStr in bytes.
//
// Return value:
//   TRUE on success. FALSE if an argument is NULL, the conversion fails, or
//   the buffer is too small for the result including its terminating NUL.
//
// Example: WCharToMByte(szW, szA, sizeof(szA) / sizeof(szA[0]));
//
bool CLyjClass::WCharToMByte(LPCWSTR lpcwszStr, LPSTR lpszStr, DWORD dwSize)
{
    if (lpcwszStr == NULL || lpszStr == NULL)
    {
        return FALSE;
    }

    // With a NULL output buffer the call only reports the required size in
    // bytes (terminator included); 0 means the conversion failed.
    const int nRequired = WideCharToMultiByte(CP_ACP, 0, lpcwszStr, -1, NULL, 0, NULL, NULL);
    if (nRequired <= 0 || dwSize < static_cast<DWORD>(nRequired))
    {
        return FALSE;
    }

    // Perform the actual conversion and report its outcome instead of
    // assuming success.
    return WideCharToMultiByte(CP_ACP, 0, lpcwszStr, -1, lpszStr, nRequired, NULL, NULL) > 0;
}

///
//Convert unicode encoding to CString
// Return value: TRUE if conversion is successful, otherwise FALSE
//
bool CLyjClass::UnicodeToCString(unsigned short unicode, CString & amp;str)
{
TCHAR tch[1];
char ch[3];
ZeroMemory(tch, _tcslen(tch)* sizeof(TCHAR));
ZeroMemory(ch, sizeof(ch));

tch[0] = unicode;
if(WCharToMByte(tch, ch, sizeof(ch)) )
{
str = ch;
return TRUE;
}
else
return FALSE;
}


int CPrj_ReadUnicodeDlg::ReadTxtFile(const CString & strFileName)
{
CString strUnicode;
CStringArray strUnicodeArray;
CLyjClass myClass;
DWORD dwFileLength;
TCHAR *pch= NULL;

// read text
CFile file;
if (!file.Open(strFileName, CFile::modeRead))
{
return FALSE;
}

dwFileLength = file.GetLength();
pch = new TCHAR[dwFileLength];
ZeroMemory(pch, dwFileLength);
file.Read(pch, dwFileLength);
file.Close();

// Is it unicode text?
if (pch[0] != 0xFEFF)
{
MsgBox(_T("The text read is not Unicode encoding, and this system does not provide support for the time being"));
return FALSE;
}

// Read the first line
GetAllLineTxt(strUnicodeArray, pch, dwFileLength/2);
GetAllStrUnicode(strUnicodeArray);
CWordArray unicodeArray;
ConvertAllUnicode(unicodeArray, strUnicodeArray);

//Control displays each line
m_ctlList.ResetContent();
for (int j = 0; j < strUnicodeArray.GetSize(); j + + )
{
m_ctlList.InsertString(j, strUnicodeArray.GetAt(j));
}

if (pch != NULL)
delete [] pch;

return TRUE;
}

void CPrj_ReadUnicodeDlg::MsgBox(const CString & strMsg)
{
MessageBox(strMsg, _T("Prompt message"), MB_OK | MB_ICONINFORMATION);
}

void CPrj_ReadUnicodeDlg::OnBtnConvert()
{
// TODO: Add your control notification handler code here
UpdateData(TRUE);
if (m_strFileName.IsEmpty())
{
MsgBox(_T("File path does not exist!") + m_strFileName);
return;
}
if (!ReadTxtFile(m_strFileName))
{
MsgBox( _T("Error opening file! \
") + m_strFileName );
}
}

/
// Get each line of text
// Parameters: pch: points to unicode string, iSize: number of strings
//
bool CPrj_ReadUnicodeDlg::GetAllLineTxt(CStringArray & strUnicodeArray, const TCHAR *pch, int iSize)
{

CString strLine;

// Separate each line of text
for (int i = 1; i < iSize; i + + ) // Skip FFFE, let i=1
{
// Get a line 0d 0a is unocide newline mark
while ( (i < iSize) & amp; & amp; (pch[i] != 0x000d) & amp; & amp; (pch[i] != 0x000a))
{
strLine + = pch[i];
i + + ;
}
strLine.TrimLeft();
strLine.TrimRight();
strLine.MakeLower();

if (strLine.GetLength() >= 1)
{
strUnicodeArray.Add(strLine);
}
strLine.Empty();
}
return TRUE;
}

/
// Get the document at the starting position
// Implementation: split by spaces
// Parameters:
//
bool CPrj_ReadUnicodeDlg::GetAllStrUnicode(CStringArray & strUnicodeArray)
{
CString str;
CString strTemp;
for (int i = 0; i < strUnicodeArray.GetSize(); i + + )
{
str = strUnicodeArray.GetAt(i);
int pos = str.Find(' ');

if (pos != -1)
strTemp = str.Left(pos);
else
strTemp = str;

strUnicodeArray.SetAt(i, strTemp); // Modify
TRACE(_T("Content: %s"), strTemp);
}
return TRUE;
}

/
// Convert CStringArray to CWordArray
//Convert string to unicode encoding
// accomplish:
// Parameters:
//
bool CPrj_ReadUnicodeDlg::ConvertAllUnicode(CWordArray &unicodeArray, CStringArray &strUnicodeArray)
{

CString str;
WORD wUnicode;
CLyjClass myClass;

for (int i = 0; i < strUnicodeArray.GetSize(); i + + )
{
str = strUnicodeArray.GetAt(i);
if (myClass.StringToUnicode(str, wUnicode))
{
unicodeArray.Add(wUnicode);
}
}

return TRUE;
}

// Fragment taken from inside a function body (the enclosing definition is
// not shown): reads the text file at m_File_Path line by line.
CStdioFile file;
    // Leave the enclosing function if the file cannot be opened for reading.
    if (!file.Open(m_File_Path, CFile::modeRead)) return;
    CString strLine;
    // The loop runs once per line and ends when ReadString() returns FALSE.
    while (file.ReadString(strLine))
    {
       // Process strLine here.
    }
Question:
    CStdioFile reads ANSI text data without problems in an _MBCS build, but in a UNICODE build Chinese characters read from ANSI text are displayed as garbled characters.
Reason:
CStdioFile reads ANSI text data as char values. In an _MBCS build these can be stored directly in a CString. In a UNICODE build each char must first be converted to a wide character (WCHAR) before it is stored in the CString; as a result, the two chars that make up one Chinese character become two separate UNICODE characters (WCHAR).
Solution:
    In a UNICODE build, the data returned by file.ReadString(strLine) is really char data that has been stored in a UNICODE string. To recover the real text, strLine must be processed as follows.
// Repairs a string whose characters each hold one byte of ANSI text (as
// described above for CStdioFile::ReadString in a UNICODE build): the bytes
// are collected into a char buffer and converted with CharToUnicode, and the
// result replaces the content of str.
void function(CString &str)
{
    const int nLen = str.GetLength();

    // One extra element for the terminating NUL that CharToUnicode relies on.
    char *szBuf = new char[nLen + 1];
    for (int i = 0; i < nLen; i++)
    {
        // Only the low byte of each character carries data, so the
        // narrowing conversion is intentional.
        szBuf[i] = static_cast<char>(str.GetAt(i));
    }
    szBuf[nLen] = '\0';

    CharToUnicode(szBuf, &str);
    delete [] szBuf;
}
    Note: compiling this function produces the following warning:
                warning C4244: '=' : conversion from 'unsigned short' to 'char', possible loss of data
            The warning can be ignored: the data that is lost is data we do not need.
================================================== =================================
/

// Convert Char characters to Unicode characters
int CharToUnicode(char *pchIn, CString *pstrOut)
{
    int nLen;
    WCHAR *ptch;
    if(pchIn == NULL)
    {
        return 0;
    }
    nLen = MultiByteToWideChar(CP_ACP, 0, pchIn, -1, NULL, 0);
    ptch = new WCHAR[nLen];
    MultiByteToWideChar(CP_ACP, 0, pchIn, -1, ptch, nLen);
    pstrOut->Format(_T("%s"), ptch);
    delete [] ptch;
    return nLen;
}

/

//Convert Unicode characters to Char characters
int UnicodeToChar(CString & strIn, char *pchOut, int nCharLen)
{
    if(pchOut == NULL)
    {
        return 0;
    }
    int nLen = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)strIn.GetBuffer(BUFFER_SIZE_KILO),-1, NULL, 0, NULL, NULL);
    nLen = min(nLen, nCharLen);
    WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)strIn.GetBuffer(BUFFER_SIZE_KILO), -1, pchOut,nLen, NULL, NULL);
    if(nLen < nCharLen)
    {
        pchOut[nLen] = 0;
    }
    return nLen;
}