Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

examples: Fix the encoding issues on Windows #1313

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e428dd7
Fix the encoding issues on Windows
bobqianic Sep 20, 2023
244846c
Fix the encoding issues on Windows
bobqianic Sep 20, 2023
a5c5dff
Fix the encoding issues on Windows
bobqianic Sep 20, 2023
1dc458c
Revert changes
bobqianic Sep 21, 2023
63f87ad
Revert changes
bobqianic Sep 21, 2023
a0530ee
Fix encoding issues on windows
bobqianic Sep 21, 2023
127b9b6
Fix encoding issues on windows
bobqianic Sep 21, 2023
a79ef03
Add console.h
bobqianic Sep 21, 2023
4ddca01
no known conversion error
bobqianic Sep 21, 2023
5c95e9f
fix static_cast not allowed
bobqianic Sep 21, 2023
8e98329
Add stream.cpp support
bobqianic Sep 22, 2023
6356051
Merge branch 'master' into master
bobqianic Sep 22, 2023
88c8976
Fix issue #399
bobqianic Sep 28, 2023
013d434
Fix issue #399
bobqianic Sep 28, 2023
1b94412
Merge branch 'master' into master
bobqianic Oct 3, 2023
c340b6b
move functions to common
bobqianic Oct 7, 2023
15c74d2
refactor some functions
bobqianic Oct 7, 2023
16bb889
fix bug triggered by `-ml`
bobqianic Oct 22, 2023
cecf59f
bug fix
bobqianic Oct 22, 2023
8d50a54
fix ci
bobqianic Oct 22, 2023
62e8022
fix ci
bobqianic Oct 22, 2023
a6763c5
fix ci
bobqianic Oct 22, 2023
23c9a1b
fix ci
bobqianic Oct 22, 2023
47edd08
bugfix
bobqianic Oct 22, 2023
78fe1ed
Merge branch 'master' into master
bobqianic Nov 1, 2023
85b8d1e
Merge branch 'master' into master
bobqianic Nov 7, 2023
39a240b
Merge branch 'ggerganov:master' into master
bobqianic Nov 7, 2023
4b3e480
Set default `temperature_inc` to 0.2f
bobqianic Nov 9, 2023
6ce9893
Revert changes
bobqianic Nov 9, 2023
8cdb9f6
Merge branch 'ggerganov:master' into master
bobqianic Nov 9, 2023
921d4dc
Merge branch 'ggerganov:master' into master
bobqianic Nov 17, 2023
7ecfb22
fix missing clang compiler in workflow
bobqianic Nov 24, 2023
d9145c7
revert change
bobqianic Nov 24, 2023
aa04db6
Merge branch 'ggerganov:master' into master
bobqianic Nov 24, 2023
3fcd234
Fix winsock2.h is included before Windows.h issue
bobqianic Nov 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,82 @@ std::wstring convert_to_wstring(const std::string & input) {
return converter.from_bytes(input);
}

// Split a byte string into the malformed UTF-8 fragments at its two edges and
// the text in between.
//
// result = {leading fragment, middle, trailing fragment}
//   - leading fragment : continuation bytes (10xxxxxx) at the very start, i.e.
//                        the tail of a character whose head byte is missing
//   - trailing fragment: the last head byte and everything after it, when the
//                        number of bytes present differs from the length that
//                        head byte announces (e.g. a truncated character)
//   - middle           : everything else; it is NOT validated, so malformed
//                        bytes in the interior stay in this part
//
// The three parts never overlap and always concatenate back to `a`.
//
// eg. (a = "\xBD\xA0" "123456",         result = {"\xBD\xA0", "123456", ""})
// eg. (a = "123456" "\xE4\xBD",         result = {"", "123456", "\xE4\xBD"})
// eg. (a = "\xA0" "123456" "\xE4",      result = {"\xA0", "123456", "\xE4"})
// eg. (a = "\x80\x80",                  result = {"\x80\x80", "", ""})
std::vector<std::string> utf8_split(const std::string & a) {
    if (a.empty()) {return {"", "", ""};}

    const std::string::size_type total = a.length();

    // forward pass: count the continuation bytes at the start; stop at the
    // first byte of any other kind so the count is also the prefix length
    std::string::size_type lead = 0;
    while (lead < total && (static_cast<uint8_t>(a[lead]) & 0xC0) == 0x80) {
        lead++;
    }

    // nothing but continuation bytes: the whole input is a leading fragment
    if (lead == total) {
        return {a, "", ""};
    }

    // backward pass: walk back to the last head byte, never into the leading
    // fragment, counting every byte visited
    std::string::size_type seen   = 0; // bytes in the last (possibly partial) character
    std::string::size_type expect = 0; // length announced by its head byte, 0 if none found
    for (std::string::size_type i = total; i-- > lead; ) {
        const uint8_t value = static_cast<uint8_t>(a[i]);
        seen++;
        if (value <= 0x7F) {
            // 1 byte head
            expect = 1;
            break;
        } else if (value >= 0xC0 && value <= 0xDF) {
            // 2 bytes head
            expect = 2;
            break;
        } else if (value >= 0xE0 && value <= 0xEF) {
            // 3 bytes head
            expect = 3;
            break;
        } else if (value >= 0xF0 && value <= 0xF7) {
            // 4 bytes head
            expect = 4;
            break;
        }
        // continuation byte, or 0xF8..0xFF which is never valid: keep scanning
    }

    // a length mismatch means the last character is truncated or malformed
    const std::string::size_type tail = (expect == seen) ? 0 : seen;

    return {
        a.substr(0, lead),
        a.substr(lead, total - lead - tail),
        a.substr(total - tail),
    };
}

// Report whether both edges of the string are clean according to utf8_split:
// true when it finds neither a leading nor a trailing fragment (an empty
// string counts as clean). Only the first and last parts of the split are
// inspected here.
bool utf8_is_valid(const std::string & a) {
    if (a.empty()) {
        return true;
    }

    const std::vector<std::string> parts = utf8_split(a);
    const bool clean_start = parts[0].empty();
    const bool clean_end   = parts[2].empty();
    return clean_start && clean_end;
}

void gpt_split_words(std::string str, std::vector<std::string>& words) {
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
const std::regex re(pattern);
Expand Down Expand Up @@ -639,10 +715,17 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
#if _WIN32
else if (drwav_init_file_w(&wav, ConvertUTF8toUTF16(fname).c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
}
#else
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
}
#endif

if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
Expand Down
43 changes: 43 additions & 0 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#pragma once

#include "console.h"

#include <string>
#include <map>
#include <vector>
Expand Down Expand Up @@ -77,6 +79,47 @@ std::string convert_to_utf8(const std::wstring & input);

std::wstring convert_to_wstring(const std::string & input);

std::vector<std::string> utf8_split(const std::string & a);

bool utf8_is_valid(const std::string & a);

// Holder for tokens that have been merged together, along with the
// bookkeeping that travels with the merged text.
struct utf8_token {
    std::string text;   // text of tokens
    float p_sum;        // token probability sum
    int token_c;        // total number of tokens in buffer
    int64_t t0;         // start time
    int64_t t1;         // end time
    bool start_of_seg;  // start of segment

    // Empty token: no text, every counter and timestamp zero, flag cleared.
    utf8_token() : utf8_token(std::string(), 0.0f, 0, 0, 0, false) {}

    // Token with every field supplied by the caller.
    utf8_token(const std::string& text, float p_sum, int token_c, int64_t t0, int64_t t1, bool start_of_seg)
        : text(text),
          p_sum(p_sum),
          token_c(token_c),
          t0(t0),
          t1(t1),
          start_of_seg(start_of_seg)
    {}

    // Put the token back into the same state the default constructor produces.
    void clear() {
        text.clear();
        p_sum        = 0.0f;
        token_c      = 0;
        t0           = 0;
        t1           = 0;
        start_of_seg = false;
    }
};

void gpt_split_words(std::string str, std::vector<std::string>& words);

// split text into tokens
Expand Down
99 changes: 99 additions & 0 deletions examples/console.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
//
// Created by bobqianic on 9/19/2023.
//

#ifndef CONSOLE_H
#define CONSOLE_H

#include <string>
#if _WIN32
#define NOMINMAX
#define _WINSOCKAPI_
#include <windows.h>
#include <io.h>
#include <fcntl.h>
#endif

// ustring: the string type selected per platform
#if _WIN32
// Windows: std::wstring
using ustring = std::wstring;
#else
// every other platform: std::string
using ustring = std::string;
#endif

#if _WIN32
// Convert UTF-8 to UTF-16
// Windows only
//
// Returns an empty string when the input is empty or when either
// MultiByteToWideChar call reports failure (returns 0).
inline std::wstring ConvertUTF8toUTF16(const std::string& utf8Str) {
    if (utf8Str.empty()) {
        return std::wstring();
    }

    const char * const src = utf8Str.c_str();

    // Sizing call: with no output buffer the API only reports how many
    // wchar_t units are needed. The input length is given as -1, so the
    // input is treated as NUL-terminated and the count includes the terminator.
    const int unitsWithNul = MultiByteToWideChar(CP_UTF8, 0, src, -1, nullptr, 0);
    if (unitsWithNul == 0) {
        return std::wstring();
    }

    // Conversion call into a buffer of exactly the reported size.
    std::wstring wide(unitsWithNul, L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, src, -1, &wide[0], unitsWithNul);
    if (written == 0) {
        return std::wstring();
    }

    // Drop the terminator that was converted along with the text.
    wide.resize(unitsWithNul - 1);
    return wide;
}
#endif

#if _WIN32
// Convert UTF-16 to UTF-8
// Windows only
//
// Returns an empty string when the input is empty or when either
// WideCharToMultiByte call reports failure (returns 0).
inline std::string ConvertUTF16toUTF8(const std::wstring & utf16Str) {
    if (utf16Str.empty()) {
        return std::string();
    }

    const wchar_t * const src = utf16Str.c_str();

    // Sizing call: with no output buffer the API only reports how many bytes
    // are needed. The input length is given as -1, so the input is treated as
    // NUL-terminated and the count includes the terminator.
    const int bytesWithNul = WideCharToMultiByte(CP_UTF8, 0, src, -1, nullptr, 0, nullptr, nullptr);
    if (bytesWithNul == 0) {
        return std::string();
    }

    // Conversion call into a buffer of exactly the reported size.
    std::string narrow(bytesWithNul, '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, src, -1, &narrow[0], bytesWithNul, nullptr, nullptr);
    if (written == 0) {
        return std::string();
    }

    // Drop the terminator that was converted along with the text.
    narrow.resize(bytesWithNul - 1);
    return narrow;
}
#endif

// initialize the console
// set output encoding
//
// On Windows: switches the console output code page to UTF-8 and turns on
// ENABLE_VIRTUAL_TERMINAL_PROCESSING for the standard output handle.
// On other platforms: does nothing.
//
// Returns true on success and false when the output handle cannot be obtained
// or its console mode cannot be read or updated. No other API is called after
// the failing one, so GetLastError() should still describe the failure.
// NOTE(review): GetConsoleMode is expected to fail when stdout is not a real
// console (e.g. redirected to a file), so callers will probably want to treat
// a false result as non-fatal; confirm against the call sites.
inline bool init_console() {
#if _WIN32
    // set output encoding to UTF-8 (best effort: the result is not checked)
    SetConsoleOutputCP(CP_UTF8);

    HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
    if (hOut == INVALID_HANDLE_VALUE) {
        // a DWORD error code must not be returned here: any non-zero code
        // would convert to `true` and read as success
        return false;
    }

    DWORD dwMode = 0;
    if (!GetConsoleMode(hOut, &dwMode)) {
        return false;
    }

    dwMode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING;
    if (!SetConsoleMode(hOut, dwMode)) {
        return false;
    }
#endif
    return true;
}

#endif //CONSOLE_H
Loading
Loading