// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include #include #include #include #include #include #include #include // with caution: #include "unicode/utf8.h" static const char kSPACE = 0x20, kTAB = 0x09, kLF = 0x0A, kCR = 0x0D; // kHASH = 0x23, // kSLASH = 0x2f, // kSTAR = 0x2A, # include "cptbl.h" # define cp1047_to_8859(c) cp1047_8859_1[c] std::string prog; void usage() { fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); } int cleanup(const std::string &outfile) { const char *outstr = outfile.c_str(); if(outstr && *outstr) { int rc = unlink(outstr); if(rc == 0) { fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); return 0; } else { if( errno == ENOENT ) { return 0; // File did not exist - no error. } else { perror("unlink"); return 1; } } } return 0; } // inline bool hasNonAscii(const char *line, size_t len) { // const unsigned char *uline = reinterpret_cast(line); // for(size_t i=0;i 0x7F) { // return true; // } // } // return false; // } inline const char *skipws(const char *p, const char *e) { for(;p0; pos2++,trail--) { linestr[pos2] = cp1047_to_8859(linestr[pos2]); if(linestr[pos2] == 0x0A) { linestr[pos2] = 0x85; // NL is ambiguous here } } #endif // Proceed to decode utf-8 const uint8_t *s = (const uint8_t*) (linestr.c_str()); int32_t length = linestr.size(); UChar32 c; if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) linestr[pos] = old_byte; // put it back #endif continue; // single code point not previously legal for \u escaping } // otherwise, convert it to \u / \U { U8_NEXT(s, i, length, c); } if(c<0) { fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); return true; } size_t seqLen = (i-pos); //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); char newSeq[20]; if( c <= 0xFFFF) { sprintf(newSeq, "\\u%04X", c); } else { sprintf(newSeq, "\\U%08X", c); } linestr.replace(pos, seqLen, newSeq); pos += strlen(newSeq) - 1; } } return false; } /** * false = no err * true = had err */ bool fixLine(int /*no*/, std::string &linestr) { const char *line = linestr.c_str(); size_t len = linestr.size(); // no u' in the line? if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { return false; // Nothing to do. No u' or u" detected } // lines such as u8"\u0308" are all ASCII. // // Quick Check: all ascii? // if(!hasNonAscii(line, len)) { // return false; // ASCII // } // // comment or empty line? // if(isCommentOrEmpty(line, len)) { // return false; // Comment or just empty // } // start from the end and find all u" cases size_t pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { //printf("found doublequote at %d\n", pos); if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } // reset and find all u' cases pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { //printf("found singlequote at %d\n", pos); if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } // reset and find all u8" cases pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } //fprintf(stderr, "%d - fixed\n", no); return false; } int convert(const std::string &infile, const std::string &outfile) { fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); std::ifstream inf; inf.open(infile.c_str(), std::ios::in); if(!inf.is_open()) { fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); cleanup(outfile); return 1; } std::ofstream outf; outf.open(outfile.c_str(), std::ios::out); if(!outf.is_open()) { fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); return 1; } // TODO: any platform variations of #line? outf << "#line 1 \"" << infile << "\"" << '\n'; int no = 0; std::string linestr; while( getline( inf, linestr)) { no++; if(fixLine(no, linestr)) { outf.close(); fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); cleanup(outfile); return 1; } outf << linestr << '\n'; } return 0; } int main(int argc, const char *argv[]) { prog = argv[0]; if(argc != 3) { usage(); return 1; } std::string infile = argv[1]; std::string outfile = argv[2]; return convert(infile, outfile); } #include "utf_impl.cpp"