Self-use-Python-Components/StringManage.py at main · Luis-530/Self-use-Python-Components · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import sys, os, warnings, hashlib
from typing import Literal, Tuple


# region Compute the printed (display) width of characters
from Const import WIDE_EASTASIAN, ZERO_WIDTH, ZERO_WIDTH_CF, UNICODE_VERSIONS
def cut_and_pad_string(string:str, length:int, align:str='left', pad_char:str=' ') -> str:
    """
    Cut or pad ``string`` so that its display width fits ``length`` columns.

    Widths are measured in screen columns with ``calc_string_width`` /
    ``calc_char_width``, not in number of characters.

    :arg str string: Text to fit.
    :arg int length: Target width in screen columns.
    :arg str align: ``'left'`` or ``'right'``.  With ``'left'`` characters are
        dropped from, and padding is added at, the *start* of the string;
        with ``'right'`` the *end* of the string is trimmed or padded.
    :arg str pad_char: Text repeated to fill missing columns.
    :rtype: str
    :returns: ``string`` unchanged when it already has the requested width.
        When trimming, whole characters are removed until the width is no
        larger than ``length`` (the result can be narrower than ``length``
        if a multi-column character had to be removed).  When padding,
        ``pad_char`` is repeated until the width is at least ``length``.
    :raises ValueError: if trimming or padding is required and ``align`` is
        not ``'left'``/``'right'``, or if padding is required and
        ``pad_char`` has no display width.  The loop-based predecessor of
        this code never terminated in those situations.
    """
    width = calc_string_width(string)
    if width == length:
        return string

    if align not in ('left', 'right'):
        raise ValueError(f"align must be 'left' or 'right', got {align!r}")
    from_start = align == 'left'

    if width > length:
        # Walk from the side being trimmed, subtracting each character's
        # width, instead of re-measuring the whole string after every cut.
        dropped = 0
        for char in (string if from_start else reversed(string)):
            if width <= length:
                break
            width -= calc_char_width(char)
            dropped += 1
        # Running out of characters (e.g. negative ``length``) ends the loop
        # with an empty result rather than spinning forever.
        if from_start:
            return string[dropped:]
        return string[:len(string) - dropped]

    pad_width = calc_string_width(pad_char)
    if pad_width <= 0:
        raise ValueError(
            f"pad_char must have a positive display width, got {pad_char!r}")
    # Ceiling division: the smallest repeat count that reaches ``length``.
    repeats = -(-(length - width) // pad_width)
    padding = pad_char * repeats
    return padding + string if from_start else string + padding

def calc_string_width(text:str|bytes, start_offs:int=0, end_offs:int=None) -> int:
    """
    Return the screen column width of text between start_offs and end_offs.

    text may be unicode or a byte string in the target _byte_encoding

    Some characters are wide (take two columns) and others affect the
    previous character (take zero columns).  Use the widths table above
    to calculate the screen column width of text[start_offs:end_offs]
    """
    _byte_encoding: Literal["utf8", "narrow", "wide"] = "narrow"
    if end_offs is None:
        end_offs = len(text)
    if start_offs > end_offs:
        raise ValueError((start_offs, end_offs))
    if isinstance(text, str):
        return sum(calc_char_width(char) for char in text[start_offs:end_offs])
    if _byte_encoding == "utf8":
        try:
            return sum(calc_char_width(char) for char in text[start_offs:end_offs].decode("utf-8"))
        except UnicodeDecodeError as exc:
            warnings.warn(
                "`calc_width` with text encoded to bytes can produce incorrect results"
                f"due to possible offset in the middle of character: {exc}",
                UnicodeWarning,
                stacklevel=2,
            )
        i = start_offs
        sc = 0
        while i < end_offs:
            o, i = _decode_one(text, i)
            w = calc_char_width(chr(o))
            sc += w
        return sc
    return end_offs - start_offs

def calc_char_width(char:str) -> Literal[0, 1, 2]:
    """
    Return the number of screen columns used by a single character.

    The value comes from ``_wcwidth``; a negative result from it is reported
    as a width of 0.
    """
    return max(_wcwidth(char), 0)

def _decode_one(text: bytes | str, pos: int) -> Tuple[int, int]:
    """
    Return (ordinal at pos, next position) for UTF-8 encoded text.
    """
    lt = len(text) - pos
    b2 = 0
    b3 = 0
    b4 = 0
    try:
        if isinstance(text, str):
            b1 = ord(text[pos])
            if lt > 1:
                b2 = ord(text[pos + 1])
            if lt > 2:
                b3 = ord(text[pos + 2])
            if lt > 3:
                b4 = ord(text[pos + 3])
        else:
            b1 = text[pos]
            if lt > 1:
                b2 = text[pos + 1]
            if lt > 2:
                b3 = text[pos + 2]
            if lt > 3:
                b4 = text[pos + 3]
    except Exception as e:
        raise ValueError(f"{e}: text={text!r}, pos={pos!r}, lt={lt!r}").with_traceback(e.__traceback__) from e
    if not b1 & 0x80:
        return b1, pos + 1
    error = ord("?"), pos + 1
    if lt < 2:
        return error
    if b1 & 0xE0 == 0xC0:
        if b2 & 0xC0 != 0x80:
            return error
        o = ((b1 & 0x1F) << 6) | (b2 & 0x3F)
        if o < 0x80:
            return error
        return o, pos + 2
    if lt < 3:
        return error
    if b1 & 0xF0 == 0xE0:
        if b2 & 0xC0 != 0x80:
            return error
        if b3 & 0xC0 != 0x80:
            return error
        o = ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
        if o < 0x800:
            return error
        return o, pos + 3
    if lt < 4:
        return error
    if b1 & 0xF8 == 0xF0:
        if b2 & 0xC0 != 0x80:
            return error
        if b3 & 0xC0 != 0x80:
            return error
        if b4 & 0xC0 != 0x80:
            return error
        o = ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F)
        if o < 0x10000:
            return error
        return o, pos + 4
    return error

def _wcwidth(wc, unicode_version='auto'):
    """
    Return the column width of the single character ``wc``.

    :arg str wc: A single character.
    :arg str unicode_version: Version selector passed to ``_wcmatch_version``
        to choose which ``ZERO_WIDTH`` / ``WIDE_EASTASIAN`` table is used.
    :rtype: int
    :returns: 0 for code points in ``ZERO_WIDTH_CF`` or in the zero-width
        table, -1 for code points below 32 or in the range 0x7F-0x9F,
        2 for code points in the wide table, otherwise 1.
    """
    codepoint = ord(wc)

    # This membership test deliberately comes before the control-range check.
    if codepoint in ZERO_WIDTH_CF:
        return 0

    below_space = codepoint < 32
    in_7f_to_9f = 0x07F <= codepoint < 0x0A0
    if below_space or in_7f_to_9f:
        return -1

    version_key = _wcmatch_version(unicode_version)
    if _bisearch(codepoint, ZERO_WIDTH[version_key]):
        return 0
    if _bisearch(codepoint, WIDE_EASTASIAN[version_key]):
        return 2
    return 1

def _bisearch(ucs, table):
    """
    Auxiliary function for binary search in interval table.

    :arg int ucs: Ordinal value of unicode character.
    :arg list table: List of starting and ending ranges of ordinal values,
        in form of ``[(start, end), ...]``.
    :rtype: int
    :returns: 1 if ordinal value ucs is found within lookup table, else 0.
    """
    lbound = 0
    ubound = len(table) - 1
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return 0
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            return 1
    return 0

def _wcversion_value(ver_string):
    retval = tuple(map(int, (ver_string.split('.'))))
    return retval

def _wcmatch_version(given_version):
    """
    Resolve ``given_version`` to one of the entries of ``UNICODE_VERSIONS``.

    Resolution order:

    - ``'auto'`` is replaced by the ``UNICODE_VERSION`` environment variable,
      or ``'latest'`` when that variable is unset.
    - ``'latest'`` selects the last entry of ``UNICODE_VERSIONS``.
    - A value that is itself an entry of ``UNICODE_VERSIONS`` is returned
      as-is.
    - A value that cannot be parsed by ``_wcversion_value`` triggers a
      warning and selects the last entry.
    - A value not greater than the first entry triggers a warning and
      selects the first entry.
    - Otherwise the entries are scanned pairwise and the closest match is
      returned (see the loop below).

    NOTE(review): the scan presumes ``UNICODE_VERSIONS`` is ordered from
    oldest to newest -- confirm against ``Const``.

    The Python 2 byte-string handling that used to live here was removed: it
    could only be active on interpreters older than Python 3, which cannot
    load this module.

    :arg str given_version: Requested version, ``'auto'`` or ``'latest'``.
    :returns: The selected entry of ``UNICODE_VERSIONS``.
    """
    unicode_versions = UNICODE_VERSIONS
    latest_version = unicode_versions[-1]

    if given_version == 'auto':
        given_version = os.environ.get('UNICODE_VERSION', 'latest')

    if given_version == 'latest':
        return latest_version

    if given_version in unicode_versions:
        return given_version

    try:
        cmp_given = _wcversion_value(given_version)
    except ValueError:
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. "
                      "Value should be in form of `integer[.]+', the latest "
                      "supported unicode version {latest_version!r} has been "
                      "inferred.".format(given_version=given_version,
                                         latest_version=latest_version))
        return latest_version

    earliest_version = unicode_versions[0]
    cmp_earliest_version = _wcversion_value(earliest_version)
    if cmp_given <= cmp_earliest_version:
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is lower "
                      "than any available unicode version. Returning lowest "
                      "version level, {earliest_version!r}".format(
                          given_version=given_version,
                          earliest_version=earliest_version))
        return earliest_version

    for idx, unicode_version in enumerate(unicode_versions):
        try:
            cmp_next_version = _wcversion_value(unicode_versions[idx + 1])
        except IndexError:
            # Ran past the last entry: the request is newer than everything.
            return latest_version
        # A shorter request such as '9' matches the next entry when that
        # entry starts with the same components.
        if cmp_given == cmp_next_version[:len(cmp_given)]:
            return unicode_versions[idx + 1]
        # The next entry is already newer than the request: stay on this one.
        if cmp_next_version > cmp_given:
            return unicode_version
    assert False, ("Code path unreachable", given_version, unicode_versions)
# endregion

def getmd5(s):
    """
    Return the MD5 digest of ``s`` as an upper-case hexadecimal string.

    :arg str s: Text to hash; it is encoded as UTF-8 before hashing.
    :rtype: str
    """
    payload = s.encode(encoding="utf-8")
    return hashlib.md5(payload).hexdigest().upper()


if __name__ == '__main__':
    # Small demo: print the column width of the first 13 positions of an
    # ASCII sample and of a sample containing East Asian characters.
    for sample in ("Hello, world!", "こんにちは、世界！aaa"):
        print(calc_string_width(sample, 0, 13))