00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018
00019 #define BEG(no) regs->beg[no]
00020 #define END(no) regs->end[no]
00021
00022 #include <math.h>
00023 #include <ctype.h>
00024
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050
/* NOTE(review): presumably the maximum byte length of one encoded character;
 * it is not referenced in the visible part of this file -- confirm. */
00051 #define RUBY_MAX_CHAR_LEN 16
/* Flag tested by str_modifiable(): a string carrying it cannot be modified
 * ("temporarily locked"). */
00052 #define STR_TMPLOCK FL_USER7
/* Flag set when the string is NOT stored in the embedded layout
 * (see STR_EMBED_P / STR_SET_NOEMBED). */
00053 #define STR_NOEMBED FL_USER1
/* NOTE(review): the predicates below test ELTS_SHARED rather than this name;
 * presumably both denote the same flag bit -- confirm against ruby.h. */
00054 #define STR_SHARED FL_USER2
/* Flag marking a string whose as.heap.aux.shared slot holds an associated
 * object (set by rb_str_associate()). */
00055 #define STR_ASSOC FL_USER3
/* True when the string is non-embedded AND has ELTS_SHARED set. */
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
/* True when the string is non-embedded AND has STR_ASSOC set. */
00057 #define STR_ASSOC_P(s) FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* Flag combination under which as.heap.aux is used as `shared`, not `capa`. */
00058 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True when the string is non-embedded and is either shared or associated;
 * rb_str_capacity() reports as.heap.len for such strings. */
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clears ELTS_SHARED and STR_ASSOC, but only if the string is non-embedded. */
00060 #define STR_UNSET_NOCAPA(s) do {\
00061 if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063
00064
/* Marks the string as non-embedded and zeroes the embedded-length bits kept
 * in the object's flags. */
00065 #define STR_SET_NOEMBED(str) do {\
00066 FL_SET(str, STR_NOEMBED);\
00067 STR_SET_EMBED_LEN(str, 0);\
00068 } while (0)
/* Marks the string as embedded by clearing STR_NOEMBED (other flags are
 * left untouched). */
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when STR_NOEMBED is not set, i.e. the string uses the embedded layout. */
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Stores n as the embedded length inside the flags word: the old length bits
 * are masked out, then n is shifted into place.  n is evaluated once. */
00071 #define STR_SET_EMBED_LEN(str, n) do { \
00072 long tmp_n = (n);\
00073 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00074 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00075 } while (0)
00076
/* Sets the string length in whichever place the current layout keeps it:
 * the flags word for embedded strings, as.heap.len otherwise. */
00077 #define STR_SET_LEN(str, n) do { \
00078 if (STR_EMBED_P(str)) {\
00079 STR_SET_EMBED_LEN(str, n);\
00080 }\
00081 else {\
00082 RSTRING(str)->as.heap.len = (n);\
00083 }\
00084 } while (0)
00085
/* Decrements the string length by one, for either layout.  No check is made
 * here that the length is non-zero. */
00086 #define STR_DEC_LEN(str) do {\
00087 if (STR_EMBED_P(str)) {\
00088 long n = RSTRING_LEN(str);\
00089 n--;\
00090 STR_SET_EMBED_LEN(str, n);\
00091 }\
00092 else {\
00093 RSTRING(str)->as.heap.len--;\
00094 }\
00095 } while (0)
00096
/*
 * RESIZE_CAPA(str, capacity): make the buffer of str able to hold
 * `capacity` bytes plus a terminating byte.
 *
 *  - Embedded string: nothing happens unless capacity exceeds
 *    RSTRING_EMBED_LEN_MAX; in that case the contents are copied to a newly
 *    allocated heap buffer and the string is switched to the non-embedded
 *    layout with aux.capa = capacity.
 *  - Non-embedded string: the heap buffer is reallocated; aux.capa is only
 *    updated when aux is not in use as `shared` (see STR_NOCAPA_P).
 *
 * The macro argument is parenthesized everywhere it is used so that an
 * expression argument (e.g. `a ? b : c`) keeps its meaning.  Note that
 * `capacity` is still evaluated more than once, so it must be free of side
 * effects.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114
/* True when the string's code range (scanned on demand by
 * rb_enc_str_coderange()) is ENC_CODERANGE_7BIT. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
/* True when the string's code range is ENC_CODERANGE_BROKEN. */
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117
/* Looks up the rb_encoding for the encoding index stored on the string. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123 rb_encoding *enc;
00124
00125
00126 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127 return 1;
00128
00129 enc = STR_ENC_GET(str);
00130 if (rb_enc_mbmaxlen(enc) == 1)
00131 return 1;
00132
00133
00134
00135 return 0;
00136 }
00137
00138 VALUE rb_fs;
00139
/*
 * Scans the byte range [p, e) and returns a pointer to the first byte for
 * which ISASCII() is false, or NULL when there is none.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUEs, the
 * aligned middle part is tested one VALUE-sized word at a time against
 * NONASCII_MASK (the high bit of every byte); the exact byte is then found
 * by the trailing byte-wise loop.  NONASCII_MASK stays defined after this
 * function and is used further down in the file.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_bits = sizeof(VALUE) - 1;
        /* first word boundary at or after p */
        const VALUE *word = (const VALUE*)(~align_bits & ((VALUE)p + align_bits));
        const VALUE *word_end;

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* last word boundary at or before e */
        word_end = (const VALUE*)(~align_bits & (VALUE)e);
        /* skip whole words that contain no high bit */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* resume byte-wise at the offending word (or at the aligned tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00176
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180 const char *e = p + len;
00181
00182 if (rb_enc_to_index(enc) == 0) {
00183
00184 p = search_nonascii(p, e);
00185 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186 }
00187
00188 if (rb_enc_asciicompat(enc)) {
00189 p = search_nonascii(p, e);
00190 if (!p) {
00191 return ENC_CODERANGE_7BIT;
00192 }
00193 while (p < e) {
00194 int ret = rb_enc_precise_mbclen(p, e, enc);
00195 if (!MBCLEN_CHARFOUND_P(ret)) {
00196 return ENC_CODERANGE_BROKEN;
00197 }
00198 p += MBCLEN_CHARFOUND_LEN(ret);
00199 if (p < e) {
00200 p = search_nonascii(p, e);
00201 if (!p) {
00202 return ENC_CODERANGE_VALID;
00203 }
00204 }
00205 }
00206 if (e < p) {
00207 return ENC_CODERANGE_BROKEN;
00208 }
00209 return ENC_CODERANGE_VALID;
00210 }
00211
00212 while (p < e) {
00213 int ret = rb_enc_precise_mbclen(p, e, enc);
00214
00215 if (!MBCLEN_CHARFOUND_P(ret)) {
00216 return ENC_CODERANGE_BROKEN;
00217 }
00218 p += MBCLEN_CHARFOUND_LEN(ret);
00219 }
00220 if (e < p) {
00221 return ENC_CODERANGE_BROKEN;
00222 }
00223 return ENC_CODERANGE_VALID;
00224 }
00225
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229 const char *p = s;
00230
00231 if (*cr == ENC_CODERANGE_BROKEN)
00232 return e - s;
00233
00234 if (rb_enc_to_index(enc) == 0) {
00235
00236 p = search_nonascii(p, e);
00237 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238 return e - s;
00239 }
00240 else if (rb_enc_asciicompat(enc)) {
00241 p = search_nonascii(p, e);
00242 if (!p) {
00243 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244 return e - s;
00245 }
00246 while (p < e) {
00247 int ret = rb_enc_precise_mbclen(p, e, enc);
00248 if (!MBCLEN_CHARFOUND_P(ret)) {
00249 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250 return p - s;
00251 }
00252 p += MBCLEN_CHARFOUND_LEN(ret);
00253 if (p < e) {
00254 p = search_nonascii(p, e);
00255 if (!p) {
00256 *cr = ENC_CODERANGE_VALID;
00257 return e - s;
00258 }
00259 }
00260 }
00261 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262 return p - s;
00263 }
00264 else {
00265 while (p < e) {
00266 int ret = rb_enc_precise_mbclen(p, e, enc);
00267 if (!MBCLEN_CHARFOUND_P(ret)) {
00268 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269 return p - s;
00270 }
00271 p += MBCLEN_CHARFOUND_LEN(ret);
00272 }
00273 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274 return p - s;
00275 }
00276 }
00277
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281 rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287
00288
00289
00290 str_enc_copy(dest, src);
00291 switch (ENC_CODERANGE(src)) {
00292 case ENC_CODERANGE_7BIT:
00293 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294 break;
00295 case ENC_CODERANGE_VALID:
00296 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299 else
00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301 break;
00302 default:
00303 if (RSTRING_LEN(dest) == 0) {
00304 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306 else
00307 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308 }
00309 break;
00310 }
00311 }
00312
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316 str_enc_copy(dest, src);
00317 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323 int cr = ENC_CODERANGE(str);
00324
00325 if (cr == ENC_CODERANGE_UNKNOWN) {
00326 rb_encoding *enc = STR_ENC_GET(str);
00327 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328 ENC_CODERANGE_SET(str, cr);
00329 }
00330 return cr;
00331 }
00332
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336 rb_encoding *enc = STR_ENC_GET(str);
00337
00338 if (!rb_enc_asciicompat(enc))
00339 return FALSE;
00340 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341 return TRUE;
00342 return FALSE;
00343 }
00344
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349 rb_raise(rb_eRuntimeError, "string modified");
00350 }
00351 }
00352
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356 if (OBJ_FROZEN(s)) {
00357 rb_raise(rb_eRuntimeError, "string frozen");
00358 }
00359 }
00360
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364 if (STR_EMBED_P(str)) {
00365 return RSTRING_EMBED_LEN_MAX;
00366 }
00367 else if (STR_NOCAPA_P(str)) {
00368 return RSTRING(str)->as.heap.len;
00369 }
00370 else {
00371 return RSTRING(str)->as.heap.aux.capa;
00372 }
00373 }
00374
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378 NEWOBJ(str, struct RString);
00379 OBJSETUP(str, klass, T_STRING);
00380
00381 str->as.heap.ptr = 0;
00382 str->as.heap.len = 0;
00383 str->as.heap.aux.capa = 0;
00384
00385 return (VALUE)str;
00386 }
00387
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391 VALUE str;
00392
00393 if (len < 0) {
00394 rb_raise(rb_eArgError, "negative string size (or size too big)");
00395 }
00396
00397 str = str_alloc(klass);
00398 if (len > RSTRING_EMBED_LEN_MAX) {
00399 RSTRING(str)->as.heap.aux.capa = len;
00400 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401 STR_SET_NOEMBED(str);
00402 }
00403 else if (len == 0) {
00404 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405 }
00406 if (ptr) {
00407 memcpy(RSTRING_PTR(str), ptr, len);
00408 }
00409 STR_SET_LEN(str, len);
00410 RSTRING_PTR(str)[len] = '\0';
00411 return str;
00412 }
00413
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417 return str_new(rb_cString, ptr, len);
00418 }
00419
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423 VALUE str = rb_str_new(ptr, len);
00424 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425 return str;
00426 }
00427
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431 VALUE str = rb_str_new(ptr, len);
00432 rb_enc_associate(str, enc);
00433 return str;
00434 }
00435
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439 if (!ptr) {
00440 rb_raise(rb_eArgError, "NULL pointer given");
00441 }
00442 return rb_str_new(ptr, strlen(ptr));
00443 }
00444
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451 VALUE str = rb_str_new2(ptr);
00452 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453 return str;
00454 }
00455
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462 VALUE str = rb_str_new(ptr, len);
00463
00464 OBJ_TAINT(str);
00465 return str;
00466 }
00467
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471 VALUE str = rb_str_new2(ptr);
00472
00473 OBJ_TAINT(str);
00474 return str;
00475 }
00476
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483 rb_econv_t *ec;
00484 rb_econv_result_t ret;
00485 long len;
00486 VALUE newstr;
00487 const unsigned char *sp;
00488 unsigned char *dp;
00489
00490 if (!to) return str;
00491 if (from == to) return str;
00492 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493 to == rb_ascii8bit_encoding()) {
00494 if (STR_ENC_GET(str) != to) {
00495 str = rb_str_dup(str);
00496 rb_enc_associate(str, to);
00497 }
00498 return str;
00499 }
00500
00501 len = RSTRING_LEN(str);
00502 newstr = rb_str_new(0, len);
00503
00504 retry:
00505 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506 if (!ec) return str;
00507
00508 sp = (unsigned char*)RSTRING_PTR(str);
00509 dp = (unsigned char*)RSTRING_PTR(newstr);
00510 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512 rb_econv_close(ec);
00513 switch (ret) {
00514 case econv_destination_buffer_full:
00515
00516 len = len < 2 ? 2 : len * 2;
00517 rb_str_resize(newstr, len);
00518 goto retry;
00519
00520 case econv_finished:
00521 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522 rb_str_set_len(newstr, len);
00523 rb_enc_associate(newstr, to);
00524 return newstr;
00525
00526 default:
00527
00528 return str;
00529 }
00530 }
00531
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541 VALUE str;
00542
00543 str = rb_tainted_str_new(ptr, len);
00544 if (eenc == rb_usascii_encoding() &&
00545 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546 rb_enc_associate(str, rb_ascii8bit_encoding());
00547 return str;
00548 }
00549 rb_enc_associate(str, eenc);
00550 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611 STR_SET_EMBED(str2);
00612 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614 }
00615 else {
00616 str = rb_str_new_frozen(str);
00617 FL_SET(str2, STR_NOEMBED);
00618 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620 RSTRING(str2)->as.heap.aux.shared = str;
00621 FL_SET(str2, ELTS_SHARED);
00622 }
00623 rb_enc_cr_str_exact_copy(str2, str);
00624
00625 return str2;
00626 }
00627
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631 return str_replace_shared(str_alloc(klass), str);
00632 }
00633
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637 return str_new_shared(klass, str);
00638 }
00639
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643 VALUE str2 = str_new3(rb_obj_class(str), str);
00644
00645 OBJ_INFECT(str2, str);
00646 return str2;
00647 }
00648
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655 VALUE str2;
00656
00657 str2 = str_alloc(klass);
00658 STR_SET_NOEMBED(str2);
00659 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661 if (STR_SHARED_P(str)) {
00662 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663 assert(OBJ_FROZEN(shared));
00664 FL_SET(str2, ELTS_SHARED);
00665 RSTRING(str2)->as.heap.aux.shared = shared;
00666 }
00667 else {
00668 FL_SET(str, ELTS_SHARED);
00669 RSTRING(str)->as.heap.aux.shared = str2;
00670 }
00671 rb_enc_cr_str_exact_copy(str2, str);
00672 OBJ_INFECT(str2, str);
00673 return str2;
00674 }
00675
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679 VALUE klass, str;
00680
00681 if (OBJ_FROZEN(orig)) return orig;
00682 klass = rb_obj_class(orig);
00683 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684 long ofs;
00685 assert(OBJ_FROZEN(str));
00686 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689 ENCODING_GET(str) != ENCODING_GET(orig)) {
00690 str = str_new3(klass, str);
00691 RSTRING(str)->as.heap.ptr += ofs;
00692 RSTRING(str)->as.heap.len -= ofs;
00693 rb_enc_cr_str_exact_copy(str, orig);
00694 OBJ_INFECT(str, orig);
00695 }
00696 }
00697 else if (STR_EMBED_P(orig)) {
00698 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699 rb_enc_cr_str_exact_copy(str, orig);
00700 OBJ_INFECT(str, orig);
00701 }
00702 else if (STR_ASSOC_P(orig)) {
00703 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704 FL_UNSET(orig, STR_ASSOC);
00705 str = str_new4(klass, orig);
00706 FL_SET(str, STR_ASSOC);
00707 RSTRING(str)->as.heap.aux.shared = assoc;
00708 }
00709 else {
00710 str = str_new4(klass, orig);
00711 }
00712 OBJ_FREEZE(str);
00713 return str;
00714 }
00715
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722 return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726 rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732 VALUE v = rb_str_new5(str, 0, 0);
00733 OBJ_INFECT(v, str);
00734 return v;
00735 }
00736
00737 #define STR_BUF_MIN_SIZE 128
00738
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742 VALUE str = str_alloc(rb_cString);
00743
00744 if (capa < STR_BUF_MIN_SIZE) {
00745 capa = STR_BUF_MIN_SIZE;
00746 }
00747 FL_SET(str, STR_NOEMBED);
00748 RSTRING(str)->as.heap.aux.capa = capa;
00749 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750 RSTRING(str)->as.heap.ptr[0] = '\0';
00751
00752 return str;
00753 }
00754
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758 VALUE str;
00759 long len = strlen(ptr);
00760
00761 str = rb_str_buf_new(len);
00762 rb_str_buf_cat(str, ptr, len);
00763
00764 return str;
00765 }
00766
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773 return str_new(0, 0, len);
00774 }
00775
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780 xfree(RSTRING(str)->as.heap.ptr);
00781 }
00782 }
00783
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788 return RSTRING(str)->as.heap.aux.capa;
00789 }
00790 else {
00791 return 0;
00792 }
00793 }
00794
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798 return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800
00801 static inline void str_discard(VALUE str);
00802
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806 rb_encoding *enc;
00807 int cr;
00808 if (str == str2) return;
00809 enc = STR_ENC_GET(str2);
00810 cr = ENC_CODERANGE(str2);
00811 str_discard(str);
00812 OBJ_INFECT(str, str2);
00813 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814 STR_SET_EMBED(str);
00815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817 rb_enc_associate(str, enc);
00818 ENC_CODERANGE_SET(str, cr);
00819 return;
00820 }
00821 STR_SET_NOEMBED(str);
00822 STR_UNSET_NOCAPA(str);
00823 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825 if (STR_NOCAPA_P(str2)) {
00826 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828 }
00829 else {
00830 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831 }
00832 STR_SET_EMBED(str2);
00833 RSTRING_PTR(str2)[0] = 0;
00834 STR_SET_EMBED_LEN(str2, 0);
00835 rb_enc_associate(str, enc);
00836 ENC_CODERANGE_SET(str, cr);
00837 }
00838
00839 static ID id_to_s;
00840
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844 VALUE str;
00845
00846 if (TYPE(obj) == T_STRING) {
00847 return obj;
00848 }
00849 str = rb_funcall(obj, id_to_s, 0);
00850 if (TYPE(str) != T_STRING)
00851 return rb_any_to_s(obj);
00852 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853 return str;
00854 }
00855
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859 long len;
00860
00861 len = RSTRING_LEN(str2);
00862 if (STR_ASSOC_P(str2)) {
00863 str2 = rb_str_new4(str2);
00864 }
00865 if (STR_SHARED_P(str2)) {
00866 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867 assert(OBJ_FROZEN(shared));
00868 STR_SET_NOEMBED(str);
00869 RSTRING(str)->as.heap.len = len;
00870 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871 FL_SET(str, ELTS_SHARED);
00872 FL_UNSET(str, STR_ASSOC);
00873 RSTRING(str)->as.heap.aux.shared = shared;
00874 }
00875 else {
00876 str_replace_shared(str, str2);
00877 }
00878
00879 OBJ_INFECT(str, str2);
00880 rb_enc_cr_str_exact_copy(str, str2);
00881 return str;
00882 }
00883
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887 VALUE dup = str_alloc(klass);
00888 str_replace(dup, str);
00889 return dup;
00890 }
00891
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895 return str_duplicate(rb_obj_class(str), str);
00896 }
00897
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901 return str_replace(str_alloc(rb_cString), str);
00902 }
00903
00904
00905
00906
00907
00908
00909
00910
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914 VALUE orig;
00915
00916 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917 rb_str_replace(str, orig);
00918 return str;
00919 }
00920
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924 long c;
00925 const char *q;
00926
00927 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929 }
00930 else if (rb_enc_asciicompat(enc)) {
00931 c = 0;
00932 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933 while (p < e) {
00934 if (ISASCII(*p)) {
00935 q = search_nonascii(p, e);
00936 if (!q)
00937 return c + (e - p);
00938 c += q - p;
00939 p = q;
00940 }
00941 p += rb_enc_fast_mbclen(p, e, enc);
00942 c++;
00943 }
00944 }
00945 else {
00946 while (p < e) {
00947 if (ISASCII(*p)) {
00948 q = search_nonascii(p, e);
00949 if (!q)
00950 return c + (e - p);
00951 c += q - p;
00952 p = q;
00953 }
00954 p += rb_enc_mbclen(p, e, enc);
00955 c++;
00956 }
00957 }
00958 return c;
00959 }
00960
00961 for (c=0; p<e; c++) {
00962 p += rb_enc_mbclen(p, e, enc);
00963 }
00964 return c;
00965 }
00966
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976 long c;
00977 const char *q;
00978 int ret;
00979
00980 *cr = 0;
00981 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983 }
00984 else if (rb_enc_asciicompat(enc)) {
00985 c = 0;
00986 while (p < e) {
00987 if (ISASCII(*p)) {
00988 q = search_nonascii(p, e);
00989 if (!q) {
00990 if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991 return c + (e - p);
00992 }
00993 c += q - p;
00994 p = q;
00995 }
00996 ret = rb_enc_precise_mbclen(p, e, enc);
00997 if (MBCLEN_CHARFOUND_P(ret)) {
00998 *cr |= ENC_CODERANGE_VALID;
00999 p += MBCLEN_CHARFOUND_LEN(ret);
01000 }
01001 else {
01002 *cr = ENC_CODERANGE_BROKEN;
01003 p++;
01004 }
01005 c++;
01006 }
01007 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008 return c;
01009 }
01010
01011 for (c=0; p<e; c++) {
01012 ret = rb_enc_precise_mbclen(p, e, enc);
01013 if (MBCLEN_CHARFOUND_P(ret)) {
01014 *cr |= ENC_CODERANGE_VALID;
01015 p += MBCLEN_CHARFOUND_LEN(ret);
01016 }
01017 else {
01018 *cr = ENC_CODERANGE_BROKEN;
01019 if (p + rb_enc_mbminlen(enc) <= e)
01020 p += rb_enc_mbminlen(enc);
01021 else
01022 p = e;
01023 }
01024 }
01025 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026 return c;
01027 }
01028
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts, in the VALUE-sized word at s, the bytes that are not of the form
 * 10xxxxxx (the same test as is_utf8_lead_byte, done on all bytes at once).
 *
 * For each byte, (bit6 | !bit7) is moved to the byte's lowest bit and
 * everything else is masked away; the per-byte 0/1 values are then summed
 * into the low byte by repeated shift-and-add.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* per byte: bit 0 becomes 1 unless the byte is 10xxxxxx */
    bits = (bits | ~(bits >> 1)) >> 6;
    bits &= NONASCII_MASK >> 7;

    /* horizontal sum of the per-byte flags */
    bits += (bits >> 8);
    bits += (bits >> 16);
#if SIZEOF_VALUE == 8
    bits += (bits >> 32);
#endif
    return (bits & 0xF);
}
#endif
01046
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050 const char *p, *e;
01051 long n;
01052 int cr;
01053
01054 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055 if (!enc) enc = STR_ENC_GET(str);
01056 p = RSTRING_PTR(str);
01057 e = RSTRING_END(str);
01058 cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061 enc == rb_utf8_encoding()) {
01062
01063 VALUE len = 0;
01064 if ((int)sizeof(VALUE) * 2 < e - p) {
01065 const VALUE *s, *t;
01066 const VALUE lowbits = sizeof(VALUE) - 1;
01067 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068 t = (const VALUE*)(~lowbits & (VALUE)e);
01069 while (p < (const char *)s) {
01070 if (is_utf8_lead_byte(*p)) len++;
01071 p++;
01072 }
01073 while (s < t) {
01074 len += count_utf8_lead_bytes_with_word(s);
01075 s++;
01076 }
01077 p = (const char *)s;
01078 }
01079 while (p < e) {
01080 if (is_utf8_lead_byte(*p)) len++;
01081 p++;
01082 }
01083 return (long)len;
01084 }
01085 #endif
01086 n = rb_enc_strlen_cr(p, e, enc, &cr);
01087 if (cr) {
01088 ENC_CODERANGE_SET(str, cr);
01089 }
01090 return n;
01091 }
01092
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096 return str_strlen(str, STR_ENC_GET(str));
01097 }
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110 long len;
01111
01112 len = str_strlen(str, STR_ENC_GET(str));
01113 return LONG2NUM(len);
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126 return INT2NUM(RSTRING_LEN(str));
01127 }
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142 if (RSTRING_LEN(str) == 0)
01143 return Qtrue;
01144 return Qfalse;
01145 }
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160 VALUE str3;
01161 rb_encoding *enc;
01162
01163 StringValue(str2);
01164 enc = rb_enc_check(str1, str2);
01165 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168 RSTRING_PTR(str2), RSTRING_LEN(str2));
01169 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170
01171 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172 OBJ_TAINT(str3);
01173 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175 return str3;
01176 }
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191 VALUE str2;
01192 long n, len;
01193 char *ptr2;
01194
01195 len = NUM2LONG(times);
01196 if (len < 0) {
01197 rb_raise(rb_eArgError, "negative argument");
01198 }
01199 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01200 rb_raise(rb_eArgError, "argument too big");
01201 }
01202
01203 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204 ptr2 = RSTRING_PTR(str2);
01205 if (len) {
01206 n = RSTRING_LEN(str);
01207 memcpy(ptr2, RSTRING_PTR(str), n);
01208 while (n <= len/2) {
01209 memcpy(ptr2 + n, ptr2, n);
01210 n *= 2;
01211 }
01212 memcpy(ptr2 + n, ptr2, len-n);
01213 }
01214 ptr2[RSTRING_LEN(str2)] = '\0';
01215 OBJ_INFECT(str2, str);
01216 rb_enc_cr_str_copy_for_substr(str2, str);
01217
01218 return str2;
01219 }
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
01236 static VALUE
01237 rb_str_format_m(VALUE str, VALUE arg)
01238 {
01239 volatile VALUE tmp = rb_check_array_type(arg);
01240
01241 if (!NIL_P(tmp)) {
01242 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01243 }
01244 return rb_str_format(1, &arg, str);
01245 }
01246
01247 static inline void
01248 str_modifiable(VALUE str)
01249 {
01250 if (FL_TEST(str, STR_TMPLOCK)) {
01251 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01252 }
01253 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01254 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01255 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01256 }
01257
01258 static inline int
01259 str_independent(VALUE str)
01260 {
01261 str_modifiable(str);
01262 if (!STR_SHARED_P(str)) return 1;
01263 if (STR_EMBED_P(str)) return 1;
01264 return 0;
01265 }
01266
01267 static void
01268 str_make_independent(VALUE str)
01269 {
01270 char *ptr;
01271 long len = RSTRING_LEN(str);
01272
01273 ptr = ALLOC_N(char, len+1);
01274 if (RSTRING_PTR(str)) {
01275 memcpy(ptr, RSTRING_PTR(str), len);
01276 }
01277 STR_SET_NOEMBED(str);
01278 ptr[len] = 0;
01279 RSTRING(str)->as.heap.ptr = ptr;
01280 RSTRING(str)->as.heap.len = len;
01281 RSTRING(str)->as.heap.aux.capa = len;
01282 STR_UNSET_NOCAPA(str);
01283 }
01284
01285 void
01286 rb_str_modify(VALUE str)
01287 {
01288 if (!str_independent(str))
01289 str_make_independent(str);
01290 ENC_CODERANGE_CLEAR(str);
01291 }
01292
01293
01294 static void
01295 str_modify_keep_cr(VALUE str)
01296 {
01297 if (!str_independent(str))
01298 str_make_independent(str);
01299 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01300
01301 ENC_CODERANGE_CLEAR(str);
01302 }
01303
01304 static inline void
01305 str_discard(VALUE str)
01306 {
01307 str_modifiable(str);
01308 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01309 xfree(RSTRING_PTR(str));
01310 RSTRING(str)->as.heap.ptr = 0;
01311 RSTRING(str)->as.heap.len = 0;
01312 }
01313 }
01314
01315 void
01316 rb_str_associate(VALUE str, VALUE add)
01317 {
01318
01319 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01320 if (STR_ASSOC_P(str)) {
01321
01322 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01323 }
01324 else {
01325 if (STR_SHARED_P(str)) {
01326 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01327 str_make_independent(str);
01328 if (STR_ASSOC_P(assoc)) {
01329 assoc = RSTRING(assoc)->as.heap.aux.shared;
01330 rb_ary_concat(assoc, add);
01331 add = assoc;
01332 }
01333 }
01334 else if (STR_EMBED_P(str)) {
01335 str_make_independent(str);
01336 }
01337 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01338 RESIZE_CAPA(str, RSTRING_LEN(str));
01339 }
01340 FL_SET(str, STR_ASSOC);
01341 RBASIC(add)->klass = 0;
01342 RSTRING(str)->as.heap.aux.shared = add;
01343 }
01344 }
01345
01346 VALUE
01347 rb_str_associated(VALUE str)
01348 {
01349 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01350 if (STR_ASSOC_P(str)) {
01351 return RSTRING(str)->as.heap.aux.shared;
01352 }
01353 return Qfalse;
01354 }
01355
01356 VALUE
01357 rb_string_value(volatile VALUE *ptr)
01358 {
01359 VALUE s = *ptr;
01360 if (TYPE(s) != T_STRING) {
01361 s = rb_str_to_str(s);
01362 *ptr = s;
01363 }
01364 return s;
01365 }
01366
01367 char *
01368 rb_string_value_ptr(volatile VALUE *ptr)
01369 {
01370 VALUE str = rb_string_value(ptr);
01371 return RSTRING_PTR(str);
01372 }
01373
01374 char *
01375 rb_string_value_cstr(volatile VALUE *ptr)
01376 {
01377 VALUE str = rb_string_value(ptr);
01378 char *s = RSTRING_PTR(str);
01379 long len = RSTRING_LEN(str);
01380
01381 if (!s || memchr(s, 0, len)) {
01382 rb_raise(rb_eArgError, "string contains null byte");
01383 }
01384 if (s[len]) rb_str_modify(str);
01385 return s;
01386 }
01387
01388 VALUE
01389 rb_check_string_type(VALUE str)
01390 {
01391 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01392 return str;
01393 }
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406 static VALUE
01407 rb_str_s_try_convert(VALUE dummy, VALUE str)
01408 {
01409 return rb_check_string_type(str);
01410 }
01411
/*
 * Returns a pointer to the position reached by advancing +nth+ characters
 * of encoding +enc+ from +p+; +e+ marks the end of the buffer.  Results
 * that would lie past +e+ are returned as +e+.
 *
 * Four strategies are used, chosen from the encoding's properties:
 *   - max char length 1: plain byte offset;
 *   - fixed-width encoding (min == max length): nth * width;
 *   - ASCII-compatible variable width: skip ASCII runs in bulk, step over
 *     other characters one at a time with rb_enc_mbclen();
 *   - anything else: step one character at a time.
 */
01412 char*
01413 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01414 {
01415 if (rb_enc_mbmaxlen(enc) == 1) {
01416 p += nth;
01417 }
01418 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01419 p += nth * rb_enc_mbmaxlen(enc);
01420 }
01421 else if (rb_enc_asciicompat(enc)) {
01422 const char *p2, *e2;
01423 int n;
01424
01425 while (p < e && 0 < nth) {
/* e2: where the target would be if every remaining char were one byte */
01426 e2 = p + nth;
01427 if (e < e2)
01428 return (char *)e;
01429 if (ISASCII(*p)) {
/* search_nonascii is defined elsewhere; a null result is treated here
 * as "the whole range [p, e2) can be skipped" -- presumably no
 * non-ASCII byte was found. */
01430 p2 = search_nonascii(p, e2);
01431 if (!p2)
01432 return (char *)e2;
01433 nth -= p2 - p;
01434 p = p2;
01435 }
01436 n = rb_enc_mbclen(p, e, enc);
01437 p += n;
01438 nth--;
01439 }
/* buffer exhausted before nth characters were consumed */
01440 if (nth != 0)
01441 return (char *)e;
01442 return (char *)p;
01443 }
01444 else {
01445 while (p<e && nth--) {
01446 p += rb_enc_mbclen(p, e, enc);
01447 }
01448 }
/* clamp results of the first, second and last strategies to the end */
01449 if (p > e) p = e;
01450 return (char*)p;
01451 }
01452
01453 static char*
01454 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01455 {
01456 if (singlebyte)
01457 p += nth;
01458 else {
01459 p = rb_enc_nth(p, e, nth, enc);
01460 }
01461 if (!p) return 0;
01462 if (p > e) p = e;
01463 return (char *)p;
01464 }
01465
01466
01467 static long
01468 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01469 {
01470 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01471 if (!pp) return e - p;
01472 return pp - p;
01473 }
01474
01475 long
01476 rb_str_offset(VALUE str, long pos)
01477 {
01478 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01479 STR_ENC_GET(str), single_byte_optimizable(str));
01480 }
01481
01482 #ifdef NONASCII_MASK
/*
 * Returns a pointer into [p, e) obtained by skipping +nth+ characters,
 * where a character start is any byte for which is_utf8_lead_byte() is
 * true.  Returns +e+ if the buffer ends first.
 *
 * For long inputs (more than SIZEOF_VALUE bytes, and nth greater than
 * 2 * SIZEOF_VALUE) a word-at-a-time fast path runs first: bytes are
 * counted singly up to the next VALUE-aligned address, then whole VALUE
 * words are consumed with count_utf8_lead_bytes_with_word() (defined
 * elsewhere).  The final byte loop finishes the exact positioning.
 */
01483 static char *
01484 str_utf8_nth(const char *p, const char *e, long nth)
01485 {
01486 if ((int)SIZEOF_VALUE < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01487 const VALUE *s, *t;
01488 const VALUE lowbits = sizeof(VALUE) - 1;
/* s: p rounded up to a VALUE boundary; t: e rounded down to one */
01489 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01490 t = (const VALUE*)(~lowbits & (VALUE)e);
01491 while (p < (const char *)s) {
01492 if (is_utf8_lead_byte(*p)) nth--;
01493 p++;
01494 }
/* keep consuming words while aligned words remain and at least
 * sizeof(VALUE) characters are still to be skipped */
01495 do {
01496 nth -= count_utf8_lead_bytes_with_word(s);
01497 s++;
01498 } while (s < t && (int)sizeof(VALUE) <= nth);
01499 p = (char *)s;
01500 }
01501 while (p < e) {
01502 if (is_utf8_lead_byte(*p)) {
/* stop on the lead byte of the requested character */
01503 if (nth == 0) break;
01504 nth--;
01505 }
01506 p++;
01507 }
01508 return (char *)p;
01509 }
01510
01511 static long
01512 str_utf8_offset(const char *p, const char *e, long nth)
01513 {
01514 const char *pp = str_utf8_nth(p, e, nth);
01515 return pp - p;
01516 }
01517 #endif
01518
01519
01520 long
01521 rb_str_sublen(VALUE str, long pos)
01522 {
01523 if (single_byte_optimizable(str) || pos < 0)
01524 return pos;
01525 else {
01526 char *p = RSTRING_PTR(str);
01527 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01528 }
01529 }
01530
01531 VALUE
01532 rb_str_subseq(VALUE str, long beg, long len)
01533 {
01534 VALUE str2;
01535
01536 if (RSTRING_LEN(str) == beg + len &&
01537 RSTRING_EMBED_LEN_MAX < len) {
01538 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01539 rb_str_drop_bytes(str2, beg);
01540 }
01541 else {
01542 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01543 }
01544
01545 rb_enc_cr_str_copy_for_substr(str2, str);
01546 OBJ_INFECT(str2, str);
01547
01548 return str2;
01549 }
01550
/*
 * Returns the substring of +str+ starting at character index +beg+ and
 * spanning +len+ characters, or Qnil when the request is out of range.
 *
 * +beg+ may be negative to count from the end.  A negative +len+ yields
 * Qnil.  From the label "sub" onwards, +p+ is the start of the result
 * (or 0 for an empty result) and +len+ is a BYTE count.
 */
01551 VALUE
01552 rb_str_substr(VALUE str, long beg, long len)
01553 {
01554 rb_encoding *enc = STR_ENC_GET(str);
01555 VALUE str2;
01556 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01557
01558 if (len < 0) return Qnil;
01559 if (!RSTRING_LEN(str)) {
01560 len = 0;
01561 }
/* Fast path: characters and bytes coincide, so work in byte offsets. */
01562 if (single_byte_optimizable(str)) {
01563 if (beg > RSTRING_LEN(str)) return Qnil;
01564 if (beg < 0) {
01565 beg += RSTRING_LEN(str);
01566 if (beg < 0) return Qnil;
01567 }
01568 if (beg + len > RSTRING_LEN(str))
01569 len = RSTRING_LEN(str) - beg;
01570 if (len <= 0) {
01571 len = 0;
01572 p = 0;
01573 }
01574 else
01575 p = s + beg;
01576 goto sub;
01577 }
01578 if (beg < 0) {
/* cannot take more characters than lie between beg and the end */
01579 if (len > -beg) len = -beg;
/* When the start is close to the end relative to the string size,
 * walk backwards from the end instead of counting every character. */
01580 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01581 beg = -beg;
/* first step e back so that it marks the end of the result ... */
01582 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01583 p = e;
01584 if (!p) return Qnil;
/* ... then step p back len characters to find its start */
01585 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01586 if (!p) return Qnil;
01587 len = e - p;
01588 goto sub;
01589 }
01590 else {
01591 beg += str_strlen(str, enc);
01592 if (beg < 0) return Qnil;
01593 }
01594 }
01595 else if (beg > 0 && beg > str_strlen(str, enc)) {
01596 return Qnil;
01597 }
01598 if (len == 0) {
01599 p = 0;
01600 }
01601 #ifdef NONASCII_MASK
/* valid UTF-8: use the lead-byte counting helpers */
01602 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01603 enc == rb_utf8_encoding()) {
01604 p = str_utf8_nth(s, e, beg);
01605 len = str_utf8_offset(p, e, len);
01606 }
01607 #endif
/* fixed-width encoding: positions are simple multiples of the width */
01608 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01609 int char_sz = rb_enc_mbmaxlen(enc);
01610
01611 p = s + beg * char_sz;
01612 if (p > e) {
01613 p = e;
01614 len = 0;
01615 }
01616 else if (len * char_sz > e - p)
01617 len = e - p;
01618 else
01619 len *= char_sz;
01620 }
/* general variable-width encoding */
01621 else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
01622 len = 0;
01623 }
01624 else {
01625 len = str_offset(p, e, len, enc, 0);
01626 }
01627 sub:
/* A result longer than the embed limit whose range satisfies
 * beg + len == byte length is built from rb_str_new4()/str_new3() and
 * then narrowed to the last len bytes by adjusting the heap pointer and
 * length; presumably this shares the buffer rather than copying it
 * (helpers defined elsewhere). */
01628 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01629 str2 = rb_str_new4(str);
01630 str2 = str_new3(rb_obj_class(str2), str2);
01631 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01632 RSTRING(str2)->as.heap.len = len;
01633 }
01634 else {
01635 str2 = rb_str_new5(str, p, len);
01636 rb_enc_cr_str_copy_for_substr(str2, str);
01637 OBJ_INFECT(str2, str);
01638 }
01639
01640 return str2;
01641 }
01642
01643 VALUE
01644 rb_str_freeze(VALUE str)
01645 {
01646 if (STR_ASSOC_P(str)) {
01647 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01648 OBJ_FREEZE(ary);
01649 }
01650 return rb_obj_freeze(str);
01651 }
01652
/*
 * rb_str_dup_frozen is provided as an alias of rb_str_new_frozen: once
 * through RUBY_ALIAS_FUNCTION (a macro defined elsewhere), and then, for
 * the remainder of this file, as a plain macro that expands to
 * rb_str_new_frozen.
 */
01653 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01654 #define rb_str_dup_frozen rb_str_new_frozen
01655
01656 VALUE
01657 rb_str_locktmp(VALUE str)
01658 {
01659 if (FL_TEST(str, STR_TMPLOCK)) {
01660 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01661 }
01662 FL_SET(str, STR_TMPLOCK);
01663 return str;
01664 }
01665
01666 VALUE
01667 rb_str_unlocktmp(VALUE str)
01668 {
01669 if (!FL_TEST(str, STR_TMPLOCK)) {
01670 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01671 }
01672 FL_UNSET(str, STR_TMPLOCK);
01673 return str;
01674 }
01675
01676 void
01677 rb_str_set_len(VALUE str, long len)
01678 {
01679 rb_str_modify(str);
01680 STR_SET_LEN(str, len);
01681 RSTRING_PTR(str)[len] = '\0';
01682 }
01683
01684 VALUE
01685 rb_str_resize(VALUE str, long len)
01686 {
01687 long slen;
01688
01689 if (len < 0) {
01690 rb_raise(rb_eArgError, "negative string size (or size too big)");
01691 }
01692
01693 rb_str_modify(str);
01694 slen = RSTRING_LEN(str);
01695 if (len != slen) {
01696 if (STR_EMBED_P(str)) {
01697 char *ptr;
01698 if (len <= RSTRING_EMBED_LEN_MAX) {
01699 STR_SET_EMBED_LEN(str, len);
01700 RSTRING(str)->as.ary[len] = '\0';
01701 return str;
01702 }
01703 ptr = ALLOC_N(char,len+1);
01704 MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01705 RSTRING(str)->as.heap.ptr = ptr;
01706 STR_SET_NOEMBED(str);
01707 }
01708 else if (len <= RSTRING_EMBED_LEN_MAX) {
01709 char *ptr = RSTRING(str)->as.heap.ptr;
01710 STR_SET_EMBED(str);
01711 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
01712 RSTRING(str)->as.ary[len] = '\0';
01713 STR_SET_EMBED_LEN(str, len);
01714 xfree(ptr);
01715 return str;
01716 }
01717 else if (slen < len || slen - len > 1024) {
01718 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01719 }
01720 if (!STR_NOCAPA_P(str)) {
01721 RSTRING(str)->as.heap.aux.capa = len;
01722 }
01723 RSTRING(str)->as.heap.len = len;
01724 RSTRING(str)->as.heap.ptr[len] = '\0';
01725 }
01726 return str;
01727 }
01728
01729 static VALUE
01730 str_buf_cat(VALUE str, const char *ptr, long len)
01731 {
01732 long capa, total, off = -1;
01733
01734 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01735 off = ptr - RSTRING_PTR(str);
01736 }
01737 rb_str_modify(str);
01738 if (len == 0) return 0;
01739 if (STR_ASSOC_P(str)) {
01740 FL_UNSET(str, STR_ASSOC);
01741 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01742 }
01743 else if (STR_EMBED_P(str)) {
01744 capa = RSTRING_EMBED_LEN_MAX;
01745 }
01746 else {
01747 capa = RSTRING(str)->as.heap.aux.capa;
01748 }
01749 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01750 rb_raise(rb_eArgError, "string sizes too big");
01751 }
01752 total = RSTRING_LEN(str)+len;
01753 if (capa <= total) {
01754 while (total > capa) {
01755 if (capa + 1 >= LONG_MAX / 2) {
01756 capa = (total + 4095) / 4096;
01757 break;
01758 }
01759 capa = (capa + 1) * 2;
01760 }
01761 RESIZE_CAPA(str, capa);
01762 }
01763 if (off != -1) {
01764 ptr = RSTRING_PTR(str) + off;
01765 }
01766 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01767 STR_SET_LEN(str, total);
01768 RSTRING_PTR(str)[total] = '\0';
01769
01770 return str;
01771 }
01772
/* Appends the NUL-terminated C string +ptr+ to +str+ via str_buf_cat().
 * Note that +ptr+ is expanded twice. */
#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01774
01775 VALUE
01776 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01777 {
01778 if (len == 0) return str;
01779 if (len < 0) {
01780 rb_raise(rb_eArgError, "negative string size (or size too big)");
01781 }
01782 return str_buf_cat(str, ptr, len);
01783 }
01784
01785 VALUE
01786 rb_str_buf_cat2(VALUE str, const char *ptr)
01787 {
01788 return rb_str_buf_cat(str, ptr, strlen(ptr));
01789 }
01790
01791 VALUE
01792 rb_str_cat(VALUE str, const char *ptr, long len)
01793 {
01794 if (len < 0) {
01795 rb_raise(rb_eArgError, "negative string size (or size too big)");
01796 }
01797 if (STR_ASSOC_P(str)) {
01798 rb_str_modify(str);
01799 if (STR_EMBED_P(str)) str_make_independent(str);
01800 REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01801 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01802 RSTRING(str)->as.heap.len += len;
01803 RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0';
01804 return str;
01805 }
01806
01807 return rb_str_buf_cat(str, ptr, len);
01808 }
01809
01810 VALUE
01811 rb_str_cat2(VALUE str, const char *ptr)
01812 {
01813 return rb_str_cat(str, ptr, strlen(ptr));
01814 }
01815
/*
 * Appends +len+ bytes at +ptr+ to +str+ while working out the encoding
 * index and code range of the result, and returns +str+.
 *
 *   ptr_encindex  encoding index of the appended bytes
 *   ptr_cr        their code range, or ENC_CODERANGE_UNKNOWN
 *   ptr_cr_ret    if non-null, receives the code range this function
 *                 ended up using for the appended bytes (it may have been
 *                 scanned here, or reset to UNKNOWN)
 *
 * Raises EncCompatError when the two encodings differ and cannot be
 * combined, and ArgumentError when +len+ is negative (checked only just
 * before the bytes are appended).
 */
01816 static VALUE
01817 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01818 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01819 {
01820 int str_encindex = ENCODING_GET(str);
01821 int res_encindex;
01822 int str_cr, res_cr;
01823 int str_a8 = ENCODING_IS_ASCII8BIT(str);
/* encoding index 0 is what this code treats as the ASCII-8BIT case for
 * the appended bytes (cf. str_a8 above) */
01824 int ptr_a8 = ptr_encindex == 0;
01825
01826 str_cr = ENC_CODERANGE(str);
01827
01828 if (str_encindex == ptr_encindex) {
/* same encoding: scan the appended bytes only when the result's code
 * range can actually be determined from them */
01829 if (str_cr == ENC_CODERANGE_UNKNOWN ||
01830 (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01831 ptr_cr = ENC_CODERANGE_UNKNOWN;
01832 }
01833 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01834 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01835 }
01836 }
01837 else {
01838 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01839 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
/* a non-ASCII-compatible encoding on either side can only be combined
 * when one side is empty */
01840 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01841 if (len == 0)
01842 return str;
01843 if (RSTRING_LEN(str) == 0) {
01844 rb_str_buf_cat(str, ptr, len);
01845 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01846 return str;
01847 }
01848 goto incompatible;
01849 }
01850 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01851 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01852 }
01853 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01854 if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
01855 str_cr = rb_enc_str_coderange(str);
01856 }
01857 }
01858 }
01859 if (ptr_cr_ret)
01860 *ptr_cr_ret = ptr_cr;
01861
/* different encodings are acceptable only if at least one side is
 * 7-bit; the label is also the target of the goto above */
01862 if (str_encindex != ptr_encindex &&
01863 str_cr != ENC_CODERANGE_7BIT &&
01864 ptr_cr != ENC_CODERANGE_7BIT) {
01865 incompatible:
01866 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01867 rb_enc_name(rb_enc_from_index(str_encindex)),
01868 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01869 }
01870
/* choose the encoding index and code range of the result */
01871 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01872 res_encindex = str_encindex;
01873 res_cr = ENC_CODERANGE_UNKNOWN;
01874 }
01875 else if (str_cr == ENC_CODERANGE_7BIT) {
01876 if (ptr_cr == ENC_CODERANGE_7BIT) {
01877 res_encindex = !str_a8 ? str_encindex : ptr_encindex;
01878 res_cr = ENC_CODERANGE_7BIT;
01879 }
01880 else {
01881 res_encindex = ptr_encindex;
01882 res_cr = ptr_cr;
01883 }
01884 }
01885 else if (str_cr == ENC_CODERANGE_VALID) {
01886 res_encindex = str_encindex;
01887 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01888 res_cr = str_cr;
01889 else
01890 res_cr = ptr_cr;
01891 }
01892 else {
/* remaining str_cr value: appending any bytes makes the result's code
 * range unknown again */
01893 res_encindex = str_encindex;
01894 res_cr = str_cr;
01895 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01896 }
01897
01898 if (len < 0) {
01899 rb_raise(rb_eArgError, "negative string size (or size too big)");
01900 }
01901 str_buf_cat(str, ptr, len);
01902 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01903 return str;
01904 }
01905
01906 VALUE
01907 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01908 {
01909 return rb_enc_cr_str_buf_cat(str, ptr, len,
01910 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01911 }
01912
01913 VALUE
01914 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01915 {
01916
01917 int encindex = ENCODING_GET(str);
01918 rb_encoding *enc = rb_enc_from_index(encindex);
01919 if (rb_enc_asciicompat(enc)) {
01920 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01921 encindex, ENC_CODERANGE_7BIT, 0);
01922 }
01923 else {
01924 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01925 while (*ptr) {
01926 unsigned int c = (unsigned char)*ptr;
01927 int len = rb_enc_codelen(c, enc);
01928 rb_enc_mbcput(c, buf, enc);
01929 rb_enc_cr_str_buf_cat(str, buf, len,
01930 encindex, ENC_CODERANGE_VALID, 0);
01931 ptr++;
01932 }
01933 return str;
01934 }
01935 }
01936
01937 VALUE
01938 rb_str_buf_append(VALUE str, VALUE str2)
01939 {
01940 int str2_cr;
01941
01942 str2_cr = ENC_CODERANGE(str2);
01943
01944 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01945 ENCODING_GET(str2), str2_cr, &str2_cr);
01946
01947 OBJ_INFECT(str, str2);
01948 ENC_CODERANGE_SET(str2, str2_cr);
01949
01950 return str;
01951 }
01952
01953 VALUE
01954 rb_str_append(VALUE str, VALUE str2)
01955 {
01956 rb_encoding *enc;
01957 int cr, cr2;
01958
01959 StringValue(str2);
01960 if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01961 long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01962 enc = rb_enc_check(str, str2);
01963 cr = ENC_CODERANGE(str);
01964 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01965 rb_str_modify(str);
01966 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01967 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01968 RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01969 RSTRING(str)->as.heap.len = len;
01970 rb_enc_associate(str, enc);
01971 ENC_CODERANGE_SET(str, cr);
01972 OBJ_INFECT(str, str2);
01973 return str;
01974 }
01975 return rb_str_buf_append(str, str2);
01976 }
01977
01978
01979
01980
01981
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
01995 VALUE
01996 rb_str_concat(VALUE str1, VALUE str2)
01997 {
01998 unsigned int lc;
01999
02000 if (FIXNUM_P(str2)) {
02001 if ((int)str2 < 0)
02002 rb_raise(rb_eRangeError, "negative argument");
02003 lc = FIX2UINT(str2);
02004 }
02005 else if (TYPE(str2) == T_BIGNUM) {
02006 if (!RBIGNUM_SIGN(str2))
02007 rb_raise(rb_eRangeError, "negative argument");
02008 lc = NUM2UINT(str2);
02009 }
02010 else {
02011 return rb_str_append(str1, str2);
02012 }
02013 #if SIZEOF_INT < SIZEOF_VALUE
02014 if ((VALUE)lc > UINT_MAX) {
02015 rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02016 }
02017 #endif
02018 {
02019 rb_encoding *enc = STR_ENC_GET(str1);
02020 long pos = RSTRING_LEN(str1);
02021 int cr = ENC_CODERANGE(str1);
02022 int len;
02023
02024 if ((len = rb_enc_codelen(lc, enc)) <= 0) {
02025 rb_raise(rb_eRangeError, "%u invalid char", lc);
02026 }
02027 rb_str_resize(str1, pos+len);
02028 rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
02029 if (cr == ENC_CODERANGE_7BIT && lc > 127)
02030 cr = ENC_CODERANGE_VALID;
02031 ENC_CODERANGE_SET(str1, cr);
02032 return str1;
02033 }
02034 }
02035
02036 st_index_t
02037 rb_memhash(const void *ptr, long len)
02038 {
02039 return st_hash(ptr, len, rb_hash_start(0));
02040 }
02041
02042 st_index_t
02043 rb_str_hash(VALUE str)
02044 {
02045 int e = ENCODING_GET(str);
02046 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02047 e = 0;
02048 }
02049 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02050 }
02051
02052 int
02053 rb_str_hash_cmp(VALUE str1, VALUE str2)
02054 {
02055 long len;
02056
02057 if (!rb_str_comparable(str1, str2)) return 1;
02058 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02059 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02060 return 0;
02061 }
02062 return 1;
02063 }
02064
02065
02066
02067
02068
02069
02070
02071
02072 static VALUE
02073 rb_str_hash_m(VALUE str)
02074 {
02075 st_index_t hval = rb_str_hash(str);
02076 return INT2FIX(hval);
02077 }
02078
/* Yields the smaller of a and b (a when they are equal).  Arguments may be
 * evaluated more than once. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
02080
02081 int
02082 rb_str_comparable(VALUE str1, VALUE str2)
02083 {
02084 int idx1, idx2;
02085 int rc1, rc2;
02086
02087 if (RSTRING_LEN(str1) == 0) return TRUE;
02088 if (RSTRING_LEN(str2) == 0) return TRUE;
02089 idx1 = ENCODING_GET(str1);
02090 idx2 = ENCODING_GET(str2);
02091 if (idx1 == idx2) return TRUE;
02092 rc1 = rb_enc_str_coderange(str1);
02093 rc2 = rb_enc_str_coderange(str2);
02094 if (rc1 == ENC_CODERANGE_7BIT) {
02095 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02096 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02097 return TRUE;
02098 }
02099 if (rc2 == ENC_CODERANGE_7BIT) {
02100 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02101 return TRUE;
02102 }
02103 return FALSE;
02104 }
02105
02106 int
02107 rb_str_cmp(VALUE str1, VALUE str2)
02108 {
02109 long len;
02110 int retval;
02111
02112 len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02113 retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02114 if (retval == 0) {
02115 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02116 if (!rb_str_comparable(str1, str2)) {
02117 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02118 return 1;
02119 return -1;
02120 }
02121 return 0;
02122 }
02123 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02124 return -1;
02125 }
02126 if (retval > 0) return 1;
02127 return -1;
02128 }
02129
02130
02131 static VALUE
02132 str_eql(const VALUE str1, const VALUE str2)
02133 {
02134 const long len = RSTRING_LEN(str1);
02135
02136 if (len != RSTRING_LEN(str2)) return Qfalse;
02137 if (!rb_str_comparable(str1, str2)) return Qfalse;
02138 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02139 return Qtrue;
02140 return Qfalse;
02141 }
02142
02143
02144
02145
02146
02147
02148
02149
02150
02151 VALUE
02152 rb_str_equal(VALUE str1, VALUE str2)
02153 {
02154 if (str1 == str2) return Qtrue;
02155 if (TYPE(str2) != T_STRING) {
02156 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02157 return Qfalse;
02158 }
02159 return rb_equal(str2, str1);
02160 }
02161 return str_eql(str1, str2);
02162 }
02163
02164
02165
02166
02167
02168
02169
02170
02171 static VALUE
02172 rb_str_eql(VALUE str1, VALUE str2)
02173 {
02174 if (TYPE(str2) != T_STRING) return Qfalse;
02175 return str_eql(str1, str2);
02176 }
02177
02178
02179
02180
02181
02182
02183
02184
02185
02186
02187
02188
02189
02190
02191
02192
02193
02194
02195
02196
02197
02198
02199
02200
02201 static VALUE
02202 rb_str_cmp_m(VALUE str1, VALUE str2)
02203 {
02204 long result;
02205
02206 if (TYPE(str2) != T_STRING) {
02207 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02208 return Qnil;
02209 }
02210 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02211 return Qnil;
02212 }
02213 else {
02214 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02215
02216 if (NIL_P(tmp)) return Qnil;
02217 if (!FIXNUM_P(tmp)) {
02218 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02219 }
02220 result = -FIX2LONG(tmp);
02221 }
02222 }
02223 else {
02224 result = rb_str_cmp(str1, str2);
02225 }
02226 return LONG2NUM(result);
02227 }
02228
02229
02230
02231
02232
02233
02234
02235
02236
02237
02238
02239
02240
02241 static VALUE
02242 rb_str_casecmp(VALUE str1, VALUE str2)
02243 {
02244 long len;
02245 rb_encoding *enc;
02246 char *p1, *p1end, *p2, *p2end;
02247
02248 StringValue(str2);
02249 enc = rb_enc_compatible(str1, str2);
02250 if (!enc) {
02251 return Qnil;
02252 }
02253
02254 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02255 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02256 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02257 while (p1 < p1end && p2 < p2end) {
02258 if (*p1 != *p2) {
02259 unsigned int c1 = TOUPPER(*p1 & 0xff);
02260 unsigned int c2 = TOUPPER(*p2 & 0xff);
02261 if (c1 != c2)
02262 return INT2FIX(c1 < c2 ? -1 : 1);
02263 }
02264 p1++;
02265 p2++;
02266 }
02267 }
02268 else {
02269 while (p1 < p1end && p2 < p2end) {
02270 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02271 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02272
02273 if (0 <= c1 && 0 <= c2) {
02274 c1 = TOUPPER(c1);
02275 c2 = TOUPPER(c2);
02276 if (c1 != c2)
02277 return INT2FIX(c1 < c2 ? -1 : 1);
02278 }
02279 else {
02280 int r;
02281 l1 = rb_enc_mbclen(p1, p1end, enc);
02282 l2 = rb_enc_mbclen(p2, p2end, enc);
02283 len = l1 < l2 ? l1 : l2;
02284 r = memcmp(p1, p2, len);
02285 if (r != 0)
02286 return INT2FIX(r < 0 ? -1 : 1);
02287 if (l1 != l2)
02288 return INT2FIX(l1 < l2 ? -1 : 1);
02289 }
02290 p1 += l1;
02291 p2 += l2;
02292 }
02293 }
02294 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02295 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02296 return INT2FIX(-1);
02297 }
02298
02299 static long
02300 rb_str_index(VALUE str, VALUE sub, long offset)
02301 {
02302 long pos;
02303 char *s, *sptr, *e;
02304 long len, slen;
02305 rb_encoding *enc;
02306
02307 enc = rb_enc_check(str, sub);
02308 if (is_broken_string(sub)) {
02309 return -1;
02310 }
02311 len = str_strlen(str, enc);
02312 slen = str_strlen(sub, enc);
02313 if (offset < 0) {
02314 offset += len;
02315 if (offset < 0) return -1;
02316 }
02317 if (len - offset < slen) return -1;
02318 s = RSTRING_PTR(str);
02319 e = s + RSTRING_LEN(str);
02320 if (offset) {
02321 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02322 s += offset;
02323 }
02324 if (slen == 0) return offset;
02325
02326 sptr = RSTRING_PTR(sub);
02327 slen = RSTRING_LEN(sub);
02328 len = RSTRING_LEN(str) - offset;
02329 for (;;) {
02330 char *t;
02331 pos = rb_memsearch(sptr, slen, s, len, enc);
02332 if (pos < 0) return pos;
02333 t = rb_enc_right_char_head(s, s+pos, e, enc);
02334 if (t == s + pos) break;
02335 if ((len -= t - s) <= 0) return -1;
02336 offset += t - s;
02337 s = t;
02338 }
02339 return pos + offset;
02340 }
02341
02342
02343
02344
02345
02346
02347
02348
02349
02350
02351
02352
02353
02354
02355
02356
02357
02358
02359
02360 static VALUE
02361 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02362 {
02363 VALUE sub;
02364 VALUE initpos;
02365 long pos;
02366
02367 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02368 pos = NUM2LONG(initpos);
02369 }
02370 else {
02371 pos = 0;
02372 }
02373 if (pos < 0) {
02374 pos += str_strlen(str, STR_ENC_GET(str));
02375 if (pos < 0) {
02376 if (TYPE(sub) == T_REGEXP) {
02377 rb_backref_set(Qnil);
02378 }
02379 return Qnil;
02380 }
02381 }
02382
02383 switch (TYPE(sub)) {
02384 case T_REGEXP:
02385 if (pos > str_strlen(str, STR_ENC_GET(str)))
02386 return Qnil;
02387 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02388 rb_enc_check(str, sub), single_byte_optimizable(str));
02389
02390 pos = rb_reg_search(sub, str, pos, 0);
02391 pos = rb_str_sublen(str, pos);
02392 break;
02393
02394 default: {
02395 VALUE tmp;
02396
02397 tmp = rb_check_string_type(sub);
02398 if (NIL_P(tmp)) {
02399 rb_raise(rb_eTypeError, "type mismatch: %s given",
02400 rb_obj_classname(sub));
02401 }
02402 sub = tmp;
02403 }
02404
02405 case T_STRING:
02406 pos = rb_str_index(str, sub, pos);
02407 pos = rb_str_sublen(str, pos);
02408 break;
02409 }
02410
02411 if (pos == -1) return Qnil;
02412 return LONG2NUM(pos);
02413 }
02414
02415 static long
02416 rb_str_rindex(VALUE str, VALUE sub, long pos)
02417 {
02418 long len, slen;
02419 char *s, *sbeg, *e, *t;
02420 rb_encoding *enc;
02421 int singlebyte = single_byte_optimizable(str);
02422
02423 enc = rb_enc_check(str, sub);
02424 if (is_broken_string(sub)) {
02425 return -1;
02426 }
02427 len = str_strlen(str, enc);
02428 slen = str_strlen(sub, enc);
02429
02430 if (len < slen) return -1;
02431 if (len - pos < slen) {
02432 pos = len - slen;
02433 }
02434 if (len == 0) {
02435 return pos;
02436 }
02437 sbeg = RSTRING_PTR(str);
02438 e = RSTRING_END(str);
02439 t = RSTRING_PTR(sub);
02440 slen = RSTRING_LEN(sub);
02441 for (;;) {
02442 s = str_nth(sbeg, e, pos, enc, singlebyte);
02443 if (!s) return -1;
02444 if (memcmp(s, t, slen) == 0) {
02445 return pos;
02446 }
02447 if (pos == 0) break;
02448 pos--;
02449 }
02450 return -1;
02451 }
02452
02453
02454
02455
02456
02457
02458
02459
02460
02461
02462
02463
02464
02465
02466
02467
02468
02469
02470
02471
02472 static VALUE
02473 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02474 {
02475 VALUE sub;
02476 VALUE vpos;
02477 rb_encoding *enc = STR_ENC_GET(str);
02478 long pos, len = str_strlen(str, enc);
02479
02480 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02481 pos = NUM2LONG(vpos);
02482 if (pos < 0) {
02483 pos += len;
02484 if (pos < 0) {
02485 if (TYPE(sub) == T_REGEXP) {
02486 rb_backref_set(Qnil);
02487 }
02488 return Qnil;
02489 }
02490 }
02491 if (pos > len) pos = len;
02492 }
02493 else {
02494 pos = len;
02495 }
02496
02497 switch (TYPE(sub)) {
02498 case T_REGEXP:
02499
02500 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02501 STR_ENC_GET(str), single_byte_optimizable(str));
02502
02503 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02504 pos = rb_reg_search(sub, str, pos, 1);
02505 pos = rb_str_sublen(str, pos);
02506 }
02507 if (pos >= 0) return LONG2NUM(pos);
02508 break;
02509
02510 default: {
02511 VALUE tmp;
02512
02513 tmp = rb_check_string_type(sub);
02514 if (NIL_P(tmp)) {
02515 rb_raise(rb_eTypeError, "type mismatch: %s given",
02516 rb_obj_classname(sub));
02517 }
02518 sub = tmp;
02519 }
02520
02521 case T_STRING:
02522 pos = rb_str_rindex(str, sub, pos);
02523 if (pos >= 0) return LONG2NUM(pos);
02524 break;
02525 }
02526 return Qnil;
02527 }
02528
02529
02530
02531
02532
02533
02534
02535
02536
02537
02538
02539
02540
02541
02542
02543 static VALUE
02544 rb_str_match(VALUE x, VALUE y)
02545 {
02546 switch (TYPE(y)) {
02547 case T_STRING:
02548 rb_raise(rb_eTypeError, "type mismatch: String given");
02549
02550 case T_REGEXP:
02551 return rb_reg_match(y, x);
02552
02553 default:
02554 return rb_funcall(y, rb_intern("=~"), 1, x);
02555 }
02556 }
02557
02558
/* Forward declaration; the definition is not in this part of the file.
 * rb_str_match_m() calls it as get_pat(re, 0) and sends "match" to the
 * value it returns. */
02559 static VALUE get_pat(VALUE, int);
02560
02561
02562
02563
02564
02565
02566
02567
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593 static VALUE
02594 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02595 {
02596 VALUE re, result;
02597 if (argc < 1)
02598 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02599 re = argv[0];
02600 argv[0] = str;
02601 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02602 if (!NIL_P(result) && rb_block_given_p()) {
02603 return rb_yield(result);
02604 }
02605 return result;
02606 }
02607
/* Result codes of the enc_succ_char / enc_pred_char family. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no suitable neighbouring character */
    NEIGHBOR_FOUND    = 1,  /* neighbour found; buffer updated in place */
    NEIGHBOR_WRAPPED  = 2   /* value wrapped around; caller must carry */
};
02613
/*
 * Advances, in place, the +len+-byte sequence at +p+ to the next byte
 * sequence that rb_enc_precise_mbclen() accepts as one complete character
 * of exactly +len+ bytes in +enc+.
 *
 * The bytes are treated as a big-endian counter.  Returns NEIGHBOR_FOUND
 * on success, or NEIGHBOR_WRAPPED when the counter overflows (every byte
 * was 0xff; all bytes have then been reset to 0).
 */
02614 static enum neighbor_char
02615 enc_succ_char(char *p, long len, rb_encoding *enc)
02616 {
02617 long i;
02618 int l;
02619 while (1) {
/* increment with carry: trailing 0xff bytes roll over to 0 */
02620 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02621 p[i] = '\0';
02622 if (i < 0)
02623 return NEIGHBOR_WRAPPED;
02624 ++((unsigned char*)p)[i];
02625 l = rb_enc_precise_mbclen(p, p+len, enc);
02626 if (MBCLEN_CHARFOUND_P(l)) {
02627 l = MBCLEN_CHARFOUND_LEN(l);
02628 if (l == len) {
02629 return NEIGHBOR_FOUND;
02630 }
02631 else {
/* a shorter character was recognised: saturate the unused tail so the
 * next increment carries into the bytes that matter */
02632 memset(p+l, 0xff, len-l);
02633 }
02634 }
02635 if (MBCLEN_INVALID_P(l) && i < len-1) {
02636 long len2;
02637 int l2;
/* find the longest prefix that is not reported invalid, then saturate
 * everything after the byte following it to skip the invalid range */
02638 for (len2 = len-1; 0 < len2; len2--) {
02639 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02640 if (!MBCLEN_INVALID_P(l2))
02641 break;
02642 }
02643 memset(p+len2+1, 0xff, len-(len2+1));
02644 }
02645 }
02646 }
02647
/*
 * Steps, in place, the +len+-byte sequence at +p+ back to the previous
 * byte sequence that rb_enc_precise_mbclen() accepts as one complete
 * character of exactly +len+ bytes in +enc+.  Mirror image of
 * enc_succ_char.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when the counter
 * underflows (every byte was 0; all bytes have then been set to 0xff).
 */
02648 static enum neighbor_char
02649 enc_pred_char(char *p, long len, rb_encoding *enc)
02650 {
02651 long i;
02652 int l;
02653 while (1) {
/* decrement with borrow: trailing zero bytes roll over to 0xff */
02654 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02655 p[i] = '\xff';
02656 if (i < 0)
02657 return NEIGHBOR_WRAPPED;
02658 --((unsigned char*)p)[i];
02659 l = rb_enc_precise_mbclen(p, p+len, enc);
02660 if (MBCLEN_CHARFOUND_P(l)) {
02661 l = MBCLEN_CHARFOUND_LEN(l);
02662 if (l == len) {
02663 return NEIGHBOR_FOUND;
02664 }
02665 else {
/* a shorter character was recognised: zero the unused tail so the next
 * decrement borrows from the bytes that matter */
02666 memset(p+l, 0, len-l);
02667 }
02668 }
02669 if (MBCLEN_INVALID_P(l) && i < len-1) {
02670 long len2;
02671 int l2;
/* find the longest prefix that is not reported invalid, then zero
 * everything after the byte following it to skip the invalid range */
02672 for (len2 = len-1; 0 < len2; len2--) {
02673 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02674 if (!MBCLEN_INVALID_P(l2))
02675 break;
02676 }
02677 memset(p+len2+1, 0, len-(len2+1));
02678 }
02679 }
02680 }
02681
02682
02683
02684
02685
02686
02687
02688
02689
02690
02691 static enum neighbor_char
02692 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02693 {
02694 enum neighbor_char ret;
02695 unsigned int c;
02696 int ctype;
02697 int range;
02698 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02699
02700 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02701 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02702 ctype = ONIGENC_CTYPE_DIGIT;
02703 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02704 ctype = ONIGENC_CTYPE_ALPHA;
02705 else
02706 return NEIGHBOR_NOT_CHAR;
02707
02708 MEMCPY(save, p, char, len);
02709 ret = enc_succ_char(p, len, enc);
02710 if (ret == NEIGHBOR_FOUND) {
02711 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02712 if (rb_enc_isctype(c, ctype, enc))
02713 return NEIGHBOR_FOUND;
02714 }
02715 MEMCPY(p, save, char, len);
02716 range = 1;
02717 while (1) {
02718 MEMCPY(save, p, char, len);
02719 ret = enc_pred_char(p, len, enc);
02720 if (ret == NEIGHBOR_FOUND) {
02721 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02722 if (!rb_enc_isctype(c, ctype, enc)) {
02723 MEMCPY(p, save, char, len);
02724 break;
02725 }
02726 }
02727 else {
02728 MEMCPY(p, save, char, len);
02729 break;
02730 }
02731 range++;
02732 }
02733 if (range == 1) {
02734 return NEIGHBOR_NOT_CHAR;
02735 }
02736
02737 if (ctype != ONIGENC_CTYPE_DIGIT) {
02738 MEMCPY(carry, p, char, len);
02739 return NEIGHBOR_WRAPPED;
02740 }
02741
02742 MEMCPY(carry, p, char, len);
02743 enc_succ_char(carry, len, enc);
02744 return NEIGHBOR_WRAPPED;
02745 }
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773 VALUE
02774 rb_str_succ(VALUE orig)
02775 {
02776 rb_encoding *enc;
02777 VALUE str;
02778 char *sbeg, *s, *e, *last_alnum = 0;
02779 int c = -1;
02780 long l;
02781 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02782 long carry_pos = 0, carry_len = 1;
02783 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02784
02785 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02786 rb_enc_cr_str_copy_for_substr(str, orig);
02787 OBJ_INFECT(str, orig);
02788 if (RSTRING_LEN(str) == 0) return str;
02789
02790 enc = STR_ENC_GET(orig);
02791 sbeg = RSTRING_PTR(str);
02792 s = e = sbeg + RSTRING_LEN(str);
02793
02794 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02795 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02796 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02797 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02798 s = last_alnum;
02799 break;
02800 }
02801 }
02802 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02803 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02804 switch (neighbor) {
02805 case NEIGHBOR_NOT_CHAR:
02806 continue;
02807 case NEIGHBOR_FOUND:
02808 return str;
02809 case NEIGHBOR_WRAPPED:
02810 last_alnum = s;
02811 break;
02812 }
02813 c = 1;
02814 carry_pos = s - sbeg;
02815 carry_len = l;
02816 }
02817 if (c == -1) {
02818 s = e;
02819 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02820 enum neighbor_char neighbor;
02821 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02822 neighbor = enc_succ_char(s, l, enc);
02823 if (neighbor == NEIGHBOR_FOUND)
02824 return str;
02825 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02826
02827 enc_succ_char(s, l, enc);
02828 }
02829 if (!rb_enc_asciicompat(enc)) {
02830 MEMCPY(carry, s, char, l);
02831 carry_len = l;
02832 }
02833 carry_pos = s - sbeg;
02834 }
02835 }
02836 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02837 s = RSTRING_PTR(str) + carry_pos;
02838 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02839 memmove(s, carry, carry_len);
02840 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02841 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02842 rb_enc_str_coderange(str);
02843 return str;
02844 }
02845
02846
02847
02848
02849
02850
02851
02852
02853
02854
02855
02856 static VALUE
02857 rb_str_succ_bang(VALUE str)
02858 {
02859 rb_str_shared_replace(str, rb_str_succ(str));
02860
02861 return str;
02862 }
02863
02864
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897 static VALUE
02898 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02899 {
02900 VALUE end, exclusive;
02901 VALUE current, after_end;
02902 ID succ;
02903 int n, excl, ascii;
02904 rb_encoding *enc;
02905
02906 rb_scan_args(argc, argv, "11", &end, &exclusive);
02907 RETURN_ENUMERATOR(beg, argc, argv);
02908 excl = RTEST(exclusive);
02909 CONST_ID(succ, "succ");
02910 StringValue(end);
02911 enc = rb_enc_check(beg, end);
02912 ascii = (is_ascii_string(beg) && is_ascii_string(end));
02913
02914 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02915 char c = RSTRING_PTR(beg)[0];
02916 char e = RSTRING_PTR(end)[0];
02917
02918 if (c > e || (excl && c == e)) return beg;
02919 for (;;) {
02920 rb_yield(rb_enc_str_new(&c, 1, enc));
02921 if (!excl && c == e) break;
02922 c++;
02923 if (excl && c == e) break;
02924 }
02925 return beg;
02926 }
02927
02928 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02929 char *s, *send;
02930 VALUE b, e;
02931 int width;
02932
02933 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
02934 width = rb_long2int(send - s);
02935 while (s < send) {
02936 if (!ISDIGIT(*s)) goto no_digits;
02937 s++;
02938 }
02939 s = RSTRING_PTR(end); send = RSTRING_END(end);
02940 while (s < send) {
02941 if (!ISDIGIT(*s)) goto no_digits;
02942 s++;
02943 }
02944 b = rb_str_to_inum(beg, 10, FALSE);
02945 e = rb_str_to_inum(end, 10, FALSE);
02946 if (FIXNUM_P(b) && FIXNUM_P(e)) {
02947 long bi = FIX2LONG(b);
02948 long ei = FIX2LONG(e);
02949 rb_encoding *usascii = rb_usascii_encoding();
02950
02951 while (bi <= ei) {
02952 if (excl && bi == ei) break;
02953 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02954 bi++;
02955 }
02956 }
02957 else {
02958 ID op = excl ? '<' : rb_intern("<=");
02959 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02960
02961 args[0] = INT2FIX(width);
02962 while (rb_funcall(b, op, 1, e)) {
02963 args[1] = b;
02964 rb_yield(rb_str_format(numberof(args), args, fmt));
02965 b = rb_funcall(b, succ, 0, 0);
02966 }
02967 }
02968 return beg;
02969 }
02970
02971 no_digits:
02972 n = rb_str_cmp(beg, end);
02973 if (n > 0 || (excl && n == 0)) return beg;
02974
02975 after_end = rb_funcall(end, succ, 0, 0);
02976 current = rb_str_dup(beg);
02977 while (!rb_str_equal(current, after_end)) {
02978 VALUE next = Qnil;
02979 if (excl || !rb_str_equal(current, end))
02980 next = rb_funcall(current, succ, 0, 0);
02981 rb_yield(current);
02982 if (NIL_P(next)) break;
02983 current = next;
02984 StringValue(current);
02985 if (excl && rb_str_equal(current, end)) break;
02986 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02987 break;
02988 }
02989
02990 return beg;
02991 }
02992
02993 static VALUE
02994 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
02995 {
02996 if (rb_reg_search(re, str, 0, 0) >= 0) {
02997 VALUE match = rb_backref_get();
02998 int nth = rb_reg_backref_number(match, backref);
02999 return rb_reg_nth_match(nth, match);
03000 }
03001 return Qnil;
03002 }
03003
03004 static VALUE
03005 rb_str_aref(VALUE str, VALUE indx)
03006 {
03007 long idx;
03008
03009 switch (TYPE(indx)) {
03010 case T_FIXNUM:
03011 idx = FIX2LONG(indx);
03012
03013 num_index:
03014 str = rb_str_substr(str, idx, 1);
03015 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03016 return str;
03017
03018 case T_REGEXP:
03019 return rb_str_subpat(str, indx, INT2FIX(0));
03020
03021 case T_STRING:
03022 if (rb_str_index(str, indx, 0) != -1)
03023 return rb_str_dup(indx);
03024 return Qnil;
03025
03026 default:
03027
03028 {
03029 long beg, len;
03030 VALUE tmp;
03031
03032 len = str_strlen(str, STR_ENC_GET(str));
03033 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03034 case Qfalse:
03035 break;
03036 case Qnil:
03037 return Qnil;
03038 default:
03039 tmp = rb_str_substr(str, beg, len);
03040 return tmp;
03041 }
03042 }
03043 idx = NUM2LONG(indx);
03044 goto num_index;
03045 }
03046 return Qnil;
03047 }
03048
03049
03050
03051
03052
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071
03072
03073
03074
03075
03076
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098 static VALUE
03099 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03100 {
03101 if (argc == 2) {
03102 if (TYPE(argv[0]) == T_REGEXP) {
03103 return rb_str_subpat(str, argv[0], argv[1]);
03104 }
03105 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03106 }
03107 if (argc != 1) {
03108 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03109 }
03110 return rb_str_aref(str, argv[0]);
03111 }
03112
03113 VALUE
03114 rb_str_drop_bytes(VALUE str, long len)
03115 {
03116 char *ptr = RSTRING_PTR(str);
03117 long olen = RSTRING_LEN(str), nlen;
03118
03119 str_modifiable(str);
03120 if (len > olen) len = olen;
03121 nlen = olen - len;
03122 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03123 char *oldptr = ptr;
03124 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03125 STR_SET_EMBED(str);
03126 STR_SET_EMBED_LEN(str, nlen);
03127 ptr = RSTRING(str)->as.ary;
03128 memmove(ptr, oldptr + len, nlen);
03129 if (fl == STR_NOEMBED) xfree(oldptr);
03130 }
03131 else {
03132 if (!STR_SHARED_P(str)) rb_str_new4(str);
03133 ptr = RSTRING(str)->as.heap.ptr += len;
03134 RSTRING(str)->as.heap.len = nlen;
03135 }
03136 ptr[nlen] = 0;
03137 ENC_CODERANGE_CLEAR(str);
03138 return str;
03139 }
03140
03141 static void
03142 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03143 {
03144 if (beg == 0 && RSTRING_LEN(val) == 0) {
03145 rb_str_drop_bytes(str, len);
03146 OBJ_INFECT(str, val);
03147 return;
03148 }
03149
03150 rb_str_modify(str);
03151 if (len < RSTRING_LEN(val)) {
03152
03153 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03154 }
03155
03156 if (RSTRING_LEN(val) != len) {
03157 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03158 RSTRING_PTR(str) + beg + len,
03159 RSTRING_LEN(str) - (beg + len));
03160 }
03161 if (RSTRING_LEN(val) < beg && len < 0) {
03162 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03163 }
03164 if (RSTRING_LEN(val) > 0) {
03165 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03166 }
03167 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03168 if (RSTRING_PTR(str)) {
03169 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03170 }
03171 OBJ_INFECT(str, val);
03172 }
03173
03174 static void
03175 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03176 {
03177 long slen;
03178 char *p, *e;
03179 rb_encoding *enc;
03180 int singlebyte = single_byte_optimizable(str);
03181 int cr;
03182
03183 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03184
03185 StringValue(val);
03186 enc = rb_enc_check(str, val);
03187 slen = str_strlen(str, enc);
03188
03189 if (slen < beg) {
03190 out_of_range:
03191 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03192 }
03193 if (beg < 0) {
03194 if (-beg > slen) {
03195 goto out_of_range;
03196 }
03197 beg += slen;
03198 }
03199 if (slen < len || slen < beg + len) {
03200 len = slen - beg;
03201 }
03202 str_modify_keep_cr(str);
03203 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03204 if (!p) p = RSTRING_END(str);
03205 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03206 if (!e) e = RSTRING_END(str);
03207
03208 beg = p - RSTRING_PTR(str);
03209 len = e - p;
03210 rb_str_splice_0(str, beg, len, val);
03211 rb_enc_associate(str, enc);
03212 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03213 if (cr != ENC_CODERANGE_BROKEN)
03214 ENC_CODERANGE_SET(str, cr);
03215 }
03216
03217 void
03218 rb_str_update(VALUE str, long beg, long len, VALUE val)
03219 {
03220 rb_str_splice(str, beg, len, val);
03221 }
03222
03223 static void
03224 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03225 {
03226 int nth;
03227 VALUE match;
03228 long start, end, len;
03229 rb_encoding *enc;
03230 struct re_registers *regs;
03231
03232 if (rb_reg_search(re, str, 0, 0) < 0) {
03233 rb_raise(rb_eIndexError, "regexp not matched");
03234 }
03235 match = rb_backref_get();
03236 nth = rb_reg_backref_number(match, backref);
03237 regs = RMATCH_REGS(match);
03238 if (nth >= regs->num_regs) {
03239 out_of_range:
03240 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03241 }
03242 if (nth < 0) {
03243 if (-nth >= regs->num_regs) {
03244 goto out_of_range;
03245 }
03246 nth += regs->num_regs;
03247 }
03248
03249 start = BEG(nth);
03250 if (start == -1) {
03251 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03252 }
03253 end = END(nth);
03254 len = end - start;
03255 StringValue(val);
03256 enc = rb_enc_check(str, val);
03257 rb_str_splice_0(str, start, len, val);
03258 rb_enc_associate(str, enc);
03259 }
03260
03261 static VALUE
03262 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03263 {
03264 long idx, beg;
03265
03266 switch (TYPE(indx)) {
03267 case T_FIXNUM:
03268 idx = FIX2LONG(indx);
03269 num_index:
03270 rb_str_splice(str, idx, 1, val);
03271 return val;
03272
03273 case T_REGEXP:
03274 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03275 return val;
03276
03277 case T_STRING:
03278 beg = rb_str_index(str, indx, 0);
03279 if (beg < 0) {
03280 rb_raise(rb_eIndexError, "string not matched");
03281 }
03282 beg = rb_str_sublen(str, beg);
03283 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03284 return val;
03285
03286 default:
03287
03288 {
03289 long beg, len;
03290 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03291 rb_str_splice(str, beg, len, val);
03292 return val;
03293 }
03294 }
03295 idx = NUM2LONG(indx);
03296 goto num_index;
03297 }
03298 }
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325 static VALUE
03326 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03327 {
03328 if (argc == 3) {
03329 if (TYPE(argv[0]) == T_REGEXP) {
03330 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03331 }
03332 else {
03333 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03334 }
03335 return argv[2];
03336 }
03337 if (argc != 2) {
03338 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03339 }
03340 return rb_str_aset(str, argv[0], argv[1]);
03341 }
03342
03343
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359
03360 static VALUE
03361 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03362 {
03363 long pos = NUM2LONG(idx);
03364
03365 if (pos == -1) {
03366 return rb_str_append(str, str2);
03367 }
03368 else if (pos < 0) {
03369 pos++;
03370 }
03371 rb_str_splice(str, pos, 0, str2);
03372 return str;
03373 }
03374
03375
03376
03377
03378
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395 static VALUE
03396 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03397 {
03398 VALUE result;
03399 VALUE buf[3];
03400 int i;
03401
03402 if (argc < 1 || 2 < argc) {
03403 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03404 }
03405 for (i=0; i<argc; i++) {
03406 buf[i] = argv[i];
03407 }
03408 str_modify_keep_cr(str);
03409 buf[i] = rb_str_new(0,0);
03410 result = rb_str_aref_m(argc, buf, str);
03411 if (!NIL_P(result)) {
03412 rb_str_aset_m(argc+1, buf, str);
03413 }
03414 return result;
03415 }
03416
03417 static VALUE
03418 get_pat(VALUE pat, int quote)
03419 {
03420 VALUE val;
03421
03422 switch (TYPE(pat)) {
03423 case T_REGEXP:
03424 return pat;
03425
03426 case T_STRING:
03427 break;
03428
03429 default:
03430 val = rb_check_string_type(pat);
03431 if (NIL_P(val)) {
03432 Check_Type(pat, T_REGEXP);
03433 }
03434 pat = val;
03435 }
03436
03437 if (quote) {
03438 pat = rb_reg_quote(pat);
03439 }
03440
03441 return rb_reg_regcomp(pat);
03442 }
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455 static VALUE
03456 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03457 {
03458 VALUE pat, repl, hash = Qnil;
03459 int iter = 0;
03460 int tainted = 0;
03461 int untrusted = 0;
03462 long plen;
03463
03464 if (argc == 1 && rb_block_given_p()) {
03465 iter = 1;
03466 }
03467 else if (argc == 2) {
03468 repl = argv[1];
03469 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03470 if (NIL_P(hash)) {
03471 StringValue(repl);
03472 }
03473 if (OBJ_TAINTED(repl)) tainted = 1;
03474 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03475 }
03476 else {
03477 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03478 }
03479
03480 pat = get_pat(argv[0], 1);
03481 str_modifiable(str);
03482 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03483 rb_encoding *enc;
03484 int cr = ENC_CODERANGE(str);
03485 VALUE match = rb_backref_get();
03486 struct re_registers *regs = RMATCH_REGS(match);
03487 long beg0 = BEG(0);
03488 long end0 = END(0);
03489 char *p, *rp;
03490 long len, rlen;
03491
03492 if (iter || !NIL_P(hash)) {
03493 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03494
03495 if (iter) {
03496 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03497 }
03498 else {
03499 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03500 repl = rb_obj_as_string(repl);
03501 }
03502 str_mod_check(str, p, len);
03503 str_frozen_check(str);
03504 }
03505 else {
03506 repl = rb_reg_regsub(repl, str, regs, pat);
03507 }
03508 enc = rb_enc_compatible(str, repl);
03509 if (!enc) {
03510 rb_encoding *str_enc = STR_ENC_GET(str);
03511 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03512 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03513 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03514 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03515 rb_enc_name(str_enc),
03516 rb_enc_name(STR_ENC_GET(repl)));
03517 }
03518 enc = STR_ENC_GET(repl);
03519 }
03520 rb_str_modify(str);
03521 rb_enc_associate(str, enc);
03522 if (OBJ_TAINTED(repl)) tainted = 1;
03523 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03524 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03525 int cr2 = ENC_CODERANGE(repl);
03526 if (cr2 == ENC_CODERANGE_BROKEN ||
03527 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03528 cr = ENC_CODERANGE_UNKNOWN;
03529 else
03530 cr = cr2;
03531 }
03532 plen = end0 - beg0;
03533 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03534 len = RSTRING_LEN(str);
03535 if (rlen > plen) {
03536 RESIZE_CAPA(str, len + rlen - plen);
03537 }
03538 p = RSTRING_PTR(str);
03539 if (rlen != plen) {
03540 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03541 }
03542 memcpy(p + beg0, rp, rlen);
03543 len += rlen - plen;
03544 STR_SET_LEN(str, len);
03545 RSTRING_PTR(str)[len] = '\0';
03546 ENC_CODERANGE_SET(str, cr);
03547 if (tainted) OBJ_TAINT(str);
03548 if (untrusted) OBJ_UNTRUST(str);
03549
03550 return str;
03551 }
03552 return Qnil;
03553 }
03554
03555
03556
03557
03558
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596 static VALUE
03597 rb_str_sub(int argc, VALUE *argv, VALUE str)
03598 {
03599 str = rb_str_dup(str);
03600 rb_str_sub_bang(argc, argv, str);
03601 return str;
03602 }
03603
03604 static VALUE
03605 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03606 {
03607 VALUE pat, val, repl, match, dest, hash = Qnil;
03608 struct re_registers *regs;
03609 long beg, n;
03610 long beg0, end0;
03611 long offset, blen, slen, len, last;
03612 int iter = 0;
03613 char *sp, *cp;
03614 int tainted = 0;
03615 rb_encoding *str_enc;
03616
03617 switch (argc) {
03618 case 1:
03619 RETURN_ENUMERATOR(str, argc, argv);
03620 iter = 1;
03621 break;
03622 case 2:
03623 repl = argv[1];
03624 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03625 if (NIL_P(hash)) {
03626 StringValue(repl);
03627 }
03628 if (OBJ_TAINTED(repl)) tainted = 1;
03629 break;
03630 default:
03631 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03632 }
03633
03634 pat = get_pat(argv[0], 1);
03635 beg = rb_reg_search(pat, str, 0, 0);
03636 if (beg < 0) {
03637 if (bang) return Qnil;
03638 return rb_str_dup(str);
03639 }
03640
03641 offset = 0;
03642 n = 0;
03643 blen = RSTRING_LEN(str) + 30;
03644 dest = rb_str_buf_new(blen);
03645 sp = RSTRING_PTR(str);
03646 slen = RSTRING_LEN(str);
03647 cp = sp;
03648 str_enc = STR_ENC_GET(str);
03649
03650 do {
03651 n++;
03652 match = rb_backref_get();
03653 regs = RMATCH_REGS(match);
03654 beg0 = BEG(0);
03655 end0 = END(0);
03656 if (iter || !NIL_P(hash)) {
03657 if (iter) {
03658 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03659 }
03660 else {
03661 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03662 val = rb_obj_as_string(val);
03663 }
03664 str_mod_check(str, sp, slen);
03665 if (val == dest) {
03666 rb_raise(rb_eRuntimeError, "block should not cheat");
03667 }
03668 }
03669 else {
03670 val = rb_reg_regsub(repl, str, regs, pat);
03671 }
03672
03673 if (OBJ_TAINTED(val)) tainted = 1;
03674
03675 len = beg - offset;
03676 if (len) {
03677 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03678 }
03679
03680 rb_str_buf_append(dest, val);
03681
03682 last = offset;
03683 offset = end0;
03684 if (beg0 == end0) {
03685
03686
03687
03688
03689 if (RSTRING_LEN(str) <= end0) break;
03690 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03691 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03692 offset = end0 + len;
03693 }
03694 cp = RSTRING_PTR(str) + offset;
03695 if (offset > RSTRING_LEN(str)) break;
03696 beg = rb_reg_search(pat, str, offset, 0);
03697 } while (beg >= 0);
03698 if (RSTRING_LEN(str) > offset) {
03699 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03700 }
03701 rb_reg_search(pat, str, last, 0);
03702 if (bang) {
03703 rb_str_shared_replace(str, dest);
03704 }
03705 else {
03706 RBASIC(dest)->klass = rb_obj_class(str);
03707 OBJ_INFECT(dest, str);
03708 str = dest;
03709 }
03710
03711 if (tainted) OBJ_TAINT(str);
03712 return str;
03713 }
03714
03715
03716
03717
03718
03719
03720
03721
03722
03723
03724
03725
03726
03727 static VALUE
03728 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03729 {
03730 str_modify_keep_cr(str);
03731 return str_gsub(argc, argv, str, 1);
03732 }
03733
03734
03735
03736
03737
03738
03739
03740
03741
03742
03743
03744
03745
03746
03747
03748
03749
03750
03751
03752
03753
03754
03755
03756
03757
03758
03759
03760
03761
03762
03763
03764
03765
03766
03767
03768
03769
03770
03771
03772
03773
03774
03775
03776
03777
03778 static VALUE
03779 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03780 {
03781 return str_gsub(argc, argv, str, 0);
03782 }
03783
03784
03785
03786
03787
03788
03789
03790
03791
03792
03793
03794
03795
03796 VALUE
03797 rb_str_replace(VALUE str, VALUE str2)
03798 {
03799 str_modifiable(str);
03800 if (str == str2) return str;
03801
03802 StringValue(str2);
03803 str_discard(str);
03804 return str_replace(str, str2);
03805 }
03806
03807
03808
03809
03810
03811
03812
03813
03814
03815
03816
03817 static VALUE
03818 rb_str_clear(VALUE str)
03819 {
03820 str_discard(str);
03821 STR_SET_EMBED(str);
03822 STR_SET_EMBED_LEN(str, 0);
03823 RSTRING_PTR(str)[0] = 0;
03824 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03825 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03826 else
03827 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03828 return str;
03829 }
03830
03831
03832
03833
03834
03835
03836
03837
03838
03839
03840
03841 static VALUE
03842 rb_str_chr(VALUE str)
03843 {
03844 return rb_str_substr(str, 0, 1);
03845 }
03846
03847
03848
03849
03850
03851
03852
03853 static VALUE
03854 rb_str_getbyte(VALUE str, VALUE index)
03855 {
03856 long pos = NUM2LONG(index);
03857
03858 if (pos < 0)
03859 pos += RSTRING_LEN(str);
03860 if (pos < 0 || RSTRING_LEN(str) <= pos)
03861 return Qnil;
03862
03863 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03864 }
03865
03866
03867
03868
03869
03870
03871
03872 static VALUE
03873 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03874 {
03875 long pos = NUM2LONG(index);
03876 int byte = NUM2INT(value);
03877
03878 rb_str_modify(str);
03879
03880 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03881 rb_raise(rb_eIndexError, "index %ld out of string", pos);
03882 if (pos < 0)
03883 pos += RSTRING_LEN(str);
03884
03885 RSTRING_PTR(str)[pos] = byte;
03886
03887 return value;
03888 }
03889
03890
03891
03892
03893
03894
03895
03896
03897
03898
03899 static VALUE
03900 rb_str_reverse(VALUE str)
03901 {
03902 rb_encoding *enc;
03903 VALUE rev;
03904 char *s, *e, *p;
03905 int single = 1;
03906
03907 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03908 enc = STR_ENC_GET(str);
03909 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03910 s = RSTRING_PTR(str); e = RSTRING_END(str);
03911 p = RSTRING_END(rev);
03912
03913 if (RSTRING_LEN(str) > 1) {
03914 if (single_byte_optimizable(str)) {
03915 while (s < e) {
03916 *--p = *s++;
03917 }
03918 }
03919 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03920 while (s < e) {
03921 int clen = rb_enc_fast_mbclen(s, e, enc);
03922
03923 if (clen > 1 || (*s & 0x80)) single = 0;
03924 p -= clen;
03925 memcpy(p, s, clen);
03926 s += clen;
03927 }
03928 }
03929 else {
03930 while (s < e) {
03931 int clen = rb_enc_mbclen(s, e, enc);
03932
03933 if (clen > 1 || (*s & 0x80)) single = 0;
03934 p -= clen;
03935 memcpy(p, s, clen);
03936 s += clen;
03937 }
03938 }
03939 }
03940 STR_SET_LEN(rev, RSTRING_LEN(str));
03941 OBJ_INFECT(rev, str);
03942 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03943 if (single) {
03944 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03945 }
03946 else {
03947 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03948 }
03949 }
03950 rb_enc_cr_str_copy_for_substr(rev, str);
03951
03952 return rev;
03953 }
03954
03955
03956
03957
03958
03959
03960
03961
03962
03963 static VALUE
03964 rb_str_reverse_bang(VALUE str)
03965 {
03966 if (RSTRING_LEN(str) > 1) {
03967 if (single_byte_optimizable(str)) {
03968 char *s, *e, c;
03969
03970 str_modify_keep_cr(str);
03971 s = RSTRING_PTR(str);
03972 e = RSTRING_END(str) - 1;
03973 while (s < e) {
03974 c = *s;
03975 *s++ = *e;
03976 *e-- = c;
03977 }
03978 }
03979 else {
03980 rb_str_shared_replace(str, rb_str_reverse(str));
03981 }
03982 }
03983 else {
03984 str_modify_keep_cr(str);
03985 }
03986 return str;
03987 }
03988
03989
03990
03991
03992
03993
03994
03995
03996
03997
03998
03999
04000
04001
04002 static VALUE
04003 rb_str_include(VALUE str, VALUE arg)
04004 {
04005 long i;
04006
04007 StringValue(arg);
04008 i = rb_str_index(str, arg, 0);
04009
04010 if (i == -1) return Qfalse;
04011 return Qtrue;
04012 }
04013
04014
04015
04016
04017
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030
04031
04032
04033
04034
04035
04036 static VALUE
04037 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04038 {
04039 int base;
04040
04041 if (argc == 0) base = 10;
04042 else {
04043 VALUE b;
04044
04045 rb_scan_args(argc, argv, "01", &b);
04046 base = NUM2INT(b);
04047 }
04048 if (base < 0) {
04049 rb_raise(rb_eArgError, "invalid radix %d", base);
04050 }
04051 return rb_str_to_inum(str, base, FALSE);
04052 }
04053
04054
04055
04056
04057
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069 static VALUE
04070 rb_str_to_f(VALUE str)
04071 {
04072 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04073 }
04074
04075
04076
04077
04078
04079
04080
04081
04082
04083
04084 static VALUE
04085 rb_str_to_s(VALUE str)
04086 {
04087 if (rb_obj_class(str) != rb_cString) {
04088 return str_duplicate(rb_cString, str);
04089 }
04090 return str;
04091 }
04092
#if 0
/*
 * Currently compiled out.  Encodes code point c in encoding enc and
 * appends the resulting bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04104
04105 #define CHAR_ESC_LEN 13
04106
04107 int
04108 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04109 {
04110 char buf[CHAR_ESC_LEN + 1];
04111 int l;
04112
04113 #if SIZEOF_INT > 4
04114 c &= 0xffffffff;
04115 #endif
04116 if (unicode_p) {
04117 if (c < 0x7F && ISPRINT(c)) {
04118 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04119 }
04120 else if (c < 0x10000) {
04121 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04122 }
04123 else {
04124 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04125 }
04126 }
04127 else {
04128 if (c < 0x100) {
04129 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04130 }
04131 else {
04132 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04133 }
04134 }
04135 l = (int)strlen(buf);
04136 rb_str_buf_cat(result, buf, l);
04137 return l;
04138 }
04139
04140
04141
04142
04143
04144
04145
04146
04147
04148
04149
04150
04151
04152 VALUE
04153 rb_str_inspect(VALUE str)
04154 {
04155 rb_encoding *enc = STR_ENC_GET(str);
04156 const char *p, *pend, *prev;
04157 char buf[CHAR_ESC_LEN + 1];
04158 VALUE result = rb_str_buf_new(0);
04159 rb_encoding *resenc = rb_default_internal_encoding();
04160 int unicode_p = rb_enc_unicode_p(enc);
04161 int asciicompat = rb_enc_asciicompat(enc);
04162
04163 if (resenc == NULL) resenc = rb_default_external_encoding();
04164 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04165 rb_enc_associate(result, resenc);
04166 str_buf_cat2(result, "\"");
04167
04168 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04169 prev = p;
04170 while (p < pend) {
04171 unsigned int c, cc;
04172 int n;
04173
04174 n = rb_enc_precise_mbclen(p, pend, enc);
04175 if (!MBCLEN_CHARFOUND_P(n)) {
04176 if (p > prev) str_buf_cat(result, prev, p - prev);
04177 n = rb_enc_mbminlen(enc);
04178 if (pend < p + n)
04179 n = (int)(pend - p);
04180 while (n--) {
04181 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04182 str_buf_cat(result, buf, strlen(buf));
04183 prev = ++p;
04184 }
04185 continue;
04186 }
04187 n = MBCLEN_CHARFOUND_LEN(n);
04188 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04189 p += n;
04190 if (c == '"'|| c == '\\' ||
04191 (c == '#' &&
04192 p < pend &&
04193 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04194 (cc = rb_enc_codepoint(p,pend,enc),
04195 (cc == '$' || cc == '@' || cc == '{')))) {
04196 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04197 str_buf_cat2(result, "\\");
04198 if (asciicompat || enc == resenc) {
04199 prev = p - n;
04200 continue;
04201 }
04202 }
04203 switch (c) {
04204 case '\n': cc = 'n'; break;
04205 case '\r': cc = 'r'; break;
04206 case '\t': cc = 't'; break;
04207 case '\f': cc = 'f'; break;
04208 case '\013': cc = 'v'; break;
04209 case '\010': cc = 'b'; break;
04210 case '\007': cc = 'a'; break;
04211 case 033: cc = 'e'; break;
04212 default: cc = 0; break;
04213 }
04214 if (cc) {
04215 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04216 buf[0] = '\\';
04217 buf[1] = (char)cc;
04218 str_buf_cat(result, buf, 2);
04219 prev = p;
04220 continue;
04221 }
04222 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04223 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04224 continue;
04225 }
04226 else {
04227 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04228 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04229 prev = p;
04230 continue;
04231 }
04232 }
04233 if (p > prev) str_buf_cat(result, prev, p - prev);
04234 str_buf_cat2(result, "\"");
04235
04236 OBJ_INFECT(result, str);
04237 return result;
04238 }
04239
/*
 * True when p lies before e and the byte at p is '$', '@' or '{'.
 * rb_str_dump applies it to the byte following a '#' to decide whether
 * that '#' needs a backslash.  Both p and e are evaluated more than once,
 * so pass side-effect-free expressions.
 */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04241
04242
04243
04244
04245
04246
04247
04248
04249
/*
 * Returns a new string holding a double-quoted, backslash-escaped rendering
 * of str.  Non-printable bytes become \xNN; when str is UTF-8, characters
 * longer than one byte become \u{...}.  For encodings that are not
 * ASCII-compatible a trailing .force_encoding("NAME") is appended and the
 * result is tagged ASCII-8BIT.  The result is marked 7bit and inherits
 * taint from str.
 *
 * The function works in two passes: the first computes the exact output
 * length, the second fills a buffer of that length.  The two passes must
 * therefore classify every byte identically.
 */
VALUE
rb_str_dump(VALUE str)
{
    rb_encoding *enc = rb_enc_get(str);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (enc == rb_utf8_encoding());

    /* Pass 1: size the output.  Start with the two surrounding quotes. */
    len = 2;
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        unsigned char c = *p++;
        switch (c) {
          case '"':  case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            /* backslash plus one letter */
            len += 2;
            break;

          case '#':
            /* '#' is escaped only when followed by '$', '@' or '{' */
            len += IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                len++;
            }
            else {
                if (u8) {
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    /*
                     * NOTE(review): n-1 (not n) is tested here, matching the
                     * second pass; presumably so that only characters longer
                     * than one byte take the \u{} form -- confirm against
                     * the definition of MBCLEN_CHARFOUND_P.
                     */
                    if (MBCLEN_CHARFOUND_P(n-1)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        /* one per hex digit beyond the first ... */
                        while (cc >>= 4) len++;
                        /* ... plus "\u{", the first digit and "}" */
                        len += 5;
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                /* \xNN */
                len += 4;
            }
            break;
        }
    }
    if (!rb_enc_asciicompat(enc)) {
        /* 19 == strlen(".force_encoding(\"") + strlen("\")") */
        len += 19;
        len += strlen(enc->name);
    }

    /* Pass 2: emit into a buffer of exactly len bytes. */
    result = rb_str_new5(str, 0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    /* qend bounds the snprintf calls below; the +1 presumably covers the
     * terminator slot of the new string -- TODO confirm */
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /* n here is the character length minus one, i.e. the number
                 * of continuation bytes still to skip */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    snprintf(q, qend-q, "u{%x}", cc);
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            /* 'x' plus two hex digits */
            q += 3;
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
        enc = rb_ascii8bit_encoding();
    }
    OBJ_INFECT(result, str);
    /* result from dump is ASCII */
    rb_enc_associate(result, enc);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
04380
04381
04382 static void
04383 rb_str_check_dummy_enc(rb_encoding *enc)
04384 {
04385 if (rb_enc_dummy_p(enc)) {
04386 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04387 rb_enc_name(enc));
04388 }
04389 }
04390
04391
04392
04393
04394
04395
04396
04397
04398
04399
04400 static VALUE
04401 rb_str_upcase_bang(VALUE str)
04402 {
04403 rb_encoding *enc;
04404 char *s, *send;
04405 int modify = 0;
04406 int n;
04407
04408 str_modify_keep_cr(str);
04409 enc = STR_ENC_GET(str);
04410 rb_str_check_dummy_enc(enc);
04411 s = RSTRING_PTR(str); send = RSTRING_END(str);
04412 if (single_byte_optimizable(str)) {
04413 while (s < send) {
04414 unsigned int c = *(unsigned char*)s;
04415
04416 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04417 *s = 'A' + (c - 'a');
04418 modify = 1;
04419 }
04420 s++;
04421 }
04422 }
04423 else {
04424 int ascompat = rb_enc_asciicompat(enc);
04425
04426 while (s < send) {
04427 unsigned int c;
04428
04429 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04430 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04431 *s = 'A' + (c - 'a');
04432 modify = 1;
04433 }
04434 s++;
04435 }
04436 else {
04437 c = rb_enc_codepoint_len(s, send, &n, enc);
04438 if (rb_enc_islower(c, enc)) {
04439
04440 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04441 modify = 1;
04442 }
04443 s += n;
04444 }
04445 }
04446 }
04447
04448 if (modify) return str;
04449 return Qnil;
04450 }
04451
04452
04453
04454
04455
04456
04457
04458
04459
04460
04461
04462
04463
04464
04465 static VALUE
04466 rb_str_upcase(VALUE str)
04467 {
04468 str = rb_str_dup(str);
04469 rb_str_upcase_bang(str);
04470 return str;
04471 }
04472
04473
04474
04475
04476
04477
04478
04479
04480
04481
04482
04483 static VALUE
04484 rb_str_downcase_bang(VALUE str)
04485 {
04486 rb_encoding *enc;
04487 char *s, *send;
04488 int modify = 0;
04489
04490 str_modify_keep_cr(str);
04491 enc = STR_ENC_GET(str);
04492 rb_str_check_dummy_enc(enc);
04493 s = RSTRING_PTR(str); send = RSTRING_END(str);
04494 if (single_byte_optimizable(str)) {
04495 while (s < send) {
04496 unsigned int c = *(unsigned char*)s;
04497
04498 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04499 *s = 'a' + (c - 'A');
04500 modify = 1;
04501 }
04502 s++;
04503 }
04504 }
04505 else {
04506 int ascompat = rb_enc_asciicompat(enc);
04507
04508 while (s < send) {
04509 unsigned int c;
04510 int n;
04511
04512 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04513 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04514 *s = 'a' + (c - 'A');
04515 modify = 1;
04516 }
04517 s++;
04518 }
04519 else {
04520 c = rb_enc_codepoint_len(s, send, &n, enc);
04521 if (rb_enc_isupper(c, enc)) {
04522
04523 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04524 modify = 1;
04525 }
04526 s += n;
04527 }
04528 }
04529 }
04530
04531 if (modify) return str;
04532 return Qnil;
04533 }
04534
04535
04536
04537
04538
04539
04540
04541
04542
04543
04544
04545
04546
04547
04548 static VALUE
04549 rb_str_downcase(VALUE str)
04550 {
04551 str = rb_str_dup(str);
04552 rb_str_downcase_bang(str);
04553 return str;
04554 }
04555
04556
04557
04558
04559
04560
04561
04562
04563
04564
04565
04566
04567
04568
04569
04570
04571 static VALUE
04572 rb_str_capitalize_bang(VALUE str)
04573 {
04574 rb_encoding *enc;
04575 char *s, *send;
04576 int modify = 0;
04577 unsigned int c;
04578 int n;
04579
04580 str_modify_keep_cr(str);
04581 enc = STR_ENC_GET(str);
04582 rb_str_check_dummy_enc(enc);
04583 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04584 s = RSTRING_PTR(str); send = RSTRING_END(str);
04585
04586 c = rb_enc_codepoint_len(s, send, &n, enc);
04587 if (rb_enc_islower(c, enc)) {
04588 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04589 modify = 1;
04590 }
04591 s += n;
04592 while (s < send) {
04593 c = rb_enc_codepoint_len(s, send, &n, enc);
04594 if (rb_enc_isupper(c, enc)) {
04595 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04596 modify = 1;
04597 }
04598 s += n;
04599 }
04600
04601 if (modify) return str;
04602 return Qnil;
04603 }
04604
04605
04606
04607
04608
04609
04610
04611
04612
04613
04614
04615
04616
04617
04618
04619 static VALUE
04620 rb_str_capitalize(VALUE str)
04621 {
04622 str = rb_str_dup(str);
04623 rb_str_capitalize_bang(str);
04624 return str;
04625 }
04626
04627
04628
04629
04630
04631
04632
04633
04634
04635
04636
04637 static VALUE
04638 rb_str_swapcase_bang(VALUE str)
04639 {
04640 rb_encoding *enc;
04641 char *s, *send;
04642 int modify = 0;
04643 int n;
04644
04645 str_modify_keep_cr(str);
04646 enc = STR_ENC_GET(str);
04647 rb_str_check_dummy_enc(enc);
04648 s = RSTRING_PTR(str); send = RSTRING_END(str);
04649 while (s < send) {
04650 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04651
04652 if (rb_enc_isupper(c, enc)) {
04653
04654 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04655 modify = 1;
04656 }
04657 else if (rb_enc_islower(c, enc)) {
04658
04659 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04660 modify = 1;
04661 }
04662 s += n;
04663 }
04664
04665 if (modify) return str;
04666 return Qnil;
04667 }
04668
04669
04670
04671
04672
04673
04674
04675
04676
04677
04678
04679
04680
04681
04682 static VALUE
04683 rb_str_swapcase(VALUE str)
04684 {
04685 str = rb_str_dup(str);
04686 rb_str_swapcase_bang(str);
04687 return str;
04688 }
04689
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (e.g. "a-z"), advanced
 * one codepoint at a time by trnext().
 */
struct tr {
    int gen;                /* non-zero while trnext() is still expanding a range */
    unsigned int now, max;  /* codepoint last produced; last codepoint of the current range */
    char *p, *pend;         /* read position in, and end of, the specification bytes */
};
04697
04698 static unsigned int
04699 trnext(struct tr *t, rb_encoding *enc)
04700 {
04701 int n;
04702
04703 for (;;) {
04704 if (!t->gen) {
04705 if (t->p == t->pend) return -1;
04706 if (t->p < t->pend - 1 && *t->p == '\\') {
04707 t->p++;
04708 }
04709 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04710 t->p += n;
04711 if (t->p < t->pend - 1 && *t->p == '-') {
04712 t->p++;
04713 if (t->p < t->pend) {
04714 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04715 t->p += n;
04716 if (t->now > c) {
04717 if (t->now < 0x80 && c < 0x80) {
04718 rb_raise(rb_eArgError,
04719 "invalid range \"%c-%c\" in string transliteration",
04720 t->now, c);
04721 }
04722 else {
04723 rb_raise(rb_eArgError, "invalid range in string transliteration");
04724 }
04725 continue;
04726 }
04727 t->gen = 1;
04728 t->max = c;
04729 }
04730 }
04731 return t->now;
04732 }
04733 else if (++t->now < t->max) {
04734 return t->now;
04735 }
04736 else {
04737 t->gen = 0;
04738 return t->max;
04739 }
04740 }
04741 }
04742
04743 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04744
04745 static VALUE
04746 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04747 {
04748 const unsigned int errc = -1;
04749 unsigned int trans[256];
04750 rb_encoding *enc, *e1, *e2;
04751 struct tr trsrc, trrepl;
04752 int cflag = 0;
04753 unsigned int c, c0;
04754 int last = 0, modify = 0, i, l;
04755 char *s, *send;
04756 VALUE hash = 0;
04757 int singlebyte = single_byte_optimizable(str);
04758 int cr;
04759
04760 #define CHECK_IF_ASCII(c) \
04761 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04762 (cr = ENC_CODERANGE_VALID) : 0)
04763
04764 StringValue(src);
04765 StringValue(repl);
04766 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04767 if (RSTRING_LEN(repl) == 0) {
04768 return rb_str_delete_bang(1, &src, str);
04769 }
04770
04771 cr = ENC_CODERANGE(str);
04772 e1 = rb_enc_check(str, src);
04773 e2 = rb_enc_check(str, repl);
04774 if (e1 == e2) {
04775 enc = e1;
04776 }
04777 else {
04778 enc = rb_enc_check(src, repl);
04779 }
04780 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04781 if (RSTRING_LEN(src) > 1 &&
04782 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04783 trsrc.p + l < trsrc.pend) {
04784 cflag = 1;
04785 trsrc.p += l;
04786 }
04787 trrepl.p = RSTRING_PTR(repl);
04788 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04789 trsrc.gen = trrepl.gen = 0;
04790 trsrc.now = trrepl.now = 0;
04791 trsrc.max = trrepl.max = 0;
04792
04793 if (cflag) {
04794 for (i=0; i<256; i++) {
04795 trans[i] = 1;
04796 }
04797 while ((c = trnext(&trsrc, enc)) != errc) {
04798 if (c < 256) {
04799 trans[c] = errc;
04800 }
04801 else {
04802 if (!hash) hash = rb_hash_new();
04803 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04804 }
04805 }
04806 while ((c = trnext(&trrepl, enc)) != errc)
04807 ;
04808 last = trrepl.now;
04809 for (i=0; i<256; i++) {
04810 if (trans[i] != errc) {
04811 trans[i] = last;
04812 }
04813 }
04814 }
04815 else {
04816 unsigned int r;
04817
04818 for (i=0; i<256; i++) {
04819 trans[i] = errc;
04820 }
04821 while ((c = trnext(&trsrc, enc)) != errc) {
04822 r = trnext(&trrepl, enc);
04823 if (r == errc) r = trrepl.now;
04824 if (c < 256) {
04825 trans[c] = r;
04826 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04827 }
04828 else {
04829 if (!hash) hash = rb_hash_new();
04830 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04831 }
04832 }
04833 }
04834
04835 if (cr == ENC_CODERANGE_VALID)
04836 cr = ENC_CODERANGE_7BIT;
04837 str_modify_keep_cr(str);
04838 s = RSTRING_PTR(str); send = RSTRING_END(str);
04839 if (sflag) {
04840 int clen, tlen;
04841 long offset, max = RSTRING_LEN(str);
04842 unsigned int save = -1;
04843 char *buf = ALLOC_N(char, max), *t = buf;
04844
04845 while (s < send) {
04846 int may_modify = 0;
04847
04848 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04849 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04850
04851 s += clen;
04852 if (c < 256) {
04853 c = trans[c];
04854 }
04855 else if (hash) {
04856 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04857 if (NIL_P(tmp)) {
04858 if (cflag) c = last;
04859 else c = errc;
04860 }
04861 else if (cflag) c = errc;
04862 else c = NUM2INT(tmp);
04863 }
04864 else {
04865 c = errc;
04866 }
04867 if (c != (unsigned int)-1) {
04868 if (save == c) {
04869 CHECK_IF_ASCII(c);
04870 continue;
04871 }
04872 save = c;
04873 tlen = rb_enc_codelen(c, enc);
04874 modify = 1;
04875 }
04876 else {
04877 save = -1;
04878 c = c0;
04879 if (enc != e1) may_modify = 1;
04880 }
04881 while (t - buf + tlen >= max) {
04882 offset = t - buf;
04883 max *= 2;
04884 REALLOC_N(buf, char, max);
04885 t = buf + offset;
04886 }
04887 rb_enc_mbcput(c, t, enc);
04888 if (may_modify && memcmp(s, t, tlen) != 0) {
04889 modify = 1;
04890 }
04891 CHECK_IF_ASCII(c);
04892 t += tlen;
04893 }
04894 *t = '\0';
04895 RSTRING(str)->as.heap.ptr = buf;
04896 RSTRING(str)->as.heap.len = t - buf;
04897 STR_SET_NOEMBED(str);
04898 RSTRING(str)->as.heap.aux.capa = max;
04899 }
04900 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04901 while (s < send) {
04902 c = (unsigned char)*s;
04903 if (trans[c] != errc) {
04904 if (!cflag) {
04905 c = trans[c];
04906 *s = c;
04907 modify = 1;
04908 }
04909 else {
04910 *s = last;
04911 modify = 1;
04912 }
04913 }
04914 CHECK_IF_ASCII(c);
04915 s++;
04916 }
04917 }
04918 else {
04919 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04920 long offset;
04921 char *buf = ALLOC_N(char, max), *t = buf;
04922
04923 while (s < send) {
04924 int may_modify = 0;
04925 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04926 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04927
04928 if (c < 256) {
04929 c = trans[c];
04930 }
04931 else if (hash) {
04932 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04933 if (NIL_P(tmp)) {
04934 if (cflag) c = last;
04935 else c = errc;
04936 }
04937 else if (cflag) c = errc;
04938 else c = NUM2INT(tmp);
04939 }
04940 else {
04941 c = errc;
04942 }
04943 if (c != errc) {
04944 tlen = rb_enc_codelen(c, enc);
04945 modify = 1;
04946 }
04947 else {
04948 c = c0;
04949 if (enc != e1) may_modify = 1;
04950 }
04951 while (t - buf + tlen >= max) {
04952 offset = t - buf;
04953 max *= 2;
04954 REALLOC_N(buf, char, max);
04955 t = buf + offset;
04956 }
04957 if (s != t) {
04958 rb_enc_mbcput(c, t, enc);
04959 if (may_modify && memcmp(s, t, tlen) != 0) {
04960 modify = 1;
04961 }
04962 }
04963 CHECK_IF_ASCII(c);
04964 s += clen;
04965 t += tlen;
04966 }
04967 if (!STR_EMBED_P(str)) {
04968 xfree(RSTRING(str)->as.heap.ptr);
04969 }
04970 *t = '\0';
04971 RSTRING(str)->as.heap.ptr = buf;
04972 RSTRING(str)->as.heap.len = t - buf;
04973 STR_SET_NOEMBED(str);
04974 RSTRING(str)->as.heap.aux.capa = max;
04975 }
04976
04977 if (modify) {
04978 if (cr != ENC_CODERANGE_BROKEN)
04979 ENC_CODERANGE_SET(str, cr);
04980 rb_enc_associate(str, enc);
04981 return str;
04982 }
04983 return Qnil;
04984 }
04985
04986
04987
04988
04989
04990
04991
04992
04993
04994
04995
04996 static VALUE
04997 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
04998 {
04999 return tr_trans(str, src, repl, 0);
05000 }
05001
05002
05003
05004
05005
05006
05007
05008
05009
05010
05011
05012
05013
05014
05015
05016
05017
05018
05019
05020 static VALUE
05021 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05022 {
05023 str = rb_str_dup(str);
05024 tr_trans(str, src, repl, 0);
05025 return str;
05026 }
05027
05028 static void
05029 tr_setup_table(VALUE str, char stable[256], int first,
05030 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05031 {
05032 const unsigned int errc = -1;
05033 char buf[256];
05034 struct tr tr;
05035 unsigned int c;
05036 VALUE table = 0, ptable = 0;
05037 int i, l, cflag = 0;
05038
05039 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05040 tr.gen = tr.now = tr.max = 0;
05041
05042 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05043 cflag = 1;
05044 tr.p += l;
05045
05046 table = rb_hash_new();
05047 ptable = *ctablep;
05048 *ctablep = table;
05049 }
05050 else {
05051 table = rb_hash_new();
05052 ptable = *tablep;
05053 *tablep = table;
05054 }
05055 if (first) {
05056 for (i=0; i<256; i++) {
05057 stable[i] = 1;
05058 }
05059 }
05060 for (i=0; i<256; i++) {
05061 buf[i] = cflag;
05062 }
05063
05064 while ((c = trnext(&tr, enc)) != errc) {
05065 if (c < 256) {
05066 buf[c & 0xff] = !cflag;
05067 }
05068 else {
05069 VALUE key = UINT2NUM(c);
05070
05071 if (!table) {
05072 table = rb_hash_new();
05073 ptable = *tablep;
05074 *tablep = table;
05075 }
05076 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05077 rb_hash_aset(table, key, Qtrue);
05078 }
05079 }
05080 }
05081 for (i=0; i<256; i++) {
05082 stable[i] = stable[i] && buf[i];
05083 }
05084 }
05085
05086
05087 static int
05088 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05089 {
05090 if (c < 256) {
05091 return table[c] != 0;
05092 }
05093 else {
05094 VALUE v = UINT2NUM(c);
05095
05096 if (del) {
05097 if (!NIL_P(rb_hash_lookup(del, v)) &&
05098 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05099 return TRUE;
05100 }
05101 }
05102 else if (nodel && NIL_P(rb_hash_lookup(nodel, v))) {
05103 return TRUE;
05104 }
05105 return FALSE;
05106 }
05107 }
05108
05109
05110
05111
05112
05113
05114
05115
05116
05117 static VALUE
05118 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05119 {
05120 char squeez[256];
05121 rb_encoding *enc = 0;
05122 char *s, *send, *t;
05123 VALUE del = 0, nodel = 0;
05124 int modify = 0;
05125 int i, ascompat, cr;
05126
05127 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05128 if (argc < 1) {
05129 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05130 }
05131 for (i=0; i<argc; i++) {
05132 VALUE s = argv[i];
05133
05134 StringValue(s);
05135 enc = rb_enc_check(str, s);
05136 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05137 }
05138
05139 str_modify_keep_cr(str);
05140 ascompat = rb_enc_asciicompat(enc);
05141 s = t = RSTRING_PTR(str);
05142 send = RSTRING_END(str);
05143 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05144 while (s < send) {
05145 unsigned int c;
05146 int clen;
05147
05148 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05149 if (squeez[c]) {
05150 modify = 1;
05151 }
05152 else {
05153 if (t != s) *t = c;
05154 t++;
05155 }
05156 s++;
05157 }
05158 else {
05159 c = rb_enc_codepoint_len(s, send, &clen, enc);
05160
05161 if (tr_find(c, squeez, del, nodel)) {
05162 modify = 1;
05163 }
05164 else {
05165 if (t != s) rb_enc_mbcput(c, t, enc);
05166 t += clen;
05167 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05168 }
05169 s += clen;
05170 }
05171 }
05172 *t = '\0';
05173 STR_SET_LEN(str, t - RSTRING_PTR(str));
05174 ENC_CODERANGE_SET(str, cr);
05175
05176 if (modify) return str;
05177 return Qnil;
05178 }
05179
05180
05181
05182
05183
05184
05185
05186
05187
05188
05189
05190
05191
05192
05193
05194
05195 static VALUE
05196 rb_str_delete(int argc, VALUE *argv, VALUE str)
05197 {
05198 str = rb_str_dup(str);
05199 rb_str_delete_bang(argc, argv, str);
05200 return str;
05201 }
05202
05203
05204
05205
05206
05207
05208
05209
05210
05211
05212 static VALUE
05213 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05214 {
05215 char squeez[256];
05216 rb_encoding *enc = 0;
05217 VALUE del = 0, nodel = 0;
05218 char *s, *send, *t;
05219 int i, modify = 0;
05220 int ascompat, singlebyte = single_byte_optimizable(str);
05221 unsigned int save;
05222
05223 if (argc == 0) {
05224 enc = STR_ENC_GET(str);
05225 }
05226 else {
05227 for (i=0; i<argc; i++) {
05228 VALUE s = argv[i];
05229
05230 StringValue(s);
05231 enc = rb_enc_check(str, s);
05232 if (singlebyte && !single_byte_optimizable(s))
05233 singlebyte = 0;
05234 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05235 }
05236 }
05237
05238 str_modify_keep_cr(str);
05239 s = t = RSTRING_PTR(str);
05240 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05241 send = RSTRING_END(str);
05242 save = -1;
05243 ascompat = rb_enc_asciicompat(enc);
05244
05245 if (singlebyte) {
05246 while (s < send) {
05247 unsigned int c = *(unsigned char*)s++;
05248 if (c != save || (argc > 0 && !squeez[c])) {
05249 *t++ = save = c;
05250 }
05251 }
05252 } else {
05253 while (s < send) {
05254 unsigned int c;
05255 int clen;
05256
05257 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05258 if (c != save || (argc > 0 && !squeez[c])) {
05259 *t++ = save = c;
05260 }
05261 s++;
05262 }
05263 else {
05264 c = rb_enc_codepoint_len(s, send, &clen, enc);
05265
05266 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05267 if (t != s) rb_enc_mbcput(c, t, enc);
05268 save = c;
05269 t += clen;
05270 }
05271 s += clen;
05272 }
05273 }
05274 }
05275
05276 *t = '\0';
05277 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05278 STR_SET_LEN(str, t - RSTRING_PTR(str));
05279 modify = 1;
05280 }
05281
05282 if (modify) return str;
05283 return Qnil;
05284 }
05285
05286
05287
05288
05289
05290
05291
05292
05293
05294
05295
05296
05297
05298
05299
05300
05301
05302 static VALUE
05303 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05304 {
05305 str = rb_str_dup(str);
05306 rb_str_squeeze_bang(argc, argv, str);
05307 return str;
05308 }
05309
05310
05311
05312
05313
05314
05315
05316
05317
05318
05319 static VALUE
05320 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05321 {
05322 return tr_trans(str, src, repl, 1);
05323 }
05324
05325
05326
05327
05328
05329
05330
05331
05332
05333
05334
05335
05336
05337
05338
05339 static VALUE
05340 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05341 {
05342 str = rb_str_dup(str);
05343 tr_trans(str, src, repl, 1);
05344 return str;
05345 }
05346
05347
05348
05349
05350
05351
05352
05353
05354
05355
05356
05357
05358
05359
05360
05361
05362
05363
05364 static VALUE
05365 rb_str_count(int argc, VALUE *argv, VALUE str)
05366 {
05367 char table[256];
05368 rb_encoding *enc = 0;
05369 VALUE del = 0, nodel = 0;
05370 char *s, *send;
05371 int i;
05372 int ascompat;
05373
05374 if (argc < 1) {
05375 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05376 }
05377 for (i=0; i<argc; i++) {
05378 VALUE tstr = argv[i];
05379 unsigned char c;
05380
05381 StringValue(tstr);
05382 enc = rb_enc_check(str, tstr);
05383 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05384 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05385 int n = 0;
05386
05387 s = RSTRING_PTR(str);
05388 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05389 send = RSTRING_END(str);
05390 while (s < send) {
05391 if (*(unsigned char*)s++ == c) n++;
05392 }
05393 return INT2NUM(n);
05394 }
05395 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05396 }
05397
05398 s = RSTRING_PTR(str);
05399 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05400 send = RSTRING_END(str);
05401 ascompat = rb_enc_asciicompat(enc);
05402 i = 0;
05403 while (s < send) {
05404 unsigned int c;
05405
05406 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05407 if (table[c]) {
05408 i++;
05409 }
05410 s++;
05411 }
05412 else {
05413 int clen;
05414 c = rb_enc_codepoint_len(s, send, &clen, enc);
05415 if (tr_find(c, table, del, nodel)) {
05416 i++;
05417 }
05418 s += clen;
05419 }
05420 }
05421
05422 return INT2NUM(i);
05423 }
05424
/*
 * Lookup table indexed by byte value: 1 for bytes 9 through 13
 * ('\t', '\n', '\v', '\f', '\r') and 32 (' '), 0 for every other byte.
 */
static const char isspacetable[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table-driven whitespace test; c is reduced to its low byte by the cast. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05445
05446
05447
05448
05449
05450
05451
05452
05453
05454
05455
05456
05457
05458
05459
05460
05461
05462
05463
05464
05465
05466
05467
05468
05469
05470
05471
05472
05473
05474
05475
05476
05477
05478
05479
05480
05481
05482
05483
05484
05485
05486
05487
05488 static VALUE
05489 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05490 {
05491 rb_encoding *enc;
05492 VALUE spat;
05493 VALUE limit;
05494 enum {awk, string, regexp} split_type;
05495 long beg, end, i = 0;
05496 int lim = 0;
05497 VALUE result, tmp;
05498
05499 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05500 lim = NUM2INT(limit);
05501 if (lim <= 0) limit = Qnil;
05502 else if (lim == 1) {
05503 if (RSTRING_LEN(str) == 0)
05504 return rb_ary_new2(0);
05505 return rb_ary_new3(1, str);
05506 }
05507 i = 1;
05508 }
05509
05510 enc = STR_ENC_GET(str);
05511 if (NIL_P(spat)) {
05512 if (!NIL_P(rb_fs)) {
05513 spat = rb_fs;
05514 goto fs_set;
05515 }
05516 split_type = awk;
05517 }
05518 else {
05519 fs_set:
05520 if (TYPE(spat) == T_STRING) {
05521 rb_encoding *enc2 = STR_ENC_GET(spat);
05522
05523 split_type = string;
05524 if (RSTRING_LEN(spat) == 0) {
05525
05526 spat = rb_reg_regcomp(spat);
05527 split_type = regexp;
05528 }
05529 else if (rb_enc_asciicompat(enc2) == 1) {
05530 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05531 split_type = awk;
05532 }
05533 }
05534 else {
05535 int l;
05536 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05537 RSTRING_LEN(spat) == l) {
05538 split_type = awk;
05539 }
05540 }
05541 }
05542 else {
05543 spat = get_pat(spat, 1);
05544 split_type = regexp;
05545 }
05546 }
05547
05548 result = rb_ary_new();
05549 beg = 0;
05550 if (split_type == awk) {
05551 char *ptr = RSTRING_PTR(str);
05552 char *eptr = RSTRING_END(str);
05553 char *bptr = ptr;
05554 int skip = 1;
05555 unsigned int c;
05556
05557 end = beg;
05558 if (is_ascii_string(str)) {
05559 while (ptr < eptr) {
05560 c = (unsigned char)*ptr++;
05561 if (skip) {
05562 if (ascii_isspace(c)) {
05563 beg = ptr - bptr;
05564 }
05565 else {
05566 end = ptr - bptr;
05567 skip = 0;
05568 if (!NIL_P(limit) && lim <= i) break;
05569 }
05570 }
05571 else if (ascii_isspace(c)) {
05572 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05573 skip = 1;
05574 beg = ptr - bptr;
05575 if (!NIL_P(limit)) ++i;
05576 }
05577 else {
05578 end = ptr - bptr;
05579 }
05580 }
05581 }
05582 else {
05583 while (ptr < eptr) {
05584 int n;
05585
05586 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05587 ptr += n;
05588 if (skip) {
05589 if (rb_isspace(c)) {
05590 beg = ptr - bptr;
05591 }
05592 else {
05593 end = ptr - bptr;
05594 skip = 0;
05595 if (!NIL_P(limit) && lim <= i) break;
05596 }
05597 }
05598 else if (rb_isspace(c)) {
05599 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05600 skip = 1;
05601 beg = ptr - bptr;
05602 if (!NIL_P(limit)) ++i;
05603 }
05604 else {
05605 end = ptr - bptr;
05606 }
05607 }
05608 }
05609 }
05610 else if (split_type == string) {
05611 char *ptr = RSTRING_PTR(str);
05612 char *temp = ptr;
05613 char *eptr = RSTRING_END(str);
05614 char *sptr = RSTRING_PTR(spat);
05615 long slen = RSTRING_LEN(spat);
05616
05617 if (is_broken_string(str)) {
05618 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05619 }
05620 if (is_broken_string(spat)) {
05621 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05622 }
05623 enc = rb_enc_check(str, spat);
05624 while (ptr < eptr &&
05625 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05626
05627 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05628 if (t != ptr + end) {
05629 ptr = t;
05630 continue;
05631 }
05632 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05633 ptr += end + slen;
05634 if (!NIL_P(limit) && lim <= ++i) break;
05635 }
05636 beg = ptr - temp;
05637 }
05638 else {
05639 char *ptr = RSTRING_PTR(str);
05640 long len = RSTRING_LEN(str);
05641 long start = beg;
05642 long idx;
05643 int last_null = 0;
05644 struct re_registers *regs;
05645
05646 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05647 regs = RMATCH_REGS(rb_backref_get());
05648 if (start == end && BEG(0) == END(0)) {
05649 if (!ptr) {
05650 rb_ary_push(result, str_new_empty(str));
05651 break;
05652 }
05653 else if (last_null == 1) {
05654 rb_ary_push(result, rb_str_subseq(str, beg,
05655 rb_enc_fast_mbclen(ptr+beg,
05656 ptr+len,
05657 enc)));
05658 beg = start;
05659 }
05660 else {
05661 if (ptr+start == ptr+len)
05662 start++;
05663 else
05664 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05665 last_null = 1;
05666 continue;
05667 }
05668 }
05669 else {
05670 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05671 beg = start = END(0);
05672 }
05673 last_null = 0;
05674
05675 for (idx=1; idx < regs->num_regs; idx++) {
05676 if (BEG(idx) == -1) continue;
05677 if (BEG(idx) == END(idx))
05678 tmp = str_new_empty(str);
05679 else
05680 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05681 rb_ary_push(result, tmp);
05682 }
05683 if (!NIL_P(limit) && lim <= ++i) break;
05684 }
05685 }
05686 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05687 if (RSTRING_LEN(str) == beg)
05688 tmp = str_new_empty(str);
05689 else
05690 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05691 rb_ary_push(result, tmp);
05692 }
05693 if (NIL_P(limit) && lim == 0) {
05694 long len;
05695 while ((len = RARRAY_LEN(result)) > 0 &&
05696 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05697 rb_ary_pop(result);
05698 }
05699
05700 return result;
05701 }
05702
05703 VALUE
05704 rb_str_split(VALUE str, const char *sep0)
05705 {
05706 VALUE sep;
05707
05708 StringValue(str);
05709 sep = rb_str_new2(sep0);
05710 return rb_str_split_m(1, &sep, str);
05711 }
05712
05713
05714
05715
05716
05717
05718
05719
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751 static VALUE
05752 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05753 {
05754 rb_encoding *enc;
05755 VALUE rs;
05756 unsigned int newline;
05757 const char *p, *pend, *s, *ptr;
05758 long len, rslen;
05759 VALUE line;
05760 int n;
05761 VALUE orig = str;
05762
05763 if (argc == 0) {
05764 rs = rb_rs;
05765 }
05766 else {
05767 rb_scan_args(argc, argv, "01", &rs);
05768 }
05769 RETURN_ENUMERATOR(str, argc, argv);
05770 if (NIL_P(rs)) {
05771 rb_yield(str);
05772 return orig;
05773 }
05774 str = rb_str_new4(str);
05775 ptr = p = s = RSTRING_PTR(str);
05776 pend = p + RSTRING_LEN(str);
05777 len = RSTRING_LEN(str);
05778 StringValue(rs);
05779 if (rs == rb_default_rs) {
05780 enc = rb_enc_get(str);
05781 while (p < pend) {
05782 char *p0;
05783
05784 p = memchr(p, '\n', pend - p);
05785 if (!p) break;
05786 p0 = rb_enc_left_char_head(s, p, pend, enc);
05787 if (!rb_enc_is_newline(p0, pend, enc)) {
05788 p++;
05789 continue;
05790 }
05791 p = p0 + rb_enc_mbclen(p0, pend, enc);
05792 line = rb_str_new5(str, s, p - s);
05793 OBJ_INFECT(line, str);
05794 rb_enc_cr_str_copy_for_substr(line, str);
05795 rb_yield(line);
05796 str_mod_check(str, ptr, len);
05797 s = p;
05798 }
05799 goto finish;
05800 }
05801
05802 enc = rb_enc_check(str, rs);
05803 rslen = RSTRING_LEN(rs);
05804 if (rslen == 0) {
05805 newline = '\n';
05806 }
05807 else {
05808 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05809 }
05810
05811 while (p < pend) {
05812 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05813
05814 again:
05815 if (rslen == 0 && c == newline) {
05816 p += n;
05817 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05818 goto again;
05819 }
05820 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05821 p += n;
05822 }
05823 p -= n;
05824 }
05825 if (c == newline &&
05826 (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
05827 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05828 OBJ_INFECT(line, str);
05829 rb_enc_cr_str_copy_for_substr(line, str);
05830 rb_yield(line);
05831 str_mod_check(str, ptr, len);
05832 s = p + (rslen ? rslen : n);
05833 }
05834 p += n;
05835 }
05836
05837 finish:
05838 if (s != pend) {
05839 line = rb_str_new5(str, s, pend - s);
05840 OBJ_INFECT(line, str);
05841 rb_enc_cr_str_copy_for_substr(line, str);
05842 rb_yield(line);
05843 }
05844
05845 return orig;
05846 }
05847
05848
05849
05850
05851
05852
05853
05854
05855
05856
05857
05858
05859
05860
05861
05862
05863
05864
05865
05866
05867 static VALUE
05868 rb_str_each_byte(VALUE str)
05869 {
05870 long i;
05871
05872 RETURN_ENUMERATOR(str, 0, 0);
05873 for (i=0; i<RSTRING_LEN(str); i++) {
05874 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05875 }
05876 return str;
05877 }
05878
05879
05880
05881
05882
05883
05884
05885
05886
05887
05888
05889
05890
05891
05892
05893
05894
05895
05896
05897
05898 static VALUE
05899 rb_str_each_char(VALUE str)
05900 {
05901 VALUE orig = str;
05902 long i, len, n;
05903 const char *ptr;
05904 rb_encoding *enc;
05905
05906 RETURN_ENUMERATOR(str, 0, 0);
05907 str = rb_str_new4(str);
05908 ptr = RSTRING_PTR(str);
05909 len = RSTRING_LEN(str);
05910 enc = rb_enc_get(str);
05911 switch (ENC_CODERANGE(str)) {
05912 case ENC_CODERANGE_VALID:
05913 case ENC_CODERANGE_7BIT:
05914 for (i = 0; i < len; i += n) {
05915 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05916 rb_yield(rb_str_subseq(str, i, n));
05917 }
05918 break;
05919 default:
05920 for (i = 0; i < len; i += n) {
05921 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05922 rb_yield(rb_str_subseq(str, i, n));
05923 }
05924 }
05925 return orig;
05926 }
05927
05928
05929
05930
05931
05932
05933
05934
05935
05936
05937
05938
05939
05940
05941
05942
05943
05944
05945
05946
05947
05948
05949 static VALUE
05950 rb_str_each_codepoint(VALUE str)
05951 {
05952 VALUE orig = str;
05953 long len;
05954 int n;
05955 unsigned int c;
05956 const char *ptr, *end;
05957 rb_encoding *enc;
05958
05959 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05960 RETURN_ENUMERATOR(str, 0, 0);
05961 str = rb_str_new4(str);
05962 ptr = RSTRING_PTR(str);
05963 len = RSTRING_LEN(str);
05964 end = RSTRING_END(str);
05965 enc = STR_ENC_GET(str);
05966 while (ptr < end) {
05967 c = rb_enc_codepoint_len(ptr, end, &n, enc);
05968 rb_yield(UINT2NUM(c));
05969 ptr += n;
05970 }
05971 return orig;
05972 }
05973
05974 static long
05975 chopped_length(VALUE str)
05976 {
05977 rb_encoding *enc = STR_ENC_GET(str);
05978 const char *p, *p2, *beg, *end;
05979
05980 beg = RSTRING_PTR(str);
05981 end = beg + RSTRING_LEN(str);
05982 if (beg > end) return 0;
05983 p = rb_enc_prev_char(beg, end, end, enc);
05984 if (!p) return 0;
05985 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05986 p2 = rb_enc_prev_char(beg, p, end, enc);
05987 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05988 }
05989 return p - beg;
05990 }
05991
05992
05993
05994
05995
05996
05997
05998
05999
06000
06001 static VALUE
06002 rb_str_chop_bang(VALUE str)
06003 {
06004 str_modify_keep_cr(str);
06005 if (RSTRING_LEN(str) > 0) {
06006 long len;
06007 len = chopped_length(str);
06008 STR_SET_LEN(str, len);
06009 RSTRING_PTR(str)[len] = '\0';
06010 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06011 ENC_CODERANGE_CLEAR(str);
06012 }
06013 return str;
06014 }
06015 return Qnil;
06016 }
06017
06018
06019
06020
06021
06022
06023
06024
06025
06026
06027
06028
06029
06030
06031
06032
06033
06034
06035
06036 static VALUE
06037 rb_str_chop(VALUE str)
06038 {
06039 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06040 rb_enc_cr_str_copy_for_substr(str2, str);
06041 OBJ_INFECT(str2, str);
06042 return str2;
06043 }
06044
06045
06046
06047
06048
06049
06050
06051
06052
06053
06054 static VALUE
06055 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06056 {
06057 rb_encoding *enc;
06058 VALUE rs;
06059 int newline;
06060 char *p, *pp, *e;
06061 long len, rslen;
06062
06063 str_modify_keep_cr(str);
06064 len = RSTRING_LEN(str);
06065 if (len == 0) return Qnil;
06066 p = RSTRING_PTR(str);
06067 e = p + len;
06068 if (argc == 0) {
06069 rs = rb_rs;
06070 if (rs == rb_default_rs) {
06071 smart_chomp:
06072 enc = rb_enc_get(str);
06073 if (rb_enc_mbminlen(enc) > 1) {
06074 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06075 if (rb_enc_is_newline(pp, e, enc)) {
06076 e = pp;
06077 }
06078 pp = e - rb_enc_mbminlen(enc);
06079 if (pp >= p) {
06080 pp = rb_enc_left_char_head(p, pp, e, enc);
06081 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06082 e = pp;
06083 }
06084 }
06085 if (e == RSTRING_END(str)) {
06086 return Qnil;
06087 }
06088 len = e - RSTRING_PTR(str);
06089 STR_SET_LEN(str, len);
06090 }
06091 else {
06092 if (RSTRING_PTR(str)[len-1] == '\n') {
06093 STR_DEC_LEN(str);
06094 if (RSTRING_LEN(str) > 0 &&
06095 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06096 STR_DEC_LEN(str);
06097 }
06098 }
06099 else if (RSTRING_PTR(str)[len-1] == '\r') {
06100 STR_DEC_LEN(str);
06101 }
06102 else {
06103 return Qnil;
06104 }
06105 }
06106 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06107 return str;
06108 }
06109 }
06110 else {
06111 rb_scan_args(argc, argv, "01", &rs);
06112 }
06113 if (NIL_P(rs)) return Qnil;
06114 StringValue(rs);
06115 rslen = RSTRING_LEN(rs);
06116 if (rslen == 0) {
06117 while (len>0 && p[len-1] == '\n') {
06118 len--;
06119 if (len>0 && p[len-1] == '\r')
06120 len--;
06121 }
06122 if (len < RSTRING_LEN(str)) {
06123 STR_SET_LEN(str, len);
06124 RSTRING_PTR(str)[len] = '\0';
06125 return str;
06126 }
06127 return Qnil;
06128 }
06129 if (rslen > len) return Qnil;
06130 newline = RSTRING_PTR(rs)[rslen-1];
06131 if (rslen == 1 && newline == '\n')
06132 goto smart_chomp;
06133
06134 enc = rb_enc_check(str, rs);
06135 if (is_broken_string(rs)) {
06136 return Qnil;
06137 }
06138 pp = e - rslen;
06139 if (p[len-1] == newline &&
06140 (rslen <= 1 ||
06141 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06142 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06143 return Qnil;
06144 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06145 ENC_CODERANGE_CLEAR(str);
06146 }
06147 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06148 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06149 return str;
06150 }
06151 return Qnil;
06152 }
06153
06154
06155
06156
06157
06158
06159
06160
06161
06162
06163
06164
06165
06166
06167
06168
06169
06170
06171
06172
06173
06174 static VALUE
06175 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06176 {
06177 str = rb_str_dup(str);
06178 rb_str_chomp_bang(argc, argv, str);
06179 return str;
06180 }
06181
06182
06183
06184
06185
06186
06187
06188
06189
06190
06191
06192
06193
06194 static VALUE
06195 rb_str_lstrip_bang(VALUE str)
06196 {
06197 rb_encoding *enc;
06198 char *s, *t, *e;
06199
06200 str_modify_keep_cr(str);
06201 enc = STR_ENC_GET(str);
06202 s = RSTRING_PTR(str);
06203 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06204 e = t = RSTRING_END(str);
06205
06206 while (s < e) {
06207 int n;
06208 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06209
06210 if (!rb_isspace(cc)) break;
06211 s += n;
06212 }
06213
06214 if (s > RSTRING_PTR(str)) {
06215 STR_SET_LEN(str, t-s);
06216 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06217 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06218 return str;
06219 }
06220 return Qnil;
06221 }
06222
06223
06224
06225
06226
06227
06228
06229
06230
06231
06232
06233
06234
06235 static VALUE
06236 rb_str_lstrip(VALUE str)
06237 {
06238 str = rb_str_dup(str);
06239 rb_str_lstrip_bang(str);
06240 return str;
06241 }
06242
06243
06244
06245
06246
06247
06248
06249
06250
06251
06252
06253
06254
06255
06256 static VALUE
06257 rb_str_rstrip_bang(VALUE str)
06258 {
06259 rb_encoding *enc;
06260 char *s, *t, *e;
06261
06262 str_modify_keep_cr(str);
06263 enc = STR_ENC_GET(str);
06264 rb_str_check_dummy_enc(enc);
06265 s = RSTRING_PTR(str);
06266 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06267 t = e = RSTRING_END(str);
06268
06269
06270 if (single_byte_optimizable(str)) {
06271 unsigned char c;
06272 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06273 }
06274 else {
06275 char *tp;
06276
06277 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06278 unsigned int c = rb_enc_codepoint(tp, e, enc);
06279 if (c && !rb_isspace(c)) break;
06280 t = tp;
06281 }
06282 }
06283 if (t < e) {
06284 long len = t-RSTRING_PTR(str);
06285
06286 STR_SET_LEN(str, len);
06287 RSTRING_PTR(str)[len] = '\0';
06288 return str;
06289 }
06290 return Qnil;
06291 }
06292
06293
06294
06295
06296
06297
06298
06299
06300
06301
06302
06303
06304
06305 static VALUE
06306 rb_str_rstrip(VALUE str)
06307 {
06308 str = rb_str_dup(str);
06309 rb_str_rstrip_bang(str);
06310 return str;
06311 }
06312
06313
06314
06315
06316
06317
06318
06319
06320
06321
06322 static VALUE
06323 rb_str_strip_bang(VALUE str)
06324 {
06325 VALUE l = rb_str_lstrip_bang(str);
06326 VALUE r = rb_str_rstrip_bang(str);
06327
06328 if (NIL_P(l) && NIL_P(r)) return Qnil;
06329 return str;
06330 }
06331
06332
06333
06334
06335
06336
06337
06338
06339
06340
06341
06342
06343 static VALUE
06344 rb_str_strip(VALUE str)
06345 {
06346 str = rb_str_dup(str);
06347 rb_str_strip_bang(str);
06348 return str;
06349 }
06350
06351 static VALUE
06352 scan_once(VALUE str, VALUE pat, long *start)
06353 {
06354 VALUE result, match;
06355 struct re_registers *regs;
06356 int i;
06357
06358 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06359 match = rb_backref_get();
06360 regs = RMATCH_REGS(match);
06361 if (BEG(0) == END(0)) {
06362 rb_encoding *enc = STR_ENC_GET(str);
06363
06364
06365
06366 if (RSTRING_LEN(str) > END(0))
06367 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06368 RSTRING_END(str), enc);
06369 else
06370 *start = END(0)+1;
06371 }
06372 else {
06373 *start = END(0);
06374 }
06375 if (regs->num_regs == 1) {
06376 return rb_reg_nth_match(0, match);
06377 }
06378 result = rb_ary_new2(regs->num_regs);
06379 for (i=1; i < regs->num_regs; i++) {
06380 rb_ary_push(result, rb_reg_nth_match(i, match));
06381 }
06382
06383 return result;
06384 }
06385 return Qnil;
06386 }
06387
06388
06389
06390
06391
06392
06393
06394
06395
06396
06397
06398
06399
06400
06401
06402
06403
06404
06405
06406
06407
06408
06409
06410
06411
06412
06413
06414
06415
06416
06417
06418
06419
06420 static VALUE
06421 rb_str_scan(VALUE str, VALUE pat)
06422 {
06423 VALUE result;
06424 long start = 0;
06425 long last = -1, prev = 0;
06426 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06427
06428 pat = get_pat(pat, 1);
06429 if (!rb_block_given_p()) {
06430 VALUE ary = rb_ary_new();
06431
06432 while (!NIL_P(result = scan_once(str, pat, &start))) {
06433 last = prev;
06434 prev = start;
06435 rb_ary_push(ary, result);
06436 }
06437 if (last >= 0) rb_reg_search(pat, str, last, 0);
06438 return ary;
06439 }
06440
06441 while (!NIL_P(result = scan_once(str, pat, &start))) {
06442 last = prev;
06443 prev = start;
06444 rb_yield(result);
06445 str_mod_check(str, p, len);
06446 }
06447 if (last >= 0) rb_reg_search(pat, str, last, 0);
06448 return str;
06449 }
06450
06451
06452
06453
06454
06455
06456
06457
06458
06459
06460
06461
06462
06463
06464
06465
06466 static VALUE
06467 rb_str_hex(VALUE str)
06468 {
06469 rb_encoding *enc = rb_enc_get(str);
06470
06471 if (!rb_enc_asciicompat(enc)) {
06472 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06473 }
06474 return rb_str_to_inum(str, 16, FALSE);
06475 }
06476
06477
06478
06479
06480
06481
06482
06483
06484
06485
06486
06487
06488
06489
06490
06491
06492 static VALUE
06493 rb_str_oct(VALUE str)
06494 {
06495 rb_encoding *enc = rb_enc_get(str);
06496
06497 if (!rb_enc_asciicompat(enc)) {
06498 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06499 }
06500 return rb_str_to_inum(str, -8, FALSE);
06501 }
06502
06503
06504
06505
06506
06507
06508
06509
06510
06511
06512
06513
06514 static VALUE
06515 rb_str_crypt(VALUE str, VALUE salt)
06516 {
06517 extern char *crypt(const char *, const char *);
06518 VALUE result;
06519 const char *s, *saltp;
06520 #ifdef BROKEN_CRYPT
06521 char salt_8bit_clean[3];
06522 #endif
06523
06524 StringValue(salt);
06525 if (RSTRING_LEN(salt) < 2)
06526 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06527
06528 s = RSTRING_PTR(str);
06529 if (!s) s = "";
06530 saltp = RSTRING_PTR(salt);
06531 #ifdef BROKEN_CRYPT
06532 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06533 salt_8bit_clean[0] = saltp[0] & 0x7f;
06534 salt_8bit_clean[1] = saltp[1] & 0x7f;
06535 salt_8bit_clean[2] = '\0';
06536 saltp = salt_8bit_clean;
06537 }
06538 #endif
06539 result = rb_str_new2(crypt(s, saltp));
06540 OBJ_INFECT(result, str);
06541 OBJ_INFECT(result, salt);
06542 return result;
06543 }
06544
06545
06546
06547
06548
06549
06550
06551
06552
06553
06554
06555
06556
06557
06558
06559
06560
06561
06562
06563
06564
06565
06566 VALUE
06567 rb_str_intern(VALUE s)
06568 {
06569 VALUE str = RB_GC_GUARD(s);
06570 ID id;
06571
06572 id = rb_intern_str(str);
06573 return ID2SYM(id);
06574 }
06575
06576
06577
06578
06579
06580
06581
06582
06583
06584
06585
06586 VALUE
06587 rb_str_ord(VALUE s)
06588 {
06589 unsigned int c;
06590
06591 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06592 return UINT2NUM(c);
06593 }
06594
06595
06596
06597
06598
06599
06600
06601
06602
06603
06604
06605 static VALUE
06606 rb_str_sum(int argc, VALUE *argv, VALUE str)
06607 {
06608 VALUE vbits;
06609 int bits;
06610 char *ptr, *p, *pend;
06611 long len;
06612 VALUE sum = INT2FIX(0);
06613 unsigned long sum0 = 0;
06614
06615 if (argc == 0) {
06616 bits = 16;
06617 }
06618 else {
06619 rb_scan_args(argc, argv, "01", &vbits);
06620 bits = NUM2INT(vbits);
06621 }
06622 ptr = p = RSTRING_PTR(str);
06623 len = RSTRING_LEN(str);
06624 pend = p + len;
06625
06626 while (p < pend) {
06627 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06628 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06629 str_mod_check(str, ptr, len);
06630 sum0 = 0;
06631 }
06632 sum0 += (unsigned char)*p;
06633 p++;
06634 }
06635
06636 if (bits == 0) {
06637 if (sum0) {
06638 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06639 }
06640 }
06641 else {
06642 if (sum == INT2FIX(0)) {
06643 if (bits < (int)sizeof(long)*CHAR_BIT) {
06644 sum0 &= (((unsigned long)1)<<bits)-1;
06645 }
06646 sum = LONG2FIX(sum0);
06647 }
06648 else {
06649 VALUE mod;
06650
06651 if (sum0) {
06652 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06653 }
06654
06655 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06656 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06657 sum = rb_funcall(sum, '&', 1, mod);
06658 }
06659 }
06660 return sum;
06661 }
06662
06663 static VALUE
06664 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06665 {
06666 rb_encoding *enc;
06667 VALUE w;
06668 long width, len, flen = 1, fclen = 1;
06669 VALUE res;
06670 char *p;
06671 const char *f = " ";
06672 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06673 volatile VALUE pad;
06674 int singlebyte = 1, cr;
06675
06676 rb_scan_args(argc, argv, "11", &w, &pad);
06677 enc = STR_ENC_GET(str);
06678 width = NUM2LONG(w);
06679 if (argc == 2) {
06680 StringValue(pad);
06681 enc = rb_enc_check(str, pad);
06682 f = RSTRING_PTR(pad);
06683 flen = RSTRING_LEN(pad);
06684 fclen = str_strlen(pad, enc);
06685 singlebyte = single_byte_optimizable(pad);
06686 if (flen == 0 || fclen == 0) {
06687 rb_raise(rb_eArgError, "zero width padding");
06688 }
06689 }
06690 len = str_strlen(str, enc);
06691 if (width < 0 || len >= width) return rb_str_dup(str);
06692 n = width - len;
06693 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06694 rlen = n - llen;
06695 cr = ENC_CODERANGE(str);
06696 if (flen > 1) {
06697 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06698 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06699 }
06700 size = RSTRING_LEN(str);
06701 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06702 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06703 (len += llen2 + rlen2) >= LONG_MAX - size) {
06704 rb_raise(rb_eArgError, "argument too big");
06705 }
06706 len += size;
06707 res = rb_str_new5(str, 0, len);
06708 p = RSTRING_PTR(res);
06709 if (flen <= 1) {
06710 memset(p, *f, llen);
06711 p += llen;
06712 }
06713 else {
06714 while (llen >= fclen) {
06715 memcpy(p,f,flen);
06716 p += flen;
06717 llen -= fclen;
06718 }
06719 if (llen > 0) {
06720 memcpy(p, f, llen2);
06721 p += llen2;
06722 }
06723 }
06724 memcpy(p, RSTRING_PTR(str), size);
06725 p += size;
06726 if (flen <= 1) {
06727 memset(p, *f, rlen);
06728 p += rlen;
06729 }
06730 else {
06731 while (rlen >= fclen) {
06732 memcpy(p,f,flen);
06733 p += flen;
06734 rlen -= fclen;
06735 }
06736 if (rlen > 0) {
06737 memcpy(p, f, rlen2);
06738 p += rlen2;
06739 }
06740 }
06741 *p = '\0';
06742 STR_SET_LEN(res, p-RSTRING_PTR(res));
06743 OBJ_INFECT(res, str);
06744 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06745 rb_enc_associate(res, enc);
06746 if (argc == 2)
06747 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06748 if (cr != ENC_CODERANGE_BROKEN)
06749 ENC_CODERANGE_SET(res, cr);
06750 return res;
06751 }
06752
06753
06754
06755
06756
06757
06758
06759
06760
06761
06762
06763
06764
06765
06766
06767 static VALUE
06768 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06769 {
06770 return rb_str_justify(argc, argv, str, 'l');
06771 }
06772
06773
06774
06775
06776
06777
06778
06779
06780
06781
06782
06783
06784
06785
06786
06787 static VALUE
06788 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06789 {
06790 return rb_str_justify(argc, argv, str, 'r');
06791 }
06792
06793
06794
06795
06796
06797
06798
06799
06800
06801
06802
06803
06804
06805
06806
06807 static VALUE
06808 rb_str_center(int argc, VALUE *argv, VALUE str)
06809 {
06810 return rb_str_justify(argc, argv, str, 'c');
06811 }
06812
06813
06814
06815
06816
06817
06818
06819
06820
06821
06822
06823
06824
06825
06826
06827
06828 static VALUE
06829 rb_str_partition(VALUE str, VALUE sep)
06830 {
06831 long pos;
06832 int regex = FALSE;
06833
06834 if (TYPE(sep) == T_REGEXP) {
06835 pos = rb_reg_search(sep, str, 0, 0);
06836 regex = TRUE;
06837 }
06838 else {
06839 VALUE tmp;
06840
06841 tmp = rb_check_string_type(sep);
06842 if (NIL_P(tmp)) {
06843 rb_raise(rb_eTypeError, "type mismatch: %s given",
06844 rb_obj_classname(sep));
06845 }
06846 sep = tmp;
06847 pos = rb_str_index(str, sep, 0);
06848 }
06849 if (pos < 0) {
06850 failed:
06851 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06852 }
06853 if (regex) {
06854 sep = rb_str_subpat(str, sep, INT2FIX(0));
06855 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06856 }
06857 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06858 sep,
06859 rb_str_subseq(str, pos+RSTRING_LEN(sep),
06860 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06861 }
06862
06863
06864
06865
06866
06867
06868
06869
06870
06871
06872
06873
06874
06875
06876
06877
06878 static VALUE
06879 rb_str_rpartition(VALUE str, VALUE sep)
06880 {
06881 long pos = RSTRING_LEN(str);
06882 int regex = FALSE;
06883
06884 if (TYPE(sep) == T_REGEXP) {
06885 pos = rb_reg_search(sep, str, pos, 1);
06886 regex = TRUE;
06887 }
06888 else {
06889 VALUE tmp;
06890
06891 tmp = rb_check_string_type(sep);
06892 if (NIL_P(tmp)) {
06893 rb_raise(rb_eTypeError, "type mismatch: %s given",
06894 rb_obj_classname(sep));
06895 }
06896 sep = tmp;
06897 pos = rb_str_sublen(str, pos);
06898 pos = rb_str_rindex(str, sep, pos);
06899 }
06900 if (pos < 0) {
06901 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06902 }
06903 if (regex) {
06904 sep = rb_reg_nth_match(0, rb_backref_get());
06905 }
06906 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06907 sep,
06908 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06909 }
06910
06911
06912
06913
06914
06915
06916
06917
06918
06919
06920
06921
06922
06923
06924
06925
06926
06927 static VALUE
06928 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06929 {
06930 int i;
06931
06932 for (i=0; i<argc; i++) {
06933 VALUE tmp = rb_check_string_type(argv[i]);
06934 if (NIL_P(tmp)) continue;
06935 rb_enc_check(str, tmp);
06936 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06937 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06938 return Qtrue;
06939 }
06940 return Qfalse;
06941 }
06942
06943
06944
06945
06946
06947
06948
06949
06950 static VALUE
06951 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06952 {
06953 int i;
06954 char *p, *s, *e;
06955 rb_encoding *enc;
06956
06957 for (i=0; i<argc; i++) {
06958 VALUE tmp = rb_check_string_type(argv[i]);
06959 if (NIL_P(tmp)) continue;
06960 enc = rb_enc_check(str, tmp);
06961 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06962 p = RSTRING_PTR(str);
06963 e = p + RSTRING_LEN(str);
06964 s = e - RSTRING_LEN(tmp);
06965 if (rb_enc_left_char_head(p, s, e, enc) != s)
06966 continue;
06967 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06968 return Qtrue;
06969 }
06970 return Qfalse;
06971 }
06972
06973 void
06974 rb_str_setter(VALUE val, ID id, VALUE *var)
06975 {
06976 if (!NIL_P(val) && TYPE(val) != T_STRING) {
06977 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06978 }
06979 *var = val;
06980 }
06981
06982
06983
06984
06985
06986
06987
06988
06989
06990 static VALUE
06991 rb_str_force_encoding(VALUE str, VALUE enc)
06992 {
06993 str_modifiable(str);
06994 rb_enc_associate(str, rb_to_encoding(enc));
06995 ENC_CODERANGE_CLEAR(str);
06996 return str;
06997 }
06998
06999
07000
07001
07002
07003
07004
07005
07006
07007
07008
07009
07010 static VALUE
07011 rb_str_valid_encoding_p(VALUE str)
07012 {
07013 int cr = rb_enc_str_coderange(str);
07014
07015 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07016 }
07017
07018
07019
07020
07021
07022
07023
07024
07025
07026
07027
07028 static VALUE
07029 rb_str_is_ascii_only_p(VALUE str)
07030 {
07031 int cr = rb_enc_str_coderange(str);
07032
07033 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07034 }
07035
07036
07037
07038
07039
07040
07041
07042
07043
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074
07075
07076
07077
07078 static VALUE
07079 sym_equal(VALUE sym1, VALUE sym2)
07080 {
07081 if (sym1 == sym2) return Qtrue;
07082 return Qfalse;
07083 }
07084
07085
07086 static int
07087 sym_printable(const char *s, const char *send, rb_encoding *enc)
07088 {
07089 while (s < send) {
07090 int n;
07091 int c = rb_enc_codepoint_len(s, send, &n, enc);
07092
07093 if (!rb_enc_isprint(c, enc)) return FALSE;
07094 s += n;
07095 }
07096 return TRUE;
07097 }
07098
07099
07100
07101
07102
07103
07104
07105
07106
07107
07108 static VALUE
07109 sym_inspect(VALUE sym)
07110 {
07111 VALUE str;
07112 ID id = SYM2ID(sym);
07113 rb_encoding *enc;
07114 const char *ptr;
07115 long len;
07116 char *dest;
07117 rb_encoding *resenc = rb_default_internal_encoding();
07118
07119 if (resenc == NULL) resenc = rb_default_external_encoding();
07120 sym = rb_id2str(id);
07121 enc = STR_ENC_GET(sym);
07122 ptr = RSTRING_PTR(sym);
07123 len = RSTRING_LEN(sym);
07124 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07125 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07126 str = rb_str_inspect(sym);
07127 len = RSTRING_LEN(str);
07128 rb_str_resize(str, len + 1);
07129 dest = RSTRING_PTR(str);
07130 memmove(dest + 1, dest, len);
07131 dest[0] = ':';
07132 }
07133 else {
07134 char *dest;
07135 str = rb_enc_str_new(0, len + 1, enc);
07136 dest = RSTRING_PTR(str);
07137 dest[0] = ':';
07138 memcpy(dest + 1, ptr, len);
07139 }
07140 return str;
07141 }
07142
07143
07144
07145
07146
07147
07148
07149
07150
07151
07152
07153
07154
07155 VALUE
07156 rb_sym_to_s(VALUE sym)
07157 {
07158 ID id = SYM2ID(sym);
07159
07160 return str_new3(rb_cString, rb_id2str(id));
07161 }
07162
07163
07164
07165
07166
07167
07168
07169
07170
07171
07172
07173
07174 static VALUE
07175 sym_to_sym(VALUE sym)
07176 {
07177 return sym;
07178 }
07179
07180 VALUE rb_funcall_passing_block(VALUE recv, ID mid, int argc, const VALUE *argv);
07181
07182 static VALUE
07183 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07184 {
07185 VALUE obj;
07186
07187 if (argc < 1) {
07188 rb_raise(rb_eArgError, "no receiver given");
07189 }
07190 obj = argv[0];
07191 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07192 }
07193
07194
07195
07196
07197
07198
07199
07200
07201
07202
07203 static VALUE
07204 sym_to_proc(VALUE sym)
07205 {
07206 static VALUE sym_proc_cache = Qfalse;
07207 enum {SYM_PROC_CACHE_SIZE = 67};
07208 VALUE proc;
07209 long id, index;
07210 VALUE *aryp;
07211
07212 if (!sym_proc_cache) {
07213 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07214 rb_gc_register_mark_object(sym_proc_cache);
07215 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07216 }
07217
07218 id = SYM2ID(sym);
07219 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07220
07221 aryp = RARRAY_PTR(sym_proc_cache);
07222 if (aryp[index] == sym) {
07223 return aryp[index + 1];
07224 }
07225 else {
07226 proc = rb_proc_new(sym_call, (VALUE)id);
07227 aryp[index] = sym;
07228 aryp[index + 1] = proc;
07229 return proc;
07230 }
07231 }
07232
07233
07234
07235
07236
07237
07238
07239
07240
07241 static VALUE
07242 sym_succ(VALUE sym)
07243 {
07244 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07245 }
07246
07247
07248
07249
07250
07251
07252
07253
07254
07255 static VALUE
07256 sym_cmp(VALUE sym, VALUE other)
07257 {
07258 if (!SYMBOL_P(other)) {
07259 return Qnil;
07260 }
07261 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07262 }
07263
07264
07265
07266
07267
07268
07269
07270
07271
07272 static VALUE
07273 sym_casecmp(VALUE sym, VALUE other)
07274 {
07275 if (!SYMBOL_P(other)) {
07276 return Qnil;
07277 }
07278 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07279 }
07280
07281
07282
07283
07284
07285
07286
07287
07288 static VALUE
07289 sym_match(VALUE sym, VALUE other)
07290 {
07291 return rb_str_match(rb_sym_to_s(sym), other);
07292 }
07293
07294
07295
07296
07297
07298
07299
07300
07301
07302 static VALUE
07303 sym_aref(int argc, VALUE *argv, VALUE sym)
07304 {
07305 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07306 }
07307
07308
07309
07310
07311
07312
07313
07314
07315 static VALUE
07316 sym_length(VALUE sym)
07317 {
07318 return rb_str_length(rb_id2str(SYM2ID(sym)));
07319 }
07320
07321
07322
07323
07324
07325
07326
07327
07328 static VALUE
07329 sym_empty(VALUE sym)
07330 {
07331 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07332 }
07333
07334
07335
07336
07337
07338
07339
07340
07341 static VALUE
07342 sym_upcase(VALUE sym)
07343 {
07344 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07345 }
07346
07347
07348
07349
07350
07351
07352
07353
07354 static VALUE
07355 sym_downcase(VALUE sym)
07356 {
07357 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07358 }
07359
07360
07361
07362
07363
07364
07365
07366
07367 static VALUE
07368 sym_capitalize(VALUE sym)
07369 {
07370 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07371 }
07372
07373
07374
07375
07376
07377
07378
07379
07380 static VALUE
07381 sym_swapcase(VALUE sym)
07382 {
07383 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07384 }
07385
07386
07387
07388
07389
07390
07391
07392
07393 static VALUE
07394 sym_encoding(VALUE sym)
07395 {
07396 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07397 }
07398
07399 ID
07400 rb_to_id(VALUE name)
07401 {
07402 VALUE tmp;
07403 ID id;
07404
07405 switch (TYPE(name)) {
07406 default:
07407 tmp = rb_check_string_type(name);
07408 if (NIL_P(tmp)) {
07409 tmp = rb_inspect(name);
07410 rb_raise(rb_eTypeError, "%s is not a symbol",
07411 RSTRING_PTR(tmp));
07412 }
07413 name = tmp;
07414
07415 case T_STRING:
07416 name = rb_str_intern(name);
07417
07418 case T_SYMBOL:
07419 return SYM2ID(name);
07420 }
07421 return id;
07422 }
07423
07424
07425
07426
07427
07428
07429
07430
07431
07432
07433
07434
07435
07436
07437 void
07438 Init_String(void)
07439 {
07440 #undef rb_intern
07441 #define rb_intern(str) rb_intern_const(str)
07442
07443 rb_cString = rb_define_class("String", rb_cObject);
07444 rb_include_module(rb_cString, rb_mComparable);
07445 rb_define_alloc_func(rb_cString, str_alloc);
07446 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07447 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07448 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07449 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07450 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07451 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07452 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07453 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07454 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07455 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07456 rb_define_method(rb_cString, "*", rb_str_times, 1);
07457 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07458 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07459 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07460 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07461 rb_define_method(rb_cString, "length", rb_str_length, 0);
07462 rb_define_method(rb_cString, "size", rb_str_length, 0);
07463 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07464 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07465 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07466 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07467 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07468 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07469 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07470 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07471 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07472 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07473 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07474 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07475 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07476 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07477 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07478 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07479
07480 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07481 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07482 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07483 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07484 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07485 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07486
07487 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07488 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07489 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07490 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07491
07492 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07493 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07494 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07495 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07496
07497 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07498 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07499 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07500 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07501 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07502 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07503 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07504 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07505 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07506 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07507 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07508 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07509 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07510 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07511 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07512
07513 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07514 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07515 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07516
07517 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07518
07519 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07520 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07521 rb_define_method(rb_cString, "center", rb_str_center, -1);
07522
07523 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07524 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07525 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07526 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07527 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07528 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07529 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07530
07531 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07532 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07533 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07534 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07535 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07536 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07537 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07538
07539 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07540 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07541 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07542 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07543 rb_define_method(rb_cString, "count", rb_str_count, -1);
07544
07545 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07546 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07547 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07548 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07549
07550 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07551 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07552 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07553 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07554
07555 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07556
07557 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07558 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07559
07560 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07561 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07562
07563 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07564 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07565 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07566 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07567
07568 id_to_s = rb_intern("to_s");
07569
07570 rb_fs = Qnil;
07571 rb_define_variable("$;", &rb_fs);
07572 rb_define_variable("$-F", &rb_fs);
07573
07574 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07575 rb_include_module(rb_cSymbol, rb_mComparable);
07576 rb_undef_alloc_func(rb_cSymbol);
07577 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07578 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07579
07580 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07581 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07582 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07583 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07584 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07585 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07586 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07587 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07588 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07589 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07590
07591 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07592 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07593 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07594
07595 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07596 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07597 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07598 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07599 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07600 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07601
07602 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07603 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07604 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07605 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07606
07607 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07608 }
07609