Class: RubyLex
In: parsers/parse_rb.rb
Parent: Object

Lexical analyzer for Ruby source.
# File parsers/parse_rb.rb, line 447
447: def initialize(content, options)
448: lex_init
449:
450: @options = options
451:
452: @reader = BufferedReader.new content, @options
453:
454: @exp_line_no = @line_no = 1
455: @base_char_no = 0
456: @indent = 0
457:
458: @ltype = nil
459: @quoted = nil
460: @lex_state = EXPR_BEG
461: @space_seen = false
462:
463: @continue = false
464: @line = ""
465:
466: @skip_space = false
467: @read_auto_clean_up = false
468: @exception_on_syntax_error = true
469: end
# File parsers/parse_rb.rb, line 497
497: def gets
498: c = getc or return
499: l = ""
500: begin
501: l.concat c unless c == "\r"
502: break if c == "\n"
503: end while c = getc
504: l
505: end
# File parsers/parse_rb.rb, line 1273
1273: def identify_comment
1274: @ltype = "#"
1275: comment = "#"
1276: while ch = getc
1277: if ch == "\\"
1278: ch = getc
1279: if ch == "\n"
1280: ch = " "
1281: else
1282: comment << "\\"
1283: end
1284: else
1285: if ch == "\n"
1286: @ltype = nil
1287: ungetc
1288: break
1289: end
1290: end
1291: comment << ch
1292: end
1293: return Token(TkCOMMENT).set_text(comment)
1294: end
# File parsers/parse_rb.rb, line 968
968: def identify_gvar
969: @lex_state = EXPR_END
970: str = "$"
971:
972: tk = case ch = getc
973: when /[~_*$?!@\/\\;,=:<>".]/ #"
974: str << ch
975: Token(TkGVAR, str)
976:
977: when "-"
978: str << "-" << getc
979: Token(TkGVAR, str)
980:
981: when "&", "`", "'", "+"
982: str << ch
983: Token(TkBACK_REF, str)
984:
985: when /[1-9]/
986: str << ch
987: while (ch = getc) =~ /[0-9]/
988: str << ch
989: end
990: ungetc
991: Token(TkNTH_REF)
992: when /\w/
993: ungetc
994: ungetc
995: return identify_identifier
996: else
997: ungetc
998: Token("$")
999: end
1000: tk.set_text(str)
1001: end
# File parsers/parse_rb.rb, line 1078
1078: def identify_here_document
1079: ch = getc
1080: if ch == "-"
1081: ch = getc
1082: indent = true
1083: end
1084: if /['"`]/ =~ ch # '
1085: lt = ch
1086: quoted = ""
1087: while (c = getc) && c != lt
1088: quoted.concat c
1089: end
1090: else
1091: lt = '"'
1092: quoted = ch.dup
1093: while (c = getc) && c =~ /\w/
1094: quoted.concat c
1095: end
1096: ungetc
1097: end
1098:
1099: ltback, @ltype = @ltype, lt
1100: reserve = ""
1101:
1102: while ch = getc
1103: reserve << ch
1104: if ch == "\\" #"
1105: ch = getc
1106: reserve << ch
1107: elsif ch == "\n"
1108: break
1109: end
1110: end
1111:
1112: str = ""
1113: while (l = gets)
1114: l.chomp!
1115: l.strip! if indent
1116: break if l == quoted
1117: str << l.chomp << "\n"
1118: end
1119:
1120: @reader.divert_read_from(reserve)
1121:
1122: @ltype = ltback
1123: @lex_state = EXPR_END
1124: Token(Ltype2Token[lt], str).set_text(str.dump)
1125: end
# File parsers/parse_rb.rb, line 1003
1003: def identify_identifier
1004: token = ""
1005: token.concat getc if peek(0) =~ /[$@]/
1006: token.concat getc if peek(0) == "@"
1007:
1008: while (ch = getc) =~ /\w|_/
1009: print ":", ch, ":" if RubyLex.debug?
1010: token.concat ch
1011: end
1012: ungetc
1013:
1014: if ch == "!" or ch == "?"
1015: token.concat getc
1016: end
1017: # fix token
1018:
1019: # $stderr.puts "identifier - #{token}, state = #@lex_state"
1020:
1021: case token
1022: when /^\$/
1023: return Token(TkGVAR, token).set_text(token)
1024: when /^\@/
1025: @lex_state = EXPR_END
1026: return Token(TkIVAR, token).set_text(token)
1027: end
1028:
1029: if @lex_state != EXPR_DOT
1030: print token, "\n" if RubyLex.debug?
1031:
1032: token_c, *trans = TkReading2Token[token]
1033: if token_c
1034: # reserved word?
1035:
1036: if (@lex_state != EXPR_BEG &&
1037: @lex_state != EXPR_FNAME &&
1038: trans[1])
1039: # modifiers
1040: token_c = TkSymbol2Token[trans[1]]
1041: @lex_state = trans[0]
1042: else
1043: if @lex_state != EXPR_FNAME
1044: if ENINDENT_CLAUSE.include?(token)
1045: @indent += 1
1046: elsif DEINDENT_CLAUSE.include?(token)
1047: @indent -= 1
1048: end
1049: @lex_state = trans[0]
1050: else
1051: @lex_state = EXPR_END
1052: end
1053: end
1054: return Token(token_c, token).set_text(token)
1055: end
1056: end
1057:
1058: if @lex_state == EXPR_FNAME
1059: @lex_state = EXPR_END
1060: if peek(0) == '='
1061: token.concat getc
1062: end
1063: elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
1064: @lex_state = EXPR_ARG
1065: else
1066: @lex_state = EXPR_END
1067: end
1068:
1069: if token[0, 1] =~ /[A-Z]/
1070: return Token(TkCONSTANT, token).set_text(token)
1071: elsif token[token.size - 1, 1] =~ /[!?]/
1072: return Token(TkFID, token).set_text(token)
1073: else
1074: return Token(TkIDENTIFIER, token).set_text(token)
1075: end
1076: end
# File parsers/parse_rb.rb, line 1146
1146: def identify_number(start)
1147: str = start.dup
1148:
1149: if start == "+" or start == "-" or start == ""
1150: start = getc
1151: str << start
1152: end
1153:
1154: @lex_state = EXPR_END
1155:
1156: if start == "0"
1157: if peek(0) == "x"
1158: ch = getc
1159: str << ch
1160: match = /[0-9a-f_]/
1161: else
1162: match = /[0-7_]/
1163: end
1164: while ch = getc
1165: if ch !~ match
1166: ungetc
1167: break
1168: else
1169: str << ch
1170: end
1171: end
1172: return Token(TkINTEGER).set_text(str)
1173: end
1174:
1175: type = TkINTEGER
1176: allow_point = TRUE
1177: allow_e = TRUE
1178: while ch = getc
1179: case ch
1180: when /[0-9_]/
1181: str << ch
1182:
1183: when allow_point && "."
1184: type = TkFLOAT
1185: if peek(0) !~ /[0-9]/
1186: ungetc
1187: break
1188: end
1189: str << ch
1190: allow_point = false
1191:
1192: when allow_e && "e", allow_e && "E"
1193: str << ch
1194: type = TkFLOAT
1195: if peek(0) =~ /[+-]/
1196: str << getc
1197: end
1198: allow_e = false
1199: allow_point = false
1200: else
1201: ungetc
1202: break
1203: end
1204: end
1205: Token(type).set_text(str)
1206: end
# File parsers/parse_rb.rb, line 1127
1127: def identify_quotation(initial_char)
1128: ch = getc
1129: if lt = PERCENT_LTYPE[ch]
1130: initial_char += ch
1131: ch = getc
1132: elsif ch =~ /\W/
1133: lt = "\""
1134: else
1135: fail SyntaxError, "unknown type of %string ('#{ch}')"
1136: end
1137: # if ch !~ /\W/
1138: # ungetc
1139: # next
1140: # end
1141: #@ltype = lt
1142: @quoted = ch unless @quoted = PERCENT_PAREN[ch]
1143: identify_string(lt, @quoted, ch, initial_char)
1144: end
# File parsers/parse_rb.rb, line 1208
1208: def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
1209: @ltype = ltype
1210: @quoted = quoted
1211: subtype = nil
1212:
1213: str = ""
1214: str << initial_char if initial_char
1215: str << (opener||quoted)
1216:
1217: nest = 0
1218: begin
1219: while ch = getc
1220: str << ch
1221: if @quoted == ch
1222: if nest == 0
1223: break
1224: else
1225: nest -= 1
1226: end
1227: elsif opener == ch
1228: nest += 1
1229: elsif @ltype != "'" && @ltype != "]" and ch == "#"
1230: ch = getc
1231: if ch == "{"
1232: subtype = true
1233: str << ch << skip_inner_expression
1234: else
1235: ungetc(ch)
1236: end
1237: elsif ch == '\\' #'
1238: str << read_escape
1239: end
1240: end
1241: if @ltype == "/"
1242: if peek(0) =~ /i|o|n|e|s/
1243: str << getc
1244: end
1245: end
1246: if subtype
1247: Token(DLtype2Token[ltype], str)
1248: else
1249: Token(Ltype2Token[ltype], str)
1250: end.set_text(str)
1251: ensure
1252: @ltype = nil
1253: @quoted = nil
1254: @lex_state = EXPR_END
1255: end
1256: end
# File parsers/parse_rb.rb, line 520
520: def lex
521: until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
522: !@continue or
523: tk.nil?)
524: end
525: line = get_read
526:
527: if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
528: nil
529: else
530: line
531: end
532: end
# File parsers/parse_rb.rb, line 589
589: def lex_init()
590: if RUBY_VERSION.to_f < 1.9
591: @OP = SLex.new
592: else
593: @OP = IRB::SLex.new
594: end
595: @OP.def_rules("\0", "\004", "\032") do |chars, io|
596: Token(TkEND_OF_SCRIPT).set_text(chars)
597: end
598:
599: @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
600: @space_seen = TRUE
601: while (ch = getc) =~ /[ \t\f\r\13]/
602: chars << ch
603: end
604: ungetc
605: Token(TkSPACE).set_text(chars)
606: end
607:
608: @OP.def_rule("#") do
609: |op, io|
610: identify_comment
611: end
612:
613: @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
614: |op, io|
615: str = op
616: @ltype = "="
617:
618:
619: begin
620: line = ""
621: begin
622: ch = getc
623: line << ch
624: end until ch == "\n"
625: str << line
626: end until line =~ /^=end/
627:
628: ungetc
629:
630: @ltype = nil
631:
632: if str =~ /\A=begin\s+rdoc/i
633: str.sub!(/\A=begin.*\n/, '')
634: str.sub!(/^=end.*/m, '')
635: Token(TkCOMMENT).set_text(str)
636: else
637: Token(TkRD_COMMENT)#.set_text(str)
638: end
639: end
640:
641: @OP.def_rule("\n") do
642: print "\\n\n" if RubyLex.debug?
643: case @lex_state
644: when EXPR_BEG, EXPR_FNAME, EXPR_DOT
645: @continue = TRUE
646: else
647: @continue = FALSE
648: @lex_state = EXPR_BEG
649: end
650: Token(TkNL).set_text("\n")
651: end
652:
653: @OP.def_rules("*", "**",
654: "!", "!=", "!~",
655: "=", "==", "===",
656: "=~", "<=>",
657: "<", "<=",
658: ">", ">=", ">>") do
659: |op, io|
660: @lex_state = EXPR_BEG
661: Token(op).set_text(op)
662: end
663:
664: @OP.def_rules("<<") do
665: |op, io|
666: tk = nil
667: if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
668: (@lex_state != EXPR_ARG || @space_seen)
669: c = peek(0)
670: if /[-\w_\"\'\`]/ =~ c
671: tk = identify_here_document
672: end
673: end
674: if !tk
675: @lex_state = EXPR_BEG
676: tk = Token(op).set_text(op)
677: end
678: tk
679: end
680:
681: @OP.def_rules("'", '"') do
682: |op, io|
683: identify_string(op)
684: end
685:
686: @OP.def_rules("`") do
687: |op, io|
688: if @lex_state == EXPR_FNAME
689: Token(op).set_text(op)
690: else
691: identify_string(op)
692: end
693: end
694:
695: @OP.def_rules('?') do
696: |op, io|
697: if @lex_state == EXPR_END
698: @lex_state = EXPR_BEG
699: Token(TkQUESTION).set_text(op)
700: else
701: ch = getc
702: if @lex_state == EXPR_ARG && ch !~ /\s/
703: ungetc
704: @lex_state = EXPR_BEG;
705: Token(TkQUESTION).set_text(op)
706: else
707: str = op
708: str << ch
709: if (ch == '\\') #'
710: str << read_escape
711: end
712: @lex_state = EXPR_END
713: Token(TkINTEGER).set_text(str)
714: end
715: end
716: end
717:
718: @OP.def_rules("&", "&&", "|", "||") do
719: |op, io|
720: @lex_state = EXPR_BEG
721: Token(op).set_text(op)
722: end
723:
724: @OP.def_rules("+=", "-=", "*=", "**=",
725: "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
726: |op, io|
727: @lex_state = EXPR_BEG
728: op =~ /^(.*)=$/
729: Token(TkOPASGN, $1).set_text(op)
730: end
731:
732: @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
733: Token(TkUPLUS).set_text(op)
734: end
735:
736: @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
737: Token(TkUMINUS).set_text(op)
738: end
739:
740: @OP.def_rules("+", "-") do
741: |op, io|
742: catch(:RET) do
743: if @lex_state == EXPR_ARG
744: if @space_seen and peek(0) =~ /[0-9]/
745: throw :RET, identify_number(op)
746: else
747: @lex_state = EXPR_BEG
748: end
749: elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
750: throw :RET, identify_number(op)
751: else
752: @lex_state = EXPR_BEG
753: end
754: Token(op).set_text(op)
755: end
756: end
757:
758: @OP.def_rule(".") do
759: @lex_state = EXPR_BEG
760: if peek(0) =~ /[0-9]/
761: ungetc
762: identify_number("")
763: else
764: # for obj.if
765: @lex_state = EXPR_DOT
766: Token(TkDOT).set_text(".")
767: end
768: end
769:
770: @OP.def_rules("..", "...") do
771: |op, io|
772: @lex_state = EXPR_BEG
773: Token(op).set_text(op)
774: end
775:
776: lex_int2
777: end
# File parsers/parse_rb.rb, line 779
779: def lex_int2
780: @OP.def_rules("]", "}", ")") do
781: |op, io|
782: @lex_state = EXPR_END
783: @indent -= 1
784: Token(op).set_text(op)
785: end
786:
787: @OP.def_rule(":") do
788: if @lex_state == EXPR_END || peek(0) =~ /\s/
789: @lex_state = EXPR_BEG
790: tk = Token(TkCOLON)
791: else
792: @lex_state = EXPR_FNAME;
793: tk = Token(TkSYMBEG)
794: end
795: tk.set_text(":")
796: end
797:
798: @OP.def_rule("::") do
799: # p @lex_state.id2name, @space_seen
800: if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
801: @lex_state = EXPR_BEG
802: tk = Token(TkCOLON3)
803: else
804: @lex_state = EXPR_DOT
805: tk = Token(TkCOLON2)
806: end
807: tk.set_text("::")
808: end
809:
810: @OP.def_rule("/") do
811: |op, io|
812: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
813: identify_string(op)
814: elsif peek(0) == '='
815: getc
816: @lex_state = EXPR_BEG
817: Token(TkOPASGN, :/).set_text("/=") #")
818: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
819: identify_string(op)
820: else
821: @lex_state = EXPR_BEG
822: Token("/").set_text(op)
823: end
824: end
825:
826: @OP.def_rules("^") do
827: @lex_state = EXPR_BEG
828: Token("^").set_text("^")
829: end
830:
831: # @OP.def_rules("^=") do
832: # @lex_state = EXPR_BEG
833: # Token(TkOPASGN, :^)
834: # end
835:
836: @OP.def_rules(",", ";") do
837: |op, io|
838: @lex_state = EXPR_BEG
839: Token(op).set_text(op)
840: end
841:
842: @OP.def_rule("~") do
843: @lex_state = EXPR_BEG
844: Token("~").set_text("~")
845: end
846:
847: @OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
848: @lex_state = EXPR_BEG
849: Token("~").set_text("~@")
850: end
851:
852: @OP.def_rule("(") do
853: @indent += 1
854: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
855: @lex_state = EXPR_BEG
856: tk = Token(TkfLPAREN)
857: else
858: @lex_state = EXPR_BEG
859: tk = Token(TkLPAREN)
860: end
861: tk.set_text("(")
862: end
863:
864: @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
865: Token("[]").set_text("[]")
866: end
867:
868: @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
869: Token("[]=").set_text("[]=")
870: end
871:
872: @OP.def_rule("[") do
873: @indent += 1
874: if @lex_state == EXPR_FNAME
875: t = Token(TkfLBRACK)
876: else
877: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
878: t = Token(TkLBRACK)
879: elsif @lex_state == EXPR_ARG && @space_seen
880: t = Token(TkLBRACK)
881: else
882: t = Token(TkfLBRACK)
883: end
884: @lex_state = EXPR_BEG
885: end
886: t.set_text("[")
887: end
888:
889: @OP.def_rule("{") do
890: @indent += 1
891: if @lex_state != EXPR_END && @lex_state != EXPR_ARG
892: t = Token(TkLBRACE)
893: else
894: t = Token(TkfLBRACE)
895: end
896: @lex_state = EXPR_BEG
897: t.set_text("{")
898: end
899:
900: @OP.def_rule('\\') do #'
901: if getc == "\n"
902: @space_seen = true
903: @continue = true
904: Token(TkSPACE).set_text("\\\n")
905: else
906: ungetc
907: Token("\\").set_text("\\") #"
908: end
909: end
910:
911: @OP.def_rule('%') do
912: |op, io|
913: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
914: identify_quotation('%')
915: elsif peek(0) == '='
916: getc
917: Token(TkOPASGN, "%").set_text("%=")
918: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
919: identify_quotation('%')
920: else
921: @lex_state = EXPR_BEG
922: Token("%").set_text("%")
923: end
924: end
925:
926: @OP.def_rule('$') do #'
927: identify_gvar
928: end
929:
930: @OP.def_rule('@') do
931: if peek(0) =~ /[@\w_]/
932: ungetc
933: identify_identifier
934: else
935: Token("@").set_text("@")
936: end
937: end
938:
939: # @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
940: # |op, io|
941: # @indent += 1
942: # @lex_state = EXPR_FNAME
943: # # @lex_state = EXPR_END
944: # # until @rests[0] == "\n" or @rests[0] == ";"
945: # # rests.shift
946: # # end
947: # end
948:
949: @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
950: throw :eof
951: end
952:
953: @OP.def_rule("") do
954: |op, io|
955: printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
956: if peek(0) =~ /[0-9]/
957: t = identify_number("")
958: elsif peek(0) =~ /[\w_]/
959: t = identify_identifier
960: end
961: printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
962: t
963: end
964:
965: p @OP if RubyLex.debug?
966: end
# File parsers/parse_rb.rb, line 512
512: def peek_equal?(str)
513: @reader.peek_equal(str)
514: end
# File parsers/parse_rb.rb, line 1296
1296: def read_escape
1297: res = ""
1298: case ch = getc
1299: when /[0-7]/
1300: ungetc ch
1301: 3.times do
1302: case ch = getc
1303: when /[0-7]/
1304: when nil
1305: break
1306: else
1307: ungetc
1308: break
1309: end
1310: res << ch
1311: end
1312:
1313: when "x"
1314: res << ch
1315: 2.times do
1316: case ch = getc
1317: when /[0-9a-fA-F]/
1318: when nil
1319: break
1320: else
1321: ungetc
1322: break
1323: end
1324: res << ch
1325: end
1326:
1327: when "M"
1328: res << ch
1329: if (ch = getc) != '-'
1330: ungetc
1331: else
1332: res << ch
1333: if (ch = getc) == "\\" #"
1334: res << ch
1335: res << read_escape
1336: else
1337: res << ch
1338: end
1339: end
1340:
1341: when "C", "c" #, "^"
1342: res << ch
1343: if ch == "C" and (ch = getc) != "-"
1344: ungetc
1345: else
1346: res << ch
1347: if (ch = getc) == "\\" #"
1348: res << ch
1349: res << read_escape
1350: else
1351: res << ch
1352: end
1353: end
1354: else
1355: res << ch
1356: end
1357: res
1358: end
# File parsers/parse_rb.rb, line 1258
1258: def skip_inner_expression
1259: res = ""
1260: nest = 0
1261: while (ch = getc)
1262: res << ch
1263: if ch == '}'
1264: break if nest.zero?
1265: nest -= 1
1266: elsif ch == '{'
1267: nest += 1
1268: end
1269: end
1270: res
1271: end
# File parsers/parse_rb.rb, line 534
534: def token
535: set_token_position(line_no, char_no)
536: begin
537: begin
538: tk = @OP.match(self)
539: @space_seen = tk.kind_of?(TkSPACE)
540: rescue SyntaxError
541: abort if @exception_on_syntax_error
542: tk = TkError.new(line_no, char_no)
543: end
544: end while @skip_space and tk.kind_of?(TkSPACE)
545: if @read_auto_clean_up
546: get_read
547: end
548: # throw :eof unless tk
549: tk
550: end