| Class | RubyLex |
| In: |
parsers/parse_rb.rb
|
| Parent: | Object |
Lexical analyzer for Ruby source
| ENINDENT_CLAUSE | = | [ "case", "class", "def", "do", "for", "if", "module", "unless", "until", "while", "begin" ] |
| DEINDENT_CLAUSE | = | ["end"] |
| PERCENT_LTYPE | = | { "q" => "\'", "Q" => "\"", "x" => "\`", "r" => "/", "w" => "]" } |
| PERCENT_PAREN | = | { "{" => "}", "[" => "]", "<" => ">", "(" => ")" } |
| Ltype2Token | = | { "\'" => TkSTRING, "\"" => TkSTRING, "\`" => TkXSTRING, "/" => TkREGEXP, "]" => TkDSTRING } |
| DLtype2Token | = | { "\"" => TkDSTRING, "\`" => TkDXSTRING, "/" => TkDREGEXP, } |
| continue | [R] | include IRB # 1.8.2 doesn't support IRB::SLex |
| exception_on_syntax_error | [RW] | |
| indent | [R] | |
| lex_state | [R] | |
| read_auto_clean_up | [RW] | |
| skip_space | [RW] |
# File parsers/parse_rb.rb, line 447
447: def initialize(content)
448: lex_init
449:
450: @reader = BufferedReader.new(content)
451:
452: @exp_line_no = @line_no = 1
453: @base_char_no = 0
454: @indent = 0
455:
456: @ltype = nil
457: @quoted = nil
458: @lex_state = EXPR_BEG
459: @space_seen = false
460:
461: @continue = false
462: @line = ""
463:
464: @skip_space = false
465: @read_auto_clean_up = false
466: @exception_on_syntax_error = true
467: end
# File parsers/parse_rb.rb, line 495
495: def gets
496: c = getc or return
497: l = ""
498: begin
499: l.concat c unless c == "\r"
500: break if c == "\n"
501: end while c = getc
502: l
503: end
# File parsers/parse_rb.rb, line 1269
1269: def identify_comment
1270: @ltype = "#"
1271: comment = "#"
1272: while ch = getc
1273: if ch == "\\"
1274: ch = getc
1275: if ch == "\n"
1276: ch = " "
1277: else
1278: comment << "\\"
1279: end
1280: else
1281: if ch == "\n"
1282: @ltype = nil
1283: ungetc
1284: break
1285: end
1286: end
1287: comment << ch
1288: end
1289: return Token(TkCOMMENT).set_text(comment)
1290: end
# File parsers/parse_rb.rb, line 964
964: def identify_gvar
965: @lex_state = EXPR_END
966: str = "$"
967:
968: tk = case ch = getc
969: when /[~_*$?!@\/\\;,=:<>".]/ #"
970: str << ch
971: Token(TkGVAR, str)
972:
973: when "-"
974: str << "-" << getc
975: Token(TkGVAR, str)
976:
977: when "&", "`", "'", "+"
978: str << ch
979: Token(TkBACK_REF, str)
980:
981: when /[1-9]/
982: str << ch
983: while (ch = getc) =~ /[0-9]/
984: str << ch
985: end
986: ungetc
987: Token(TkNTH_REF)
988: when /\w/
989: ungetc
990: ungetc
991: return identify_identifier
992: else
993: ungetc
994: Token("$")
995: end
996: tk.set_text(str)
997: end
# File parsers/parse_rb.rb, line 1074
1074: def identify_here_document
1075: ch = getc
1076: if ch == "-"
1077: ch = getc
1078: indent = true
1079: end
1080: if /['"`]/ =~ ch # '
1081: lt = ch
1082: quoted = ""
1083: while (c = getc) && c != lt
1084: quoted.concat c
1085: end
1086: else
1087: lt = '"'
1088: quoted = ch.dup
1089: while (c = getc) && c =~ /\w/
1090: quoted.concat c
1091: end
1092: ungetc
1093: end
1094:
1095: ltback, @ltype = @ltype, lt
1096: reserve = ""
1097:
1098: while ch = getc
1099: reserve << ch
1100: if ch == "\\" #"
1101: ch = getc
1102: reserve << ch
1103: elsif ch == "\n"
1104: break
1105: end
1106: end
1107:
1108: str = ""
1109: while (l = gets)
1110: l.chomp!
1111: l.strip! if indent
1112: break if l == quoted
1113: str << l.chomp << "\n"
1114: end
1115:
1116: @reader.divert_read_from(reserve)
1117:
1118: @ltype = ltback
1119: @lex_state = EXPR_END
1120: Token(Ltype2Token[lt], str).set_text(str.dump)
1121: end
# File parsers/parse_rb.rb, line 999
999: def identify_identifier
1000: token = ""
1001: token.concat getc if peek(0) =~ /[$@]/
1002: token.concat getc if peek(0) == "@"
1003:
1004: while (ch = getc) =~ /\w|_/
1005: print ":", ch, ":" if RubyLex.debug?
1006: token.concat ch
1007: end
1008: ungetc
1009:
1010: if ch == "!" or ch == "?"
1011: token.concat getc
1012: end
1013: # fix token
1014:
1015: # $stderr.puts "identifier - #{token}, state = #@lex_state"
1016:
1017: case token
1018: when /^\$/
1019: return Token(TkGVAR, token).set_text(token)
1020: when /^\@/
1021: @lex_state = EXPR_END
1022: return Token(TkIVAR, token).set_text(token)
1023: end
1024:
1025: if @lex_state != EXPR_DOT
1026: print token, "\n" if RubyLex.debug?
1027:
1028: token_c, *trans = TkReading2Token[token]
1029: if token_c
1030: # reserved word?
1031:
1032: if (@lex_state != EXPR_BEG &&
1033: @lex_state != EXPR_FNAME &&
1034: trans[1])
1035: # modifiers
1036: token_c = TkSymbol2Token[trans[1]]
1037: @lex_state = trans[0]
1038: else
1039: if @lex_state != EXPR_FNAME
1040: if ENINDENT_CLAUSE.include?(token)
1041: @indent += 1
1042: elsif DEINDENT_CLAUSE.include?(token)
1043: @indent -= 1
1044: end
1045: @lex_state = trans[0]
1046: else
1047: @lex_state = EXPR_END
1048: end
1049: end
1050: return Token(token_c, token).set_text(token)
1051: end
1052: end
1053:
1054: if @lex_state == EXPR_FNAME
1055: @lex_state = EXPR_END
1056: if peek(0) == '='
1057: token.concat getc
1058: end
1059: elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
1060: @lex_state = EXPR_ARG
1061: else
1062: @lex_state = EXPR_END
1063: end
1064:
1065: if token[0, 1] =~ /[A-Z]/
1066: return Token(TkCONSTANT, token).set_text(token)
1067: elsif token[token.size - 1, 1] =~ /[!?]/
1068: return Token(TkFID, token).set_text(token)
1069: else
1070: return Token(TkIDENTIFIER, token).set_text(token)
1071: end
1072: end
# File parsers/parse_rb.rb, line 1142
1142: def identify_number(start)
1143: str = start.dup
1144:
1145: if start == "+" or start == "-" or start == ""
1146: start = getc
1147: str << start
1148: end
1149:
1150: @lex_state = EXPR_END
1151:
1152: if start == "0"
1153: if peek(0) == "x"
1154: ch = getc
1155: str << ch
1156: match = /[0-9a-f_]/
1157: else
1158: match = /[0-7_]/
1159: end
1160: while ch = getc
1161: if ch !~ match
1162: ungetc
1163: break
1164: else
1165: str << ch
1166: end
1167: end
1168: return Token(TkINTEGER).set_text(str)
1169: end
1170:
1171: type = TkINTEGER
1172: allow_point = TRUE
1173: allow_e = TRUE
1174: while ch = getc
1175: case ch
1176: when /[0-9_]/
1177: str << ch
1178:
1179: when allow_point && "."
1180: type = TkFLOAT
1181: if peek(0) !~ /[0-9]/
1182: ungetc
1183: break
1184: end
1185: str << ch
1186: allow_point = false
1187:
1188: when allow_e && "e", allow_e && "E"
1189: str << ch
1190: type = TkFLOAT
1191: if peek(0) =~ /[+-]/
1192: str << getc
1193: end
1194: allow_e = false
1195: allow_point = false
1196: else
1197: ungetc
1198: break
1199: end
1200: end
1201: Token(type).set_text(str)
1202: end
# File parsers/parse_rb.rb, line 1123
1123: def identify_quotation(initial_char)
1124: ch = getc
1125: if lt = PERCENT_LTYPE[ch]
1126: initial_char += ch
1127: ch = getc
1128: elsif ch =~ /\W/
1129: lt = "\""
1130: else
1131: RubyLex.fail SyntaxError, "unknown type of %string ('#{ch}')"
1132: end
1133: # if ch !~ /\W/
1134: # ungetc
1135: # next
1136: # end
1137: #@ltype = lt
1138: @quoted = ch unless @quoted = PERCENT_PAREN[ch]
1139: identify_string(lt, @quoted, ch, initial_char)
1140: end
# File parsers/parse_rb.rb, line 1204
1204: def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
1205: @ltype = ltype
1206: @quoted = quoted
1207: subtype = nil
1208:
1209: str = ""
1210: str << initial_char if initial_char
1211: str << (opener||quoted)
1212:
1213: nest = 0
1214: begin
1215: while ch = getc
1216: str << ch
1217: if @quoted == ch
1218: if nest == 0
1219: break
1220: else
1221: nest -= 1
1222: end
1223: elsif opener == ch
1224: nest += 1
1225: elsif @ltype != "'" && @ltype != "]" and ch == "#"
1226: ch = getc
1227: if ch == "{"
1228: subtype = true
1229: str << ch << skip_inner_expression
1230: else
1231: ungetc(ch)
1232: end
1233: elsif ch == '\\' #'
1234: str << read_escape
1235: end
1236: end
1237: if @ltype == "/"
1238: if peek(0) =~ /i|o|n|e|s/
1239: str << getc
1240: end
1241: end
1242: if subtype
1243: Token(DLtype2Token[ltype], str)
1244: else
1245: Token(Ltype2Token[ltype], str)
1246: end.set_text(str)
1247: ensure
1248: @ltype = nil
1249: @quoted = nil
1250: @lex_state = EXPR_END
1251: end
1252: end
# File parsers/parse_rb.rb, line 518
518: def lex
519: until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
520: !@continue or
521: tk.nil?)
522: end
523: line = get_read
524:
525: if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
526: nil
527: else
528: line
529: end
530: end
# File parsers/parse_rb.rb, line 588
588: def lex_init()
589: @OP = SLex.new
590: # @OP = IRB::SLex.new # 1.8.2 doesn't support #IRB::SLex
591: @OP.def_rules("\0", "\004", "\032") do |chars, io|
592: Token(TkEND_OF_SCRIPT).set_text(chars)
593: end
594:
595: @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
596: @space_seen = TRUE
597: while (ch = getc) =~ /[ \t\f\r\13]/
598: chars << ch
599: end
600: ungetc
601: Token(TkSPACE).set_text(chars)
602: end
603:
604: @OP.def_rule("#") do
605: |op, io|
606: identify_comment
607: end
608:
609: @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
610: |op, io|
611: str = op
612: @ltype = "="
613:
614:
615: begin
616: line = ""
617: begin
618: ch = getc
619: line << ch
620: end until ch == "\n"
621: str << line
622: end until line =~ /^=end/
623:
624: ungetc
625:
626: @ltype = nil
627:
628: if str =~ /\A=begin\s+rdoc/i
629: str.sub!(/\A=begin.*\n/, '')
630: str.sub!(/^=end.*/m, '')
631: Token(TkCOMMENT).set_text(str)
632: else
633: Token(TkRD_COMMENT)#.set_text(str)
634: end
635: end
636:
637: @OP.def_rule("\n") do
638: print "\\n\n" if RubyLex.debug?
639: case @lex_state
640: when EXPR_BEG, EXPR_FNAME, EXPR_DOT
641: @continue = TRUE
642: else
643: @continue = FALSE
644: @lex_state = EXPR_BEG
645: end
646: Token(TkNL).set_text("\n")
647: end
648:
649: @OP.def_rules("*", "**",
650: "!", "!=", "!~",
651: "=", "==", "===",
652: "=~", "<=>",
653: "<", "<=",
654: ">", ">=", ">>") do
655: |op, io|
656: @lex_state = EXPR_BEG
657: Token(op).set_text(op)
658: end
659:
660: @OP.def_rules("<<") do
661: |op, io|
662: tk = nil
663: if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
664: (@lex_state != EXPR_ARG || @space_seen)
665: c = peek(0)
666: if /[-\w_\"\'\`]/ =~ c
667: tk = identify_here_document
668: end
669: end
670: if !tk
671: @lex_state = EXPR_BEG
672: tk = Token(op).set_text(op)
673: end
674: tk
675: end
676:
677: @OP.def_rules("'", '"') do
678: |op, io|
679: identify_string(op)
680: end
681:
682: @OP.def_rules("`") do
683: |op, io|
684: if @lex_state == EXPR_FNAME
685: Token(op).set_text(op)
686: else
687: identify_string(op)
688: end
689: end
690:
691: @OP.def_rules('?') do
692: |op, io|
693: if @lex_state == EXPR_END
694: @lex_state = EXPR_BEG
695: Token(TkQUESTION).set_text(op)
696: else
697: ch = getc
698: if @lex_state == EXPR_ARG && ch !~ /\s/
699: ungetc
700: @lex_state = EXPR_BEG;
701: Token(TkQUESTION).set_text(op)
702: else
703: str = op
704: str << ch
705: if (ch == '\\') #'
706: str << read_escape
707: end
708: @lex_state = EXPR_END
709: Token(TkINTEGER).set_text(str)
710: end
711: end
712: end
713:
714: @OP.def_rules("&", "&&", "|", "||") do
715: |op, io|
716: @lex_state = EXPR_BEG
717: Token(op).set_text(op)
718: end
719:
720: @OP.def_rules("+=", "-=", "*=", "**=",
721: "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
722: |op, io|
723: @lex_state = EXPR_BEG
724: op =~ /^(.*)=$/
725: Token(TkOPASGN, $1).set_text(op)
726: end
727:
728: @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
729: Token(TkUPLUS).set_text(op)
730: end
731:
732: @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
733: Token(TkUMINUS).set_text(op)
734: end
735:
736: @OP.def_rules("+", "-") do
737: |op, io|
738: catch(:RET) do
739: if @lex_state == EXPR_ARG
740: if @space_seen and peek(0) =~ /[0-9]/
741: throw :RET, identify_number(op)
742: else
743: @lex_state = EXPR_BEG
744: end
745: elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
746: throw :RET, identify_number(op)
747: else
748: @lex_state = EXPR_BEG
749: end
750: Token(op).set_text(op)
751: end
752: end
753:
754: @OP.def_rule(".") do
755: @lex_state = EXPR_BEG
756: if peek(0) =~ /[0-9]/
757: ungetc
758: identify_number("")
759: else
760: # for obj.if
761: @lex_state = EXPR_DOT
762: Token(TkDOT).set_text(".")
763: end
764: end
765:
766: @OP.def_rules("..", "...") do
767: |op, io|
768: @lex_state = EXPR_BEG
769: Token(op).set_text(op)
770: end
771:
772: lex_int2
773: end
# File parsers/parse_rb.rb, line 775
775: def lex_int2
776: @OP.def_rules("]", "}", ")") do
777: |op, io|
778: @lex_state = EXPR_END
779: @indent -= 1
780: Token(op).set_text(op)
781: end
782:
783: @OP.def_rule(":") do
784: if @lex_state == EXPR_END || peek(0) =~ /\s/
785: @lex_state = EXPR_BEG
786: tk = Token(TkCOLON)
787: else
788: @lex_state = EXPR_FNAME;
789: tk = Token(TkSYMBEG)
790: end
791: tk.set_text(":")
792: end
793:
794: @OP.def_rule("::") do
795: # p @lex_state.id2name, @space_seen
796: if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
797: @lex_state = EXPR_BEG
798: tk = Token(TkCOLON3)
799: else
800: @lex_state = EXPR_DOT
801: tk = Token(TkCOLON2)
802: end
803: tk.set_text("::")
804: end
805:
806: @OP.def_rule("/") do
807: |op, io|
808: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
809: identify_string(op)
810: elsif peek(0) == '='
811: getc
812: @lex_state = EXPR_BEG
813: Token(TkOPASGN, :/).set_text("/=") #")
814: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
815: identify_string(op)
816: else
817: @lex_state = EXPR_BEG
818: Token("/").set_text(op)
819: end
820: end
821:
822: @OP.def_rules("^") do
823: @lex_state = EXPR_BEG
824: Token("^").set_text("^")
825: end
826:
827: # @OP.def_rules("^=") do
828: # @lex_state = EXPR_BEG
829: # Token(TkOPASGN, :^)
830: # end
831:
832: @OP.def_rules(",", ";") do
833: |op, io|
834: @lex_state = EXPR_BEG
835: Token(op).set_text(op)
836: end
837:
838: @OP.def_rule("~") do
839: @lex_state = EXPR_BEG
840: Token("~").set_text("~")
841: end
842:
843: @OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
844: @lex_state = EXPR_BEG
845: Token("~").set_text("~@")
846: end
847:
848: @OP.def_rule("(") do
849: @indent += 1
850: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
851: @lex_state = EXPR_BEG
852: tk = Token(TkfLPAREN)
853: else
854: @lex_state = EXPR_BEG
855: tk = Token(TkLPAREN)
856: end
857: tk.set_text("(")
858: end
859:
860: @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
861: Token("[]").set_text("[]")
862: end
863:
864: @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
865: Token("[]=").set_text("[]=")
866: end
867:
868: @OP.def_rule("[") do
869: @indent += 1
870: if @lex_state == EXPR_FNAME
871: t = Token(TkfLBRACK)
872: else
873: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
874: t = Token(TkLBRACK)
875: elsif @lex_state == EXPR_ARG && @space_seen
876: t = Token(TkLBRACK)
877: else
878: t = Token(TkfLBRACK)
879: end
880: @lex_state = EXPR_BEG
881: end
882: t.set_text("[")
883: end
884:
885: @OP.def_rule("{") do
886: @indent += 1
887: if @lex_state != EXPR_END && @lex_state != EXPR_ARG
888: t = Token(TkLBRACE)
889: else
890: t = Token(TkfLBRACE)
891: end
892: @lex_state = EXPR_BEG
893: t.set_text("{")
894: end
895:
896: @OP.def_rule('\\') do #'
897: if getc == "\n"
898: @space_seen = true
899: @continue = true
900: Token(TkSPACE).set_text("\\\n")
901: else
902: ungetc
903: Token("\\").set_text("\\") #"
904: end
905: end
906:
907: @OP.def_rule('%') do
908: |op, io|
909: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
910: identify_quotation('%')
911: elsif peek(0) == '='
912: getc
913: Token(TkOPASGN, "%").set_text("%=")
914: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
915: identify_quotation('%')
916: else
917: @lex_state = EXPR_BEG
918: Token("%").set_text("%")
919: end
920: end
921:
922: @OP.def_rule('$') do #'
923: identify_gvar
924: end
925:
926: @OP.def_rule('@') do
927: if peek(0) =~ /[@\w_]/
928: ungetc
929: identify_identifier
930: else
931: Token("@").set_text("@")
932: end
933: end
934:
935: # @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
936: # |op, io|
937: # @indent += 1
938: # @lex_state = EXPR_FNAME
939: # # @lex_state = EXPR_END
940: # # until @rests[0] == "\n" or @rests[0] == ";"
941: # # rests.shift
942: # # end
943: # end
944:
945: @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
946: throw :eof
947: end
948:
949: @OP.def_rule("") do
950: |op, io|
951: printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
952: if peek(0) =~ /[0-9]/
953: t = identify_number("")
954: elsif peek(0) =~ /[\w_]/
955: t = identify_identifier
956: end
957: printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
958: t
959: end
960:
961: p @OP if RubyLex.debug?
962: end
# File parsers/parse_rb.rb, line 510
510: def peek_equal?(str)
511: @reader.peek_equal(str)
512: end
# File parsers/parse_rb.rb, line 1292
1292: def read_escape
1293: res = ""
1294: case ch = getc
1295: when /[0-7]/
1296: ungetc ch
1297: 3.times do
1298: case ch = getc
1299: when /[0-7]/
1300: when nil
1301: break
1302: else
1303: ungetc
1304: break
1305: end
1306: res << ch
1307: end
1308:
1309: when "x"
1310: res << ch
1311: 2.times do
1312: case ch = getc
1313: when /[0-9a-fA-F]/
1314: when nil
1315: break
1316: else
1317: ungetc
1318: break
1319: end
1320: res << ch
1321: end
1322:
1323: when "M"
1324: res << ch
1325: if (ch = getc) != '-'
1326: ungetc
1327: else
1328: res << ch
1329: if (ch = getc) == "\\" #"
1330: res << ch
1331: res << read_escape
1332: else
1333: res << ch
1334: end
1335: end
1336:
1337: when "C", "c" #, "^"
1338: res << ch
1339: if ch == "C" and (ch = getc) != "-"
1340: ungetc
1341: else
1342: res << ch
1343: if (ch = getc) == "\\" #"
1344: res << ch
1345: res << read_escape
1346: else
1347: res << ch
1348: end
1349: end
1350: else
1351: res << ch
1352: end
1353: res
1354: end
# File parsers/parse_rb.rb, line 1254
1254: def skip_inner_expression
1255: res = ""
1256: nest = 0
1257: while (ch = getc)
1258: res << ch
1259: if ch == '}'
1260: break if nest.zero?
1261: nest -= 1
1262: elsif ch == '{'
1263: nest += 1
1264: end
1265: end
1266: res
1267: end
# File parsers/parse_rb.rb, line 532
532: def token
533: set_token_position(line_no, char_no)
534: begin
535: begin
536: tk = @OP.match(self)
537: @space_seen = tk.kind_of?(TkSPACE)
538: rescue SyntaxError
539: abort if @exception_on_syntax_error
540: tk = TkError.new(line_no, char_no)
541: end
542: end while @skip_space and tk.kind_of?(TkSPACE)
543: if @read_auto_clean_up
544: get_read
545: end
546: # throw :eof unless tk
547: p tk if $DEBUG
548: tk
549: end