Class: RubyLex
In: parsers/parse_rb.rb
Parent: Object

Lexical analyzer for Ruby source.
Constants:

ENINDENT_CLAUSE = ["case", "class", "def", "do", "for", "if", "module", "unless", "until", "while", "begin"]
DEINDENT_CLAUSE = ["end"]
PERCENT_LTYPE   = { "q" => "'", "Q" => "\"", "x" => "`", "r" => "/", "w" => "]" }
PERCENT_PAREN   = { "{" => "}", "[" => "]", "<" => ">", "(" => ")" }
Ltype2Token     = { "'" => TkSTRING, "\"" => TkSTRING, "`" => TkXSTRING, "/" => TkREGEXP, "]" => TkDSTRING }
DLtype2Token    = { "\"" => TkDSTRING, "`" => TkDXSTRING, "/" => TkDREGEXP }
Attributes:

continue                   [R]
exception_on_syntax_error  [RW]
indent                     [R]
lex_state                  [R]
read_auto_clean_up         [RW]
skip_space                 [RW]
# File parsers/parse_rb.rb, line 447
# Prepares the lexer to tokenize +content+: installs the operator
# rule table (lex_init), wraps the input in a BufferedReader, and
# resets all positional and lexical state.
def initialize(content)
  lex_init

  @reader = BufferedReader.new(content)

  # Position tracking.
  @exp_line_no = @line_no = 1
  @base_char_no = 0
  @indent = 0

  # Lexical state.
  @lex_state = EXPR_BEG
  @ltype = nil
  @quoted = nil
  @space_seen = false
  @continue = false
  @line = ""

  # Behaviour flags (see the corresponding accessors).
  @skip_space = false
  @read_auto_clean_up = false
  @exception_on_syntax_error = true
end
# File parsers/parse_rb.rb, line 496
# Reads one logical line from the input, up to and including the
# terminating "\n", dropping carriage returns.  Returns nil at EOF
# before any character is read.
def gets
  ch = getc
  return unless ch
  line = ""
  while ch
    line.concat ch unless ch == "\r"
    break if ch == "\n"
    ch = getc
  end
  line
end
# File parsers/parse_rb.rb, line 1270
# Lexes a "#" comment up to (but not including) the terminating
# newline; the newline is pushed back so the "\n" rule sees it.
# A backslash immediately before a newline continues the comment
# on the next line (the pair is recorded as a single space).
# Returns a TkCOMMENT token carrying the comment text.
def identify_comment
  @ltype = "#"
  comment = "#"
  while ch = getc
    if ch == "\\"
      ch = getc
      # Fix: at EOF right after a backslash, getc returns nil and the
      # original `comment << ch` raised TypeError; stop cleanly instead.
      break if ch.nil?
      if ch == "\n"
        ch = " "            # escaped newline: comment continues
      else
        comment << "\\"
      end
    else
      if ch == "\n"
        @ltype = nil
        ungetc              # leave the newline for the "\n" rule
        break
      end
    end
    comment << ch
  end
  return Token(TkCOMMENT).set_text(comment)
end
# File parsers/parse_rb.rb, line 965
# Lexes what follows a consumed "$": a special one-character global,
# a back-reference ($&, $`, $', $+), an nth-match reference ($1..),
# a named global ($foo, delegated to identify_identifier), or a
# bare "$".  Leaves @lex_state == EXPR_END.
965: def identify_gvar
966: @lex_state = EXPR_END
967: str = "$"
968:
# Dispatch on the character after "$"; cases are order-sensitive.
969: tk = case ch = getc
970: when /[~_*$?!@\/\\;,=:<>".]/ #"
971: str << ch
972: Token(TkGVAR, str)
973:
974: when "-"
# NOTE(review): getc can return nil at EOF here, making `str << nil`
# raise — confirm "$-" is never the last input.
975: str << "-" << getc
976: Token(TkGVAR, str)
977:
978: when "&", "`", "'", "+"
979: str << ch
980: Token(TkBACK_REF, str)
981:
# $1 .. $9 (and longer): consume all trailing digits, push back the
# first non-digit.
982: when /[1-9]/
983: str << ch
984: while (ch = getc) =~ /[0-9]/
985: str << ch
986: end
987: ungetc
# NOTE(review): unlike the other branches, no value is passed to
# Token here — presumably intentional; set_text below still applies.
988: Token(TkNTH_REF)
# Word character: an ordinary named global; push both chars back and
# let identify_identifier re-read "$name".
989: when /\w/
990: ungetc
991: ungetc
992: return identify_identifier
993: else
994: ungetc
995: Token("$")
996: end
997: tk.set_text(str)
998: end
# File parsers/parse_rb.rb, line 1075
# Lexes a here-document (<<TERM, <<-TERM, <<"TERM", <<'TERM', <<`TERM`).
# The remainder of the current physical line is saved in +reserve+ and
# re-injected via divert_read_from after the document body has been
# consumed, so lexing resumes on the << line.
1075: def identify_here_document
1076: ch = getc
# "-" form: the terminator may be indented (body lines get strip!-ed).
1077: if ch == "-"
1078: ch = getc
1079: indent = true
1080: end
# Quoted terminator: read up to the matching quote character.
1081: if /['"`]/ =~ ch # '
1082: lt = ch
1083: quoted = ""
1084: while (c = getc) && c != lt
1085: quoted.concat c
1086: end
1087: else
# Bare terminator: word characters only; treated as double-quoted.
1088: lt = '"'
1089: quoted = ch.dup
1090: while (c = getc) && c =~ /\w/
1091: quoted.concat c
1092: end
1093: ungetc
1094: end
1095:
1096: ltback, @ltype = @ltype, lt
1097: reserve = ""
1098:
# Save the rest of this line for later re-reading.
# NOTE(review): `reserve << ch` after the inner getc may receive nil
# at EOF right after a backslash — confirm input is well formed.
1099: while ch = getc
1100: reserve << ch
1101: if ch == "\\" #"
1102: ch = getc
1103: reserve << ch
1104: elsif ch == "\n"
1105: break
1106: end
1107: end
1108:
# Accumulate body lines until the terminator line is matched exactly.
1109: str = ""
1110: while (l = gets)
1111: l.chomp!
1112: l.strip! if indent
1113: break if l == quoted
1114: str << l.chomp << "\n"
1115: end
1116:
1117: @reader.divert_read_from(reserve)
1118:
1119: @ltype = ltback
1120: @lex_state = EXPR_END
1121: Token(Ltype2Token[lt], str).set_text(str.dump)
1122: end
# File parsers/parse_rb.rb, line 1000
# Lexes an identifier-like token: global/instance/class variables,
# reserved words (with indent bookkeeping), constants, bang/question
# methods, and plain identifiers.  Updates @lex_state according to
# the reserved-word transition table (TkReading2Token).
1000: def identify_identifier
1001: token = ""
# Leading "$" or "@" sigil, and a second "@" for class variables.
1002: token.concat getc if peek(0) =~ /[$@]/
1003: token.concat getc if peek(0) == "@"
1004:
1005: while (ch = getc) =~ /\w|_/
1006: print ":", ch, ":" if RubyLex.debug?
1007: token.concat ch
1008: end
1009: ungetc
1010:
# A trailing "!" or "?" belongs to the method name; re-consume it.
1011: if ch == "!" or ch == "?"
1012: token.concat getc
1013: end
1014: # fix token
1015:
1016: # $stderr.puts "identifier - #{token}, state = #@lex_state"
1017:
# Sigiled names are resolved immediately.
1018: case token
1019: when /^\$/
1020: return Token(TkGVAR, token).set_text(token)
1021: when /^\@/
1022: @lex_state = EXPR_END
1023: return Token(TkIVAR, token).set_text(token)
1024: end
1025:
# After ".", reserved words are treated as ordinary method names.
1026: if @lex_state != EXPR_DOT
1027: print token, "\n" if RubyLex.debug?
1028:
1029: token_c, *trans = TkReading2Token[token]
1030: if token_c
1031: # reserved word?
1032:
# trans[1] marks words with a modifier form (if/unless/while/until)
# valid mid-expression.
1033: if (@lex_state != EXPR_BEG &&
1034: @lex_state != EXPR_FNAME &&
1035: trans[1])
1036: # modifiers
1037: token_c = TkSymbol2Token[trans[1]]
1038: @lex_state = trans[0]
1039: else
1040: if @lex_state != EXPR_FNAME
# Track nesting depth for indent-opening/closing keywords.
1041: if ENINDENT_CLAUSE.include?(token)
1042: @indent += 1
1043: elsif DEINDENT_CLAUSE.include?(token)
1044: @indent -= 1
1045: end
1046: @lex_state = trans[0]
1047: else
1048: @lex_state = EXPR_END
1049: end
1050: end
1051: return Token(token_c, token).set_text(token)
1052: end
1053: end
1054:
# Not a reserved word: update state, absorb "=" after a method name
# in def position (setter definitions like foo=).
1055: if @lex_state == EXPR_FNAME
1056: @lex_state = EXPR_END
1057: if peek(0) == '='
1058: token.concat getc
1059: end
1060: elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
1061: @lex_state = EXPR_ARG
1062: else
1063: @lex_state = EXPR_END
1064: end
1065:
# Classify by shape: Capitalized → constant, !?-suffixed → FID.
1066: if token[0, 1] =~ /[A-Z]/
1067: return Token(TkCONSTANT, token).set_text(token)
1068: elsif token[token.size - 1, 1] =~ /[!?]/
1069: return Token(TkFID, token).set_text(token)
1070: else
1071: return Token(TkIDENTIFIER, token).set_text(token)
1072: end
1073: end
# File parsers/parse_rb.rb, line 1143
# Lexes a numeric literal.  +start+ is the text already consumed for
# this token: "+", "-", "" (sign only / nothing — the first digit is
# still unread) or the first digit itself.  Handles hex (0x...),
# octal (leading 0), decimal integers and floats with optional
# exponent.  Returns a TkINTEGER or TkFLOAT token; leaves
# @lex_state == EXPR_END.
def identify_number(start)
  str = start.dup

  # A sign (or nothing) was passed in: pull the first digit now.
  if start == "+" or start == "-" or start == ""
    start = getc
    str << start
  end

  @lex_state = EXPR_END

  if start == "0"
    if peek(0) == "x"
      ch = getc
      str << ch
      # Fix: accept uppercase hex digits too ("0xFF"); the original
      # /[0-9a-f_]/ silently stopped at the first A-F.
      match = /[0-9a-fA-F_]/
    else
      # Leading 0 without "x": octal literal.
      match = /[0-7_]/
    end
    while ch = getc
      if ch !~ match
        ungetc
        break
      else
        str << ch
      end
    end
    return Token(TkINTEGER).set_text(str)
  end

  type = TkINTEGER
  allow_point = TRUE
  allow_e = TRUE
  while ch = getc
    case ch
    when /[0-9_]/
      str << ch

    when allow_point && "."
      # "." continues the number only when a digit follows; otherwise
      # it is a method call or range, so push it back.
      type = TkFLOAT
      if peek(0) !~ /[0-9]/
        ungetc
        break
      end
      str << ch
      allow_point = false

    when allow_e && "e", allow_e && "E"
      # Exponent; an optional sign may immediately follow.
      str << ch
      type = TkFLOAT
      if peek(0) =~ /[+-]/
        str << getc
      end
      allow_e = false
      allow_point = false
    else
      ungetc
      break
    end
  end
  Token(type).set_text(str)
end
# File parsers/parse_rb.rb, line 1124
# Lexes a %-literal (%q, %Q, %w, %x, %r, or bare %<delim>).  A type
# character selects the string kind via PERCENT_LTYPE; a non-word
# character with no type letter means a double-quote-style literal.
# Paired bracket delimiters close with their PERCENT_PAREN partner;
# any other delimiter closes itself.
def identify_quotation(initial_char)
  delim = getc
  type = PERCENT_LTYPE[delim]
  if type
    initial_char += delim
    delim = getc
  elsif delim =~ /\W/
    type = "\""
  else
    RubyLex.fail SyntaxError, "unknown type of %string ('#{delim}')"
  end
  @quoted = PERCENT_PAREN[delim] || delim
  identify_string(type, @quoted, delim, initial_char)
end
# File parsers/parse_rb.rb, line 1205
# Lexes a string-like literal (quoted string, xstring, regexp,
# %-literal) given its type character and delimiters.  Tracks nesting
# of paired delimiters, skips #{...} interpolations, and follows
# backslash escapes.  Always clears @ltype/@quoted and sets
# @lex_state == EXPR_END on the way out (ensure).
1205: def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
1206: @ltype = ltype
1207: @quoted = quoted
1208: subtype = nil
1209:
1210: str = ""
1211: str << initial_char if initial_char
1212: str << (opener||quoted)
1213:
# nest counts unclosed opener delimiters inside the literal.
1214: nest = 0
1215: begin
1216: while ch = getc
1217: str << ch
1218: if @quoted == ch
1219: if nest == 0
1220: break
1221: else
1222: nest -= 1
1223: end
1224: elsif opener == ch
1225: nest += 1
# "#" can start an interpolation except in ' and %w literals
# ("]" is the PERCENT_LTYPE marker for %w).
1226: elsif @ltype != "'" && @ltype != "]" and ch == "#"
1227: ch = getc
1228: if ch == "{"
# subtype marks the literal as dynamic (DLtype2Token below).
1229: subtype = true
1230: str << ch << skip_inner_expression
1231: else
1232: ungetc(ch)
1233: end
1234: elsif ch == '\\' #'
1235: str << read_escape
1236: end
1237: end
# Regexp literals may carry trailing option flags.
1238: if @ltype == "/"
1239: if peek(0) =~ /i|o|n|e|s/
1240: str << getc
1241: end
1242: end
1243: if subtype
1244: Token(DLtype2Token[ltype], str)
1245: else
1246: Token(Ltype2Token[ltype], str)
1247: end.set_text(str)
1248: ensure
1249: @ltype = nil
1250: @quoted = nil
1251: @lex_state = EXPR_END
1252: end
1253: end
# File parsers/parse_rb.rb, line 519
# Consumes tokens until a statement-terminating newline or
# end-of-script arrives outside a continued line, or the token
# stream runs dry; then returns the raw source text read for those
# tokens, or nil when nothing remains.
def lex
  tk = nil
  until (tk = token).nil? ||
        ((tk.kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) && !@continue)
  end

  line = get_read

  if line == "" && (tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?)
    nil
  else
    line
  end
end
# File parsers/parse_rb.rb, line 589
# Builds the SLex operator-dispatch table: each def_rule(s) maps a
# leading character sequence (optionally guarded by a proc) to a
# block producing the next token.  Continues in lex_int2.
589: def lex_init()
590: @OP = SLex.new
591: # @OP = IRB::SLex.new # 1.8 doesn't support #IRB::SLex
# NUL / ^D / ^Z terminate the script.
592: @OP.def_rules("\0", "\004", "\032") do |chars, io|
593: Token(TkEND_OF_SCRIPT).set_text(chars)
594: end
595:
# Runs of horizontal whitespace collapse into one TkSPACE.
596: @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
597: @space_seen = TRUE
598: while (ch = getc) =~ /[ \t\f\r\13]/
599: chars << ch
600: end
601: ungetc
602: Token(TkSPACE).set_text(chars)
603: end
604:
605: @OP.def_rule("#") do
606: |op, io|
607: identify_comment
608: end
609:
# =begin/=end RD block; only recognized at column 0 (guard proc).
610: @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
611: |op, io|
612: str = op
613: @ltype = "="
614:
615:
616: begin
617: line = ""
618: begin
619: ch = getc
620: line << ch
621: end until ch == "\n"
622: str << line
623: end until line =~ /^=end/
624:
625: ungetc
626:
627: @ltype = nil
628:
# "=begin rdoc" blocks are kept as comment text; others discarded.
629: if str =~ /\A=begin\s+rdoc/i
630: str.sub!(/\A=begin.*\n/, '')
631: str.sub!(/^=end.*/m, '')
632: Token(TkCOMMENT).set_text(str)
633: else
634: Token(TkRD_COMMENT)#.set_text(str)
635: end
636: end
637:
# Newline: a statement continues when we are mid-expression.
638: @OP.def_rule("\n") do
639: print "\\n\n" if RubyLex.debug?
640: case @lex_state
641: when EXPR_BEG, EXPR_FNAME, EXPR_DOT
642: @continue = TRUE
643: else
644: @continue = FALSE
645: @lex_state = EXPR_BEG
646: end
647: Token(TkNL).set_text("\n")
648: end
649:
# Simple binary operators: emit as-is, expression restarts after.
650: @OP.def_rules("*", "**",
651: "!", "!=", "!~",
652: "=", "==", "===",
653: "=~", "<=>",
654: "<", "<=",
655: ">", ">=", ">>") do
656: |op, io|
657: @lex_state = EXPR_BEG
658: Token(op).set_text(op)
659: end
660:
# "<<": heredoc start when state and lookahead allow, else shift op.
661: @OP.def_rules("<<") do
662: |op, io|
663: tk = nil
664: if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
665: (@lex_state != EXPR_ARG || @space_seen)
666: c = peek(0)
667: if /[-\w_\"\'\`]/ =~ c
668: tk = identify_here_document
669: end
670: end
671: if !tk
672: @lex_state = EXPR_BEG
673: tk = Token(op).set_text(op)
674: end
675: tk
676: end
677:
678: @OP.def_rules("'", '"') do
679: |op, io|
680: identify_string(op)
681: end
682:
# Backquote is a method name after "def"/"."; a command string otherwise.
683: @OP.def_rules("`") do
684: |op, io|
685: if @lex_state == EXPR_FNAME
686: Token(op).set_text(op)
687: else
688: identify_string(op)
689: end
690: end
691:
# "?": ternary operator or character literal, depending on state.
692: @OP.def_rules('?') do
693: |op, io|
694: if @lex_state == EXPR_END
695: @lex_state = EXPR_BEG
696: Token(TkQUESTION).set_text(op)
697: else
698: ch = getc
699: if @lex_state == EXPR_ARG && ch !~ /\s/
700: ungetc
701: @lex_state = EXPR_BEG;
702: Token(TkQUESTION).set_text(op)
703: else
704: str = op
705: str << ch
706: if (ch == '\\') #'
707: str << read_escape
708: end
709: @lex_state = EXPR_END
710: Token(TkINTEGER).set_text(str)
711: end
712: end
713: end
714:
715: @OP.def_rules("&", "&&", "|", "||") do
716: |op, io|
717: @lex_state = EXPR_BEG
718: Token(op).set_text(op)
719: end
720:
# Op-assignment: capture the bare operator ($1) for the token value.
721: @OP.def_rules("+=", "-=", "*=", "**=",
722: "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
723: |op, io|
724: @lex_state = EXPR_BEG
725: op =~ /^(.*)=$/
726: Token(TkOPASGN, $1).set_text(op)
727: end
728:
# Unary +@/-@ only valid as method names (def +@ / def -@).
729: @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
730: Token(TkUPLUS).set_text(op)
731: end
732:
733: @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
734: Token(TkUMINUS).set_text(op)
735: end
736:
# "+"/"-": sign of a numeric literal or binary operator.
737: @OP.def_rules("+", "-") do
738: |op, io|
739: catch(:RET) do
740: if @lex_state == EXPR_ARG
741: if @space_seen and peek(0) =~ /[0-9]/
742: throw :RET, identify_number(op)
743: else
744: @lex_state = EXPR_BEG
745: end
746: elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
747: throw :RET, identify_number(op)
748: else
749: @lex_state = EXPR_BEG
750: end
751: Token(op).set_text(op)
752: end
753: end
754:
# ".": leading decimal point of a float, or method call dot.
755: @OP.def_rule(".") do
756: @lex_state = EXPR_BEG
757: if peek(0) =~ /[0-9]/
758: ungetc
759: identify_number("")
760: else
761: # for obj.if
762: @lex_state = EXPR_DOT
763: Token(TkDOT).set_text(".")
764: end
765: end
766:
767: @OP.def_rules("..", "...") do
768: |op, io|
769: @lex_state = EXPR_BEG
770: Token(op).set_text(op)
771: end
772:
773: lex_int2
774: end
# File parsers/parse_rb.rb, line 776
# Second half of the rule-table setup started in lex_init: closing
# brackets, colons, slash (regexp vs. divide), brackets/braces with
# state-dependent token kinds, %, $, @, __END__ and the catch-all
# rule for numbers and identifiers.
776: def lex_int2
777: @OP.def_rules("]", "}", ")") do
778: |op, io|
779: @lex_state = EXPR_END
780: @indent -= 1
781: Token(op).set_text(op)
782: end
783:
# ":": plain colon, or start of a symbol when glued to what follows.
784: @OP.def_rule(":") do
785: if @lex_state == EXPR_END || peek(0) =~ /\s/
786: @lex_state = EXPR_BEG
787: tk = Token(TkCOLON)
788: else
789: @lex_state = EXPR_FNAME;
790: tk = Token(TkSYMBEG)
791: end
792: tk.set_text(":")
793: end
794:
# "::": top-level constant reference vs. scope operator.
795: @OP.def_rule("::") do
796: # p @lex_state.id2name, @space_seen
797: if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
798: @lex_state = EXPR_BEG
799: tk = Token(TkCOLON3)
800: else
801: @lex_state = EXPR_DOT
802: tk = Token(TkCOLON2)
803: end
804: tk.set_text("::")
805: end
806:
# "/": regexp start, "/=", or division, depending on state/lookahead.
807: @OP.def_rule("/") do
808: |op, io|
809: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
810: identify_string(op)
811: elsif peek(0) == '='
812: getc
813: @lex_state = EXPR_BEG
814: Token(TkOPASGN, :/).set_text("/=") #")
815: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
816: identify_string(op)
817: else
818: @lex_state = EXPR_BEG
819: Token("/").set_text(op)
820: end
821: end
822:
823: @OP.def_rules("^") do
824: @lex_state = EXPR_BEG
825: Token("^").set_text("^")
826: end
827:
828: # @OP.def_rules("^=") do
829: # @lex_state = EXPR_BEG
830: # Token(TkOPASGN, :^)
831: # end
832:
833: @OP.def_rules(",", ";") do
834: |op, io|
835: @lex_state = EXPR_BEG
836: Token(op).set_text(op)
837: end
838:
839: @OP.def_rule("~") do
840: @lex_state = EXPR_BEG
841: Token("~").set_text("~")
842: end
843:
# NOTE(review): this guard uses "=" (assignment), not "==" — it
# always passes and clobbers @lex_state; compare the +@/-@ guards
# above, which use "==".  Confirm whether this is intentional.
844: @OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
845: @lex_state = EXPR_BEG
846: Token("~").set_text("~@")
847: end
848:
# "(": f-paren (call parens) vs. grouping paren by state.
849: @OP.def_rule("(") do
850: @indent += 1
851: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
852: @lex_state = EXPR_BEG
853: tk = Token(TkfLPAREN)
854: else
855: @lex_state = EXPR_BEG
856: tk = Token(TkLPAREN)
857: end
858: tk.set_text("(")
859: end
860:
# "[]"/"[]=" as method names in def position.
861: @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
862: Token("[]").set_text("[]")
863: end
864:
865: @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
866: Token("[]=").set_text("[]=")
867: end
868:
# "[": array literal vs. index operator by state.
869: @OP.def_rule("[") do
870: @indent += 1
871: if @lex_state == EXPR_FNAME
872: t = Token(TkfLBRACK)
873: else
874: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
875: t = Token(TkLBRACK)
876: elsif @lex_state == EXPR_ARG && @space_seen
877: t = Token(TkLBRACK)
878: else
879: t = Token(TkfLBRACK)
880: end
881: @lex_state = EXPR_BEG
882: end
883: t.set_text("[")
884: end
885:
# "{": block/hash brace vs. f-brace by state.
886: @OP.def_rule("{") do
887: @indent += 1
888: if @lex_state != EXPR_END && @lex_state != EXPR_ARG
889: t = Token(TkLBRACE)
890: else
891: t = Token(TkfLBRACE)
892: end
893: @lex_state = EXPR_BEG
894: t.set_text("{")
895: end
896:
# Backslash-newline: line continuation counted as whitespace.
897: @OP.def_rule('\\') do #'
898: if getc == "\n"
899: @space_seen = true
900: @continue = true
901: Token(TkSPACE).set_text("\\\n")
902: else
903: ungetc
904: Token("\\").set_text("\\") #"
905: end
906: end
907:
# "%": %-literal, "%=", or modulo, depending on state/lookahead.
908: @OP.def_rule('%') do
909: |op, io|
910: if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
911: identify_quotation('%')
912: elsif peek(0) == '='
913: getc
914: Token(TkOPASGN, "%").set_text("%=")
915: elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
916: identify_quotation('%')
917: else
918: @lex_state = EXPR_BEG
919: Token("%").set_text("%")
920: end
921: end
922:
923: @OP.def_rule('$') do #'
924: identify_gvar
925: end
926:
927: @OP.def_rule('@') do
928: if peek(0) =~ /[@\w_]/
929: ungetc
930: identify_identifier
931: else
932: Token("@").set_text("@")
933: end
934: end
935:
936: # @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
937: # |op, io|
938: # @indent += 1
939: # @lex_state = EXPR_FNAME
940: # # @lex_state = EXPR_END
941: # # until @rests[0] == "\n" or @rests[0] == ";"
942: # # rests.shift
943: # # end
944: # end
945:
# __END__ at column 0 followed by a line break ends lexing.
946: @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
947: throw :eof
948: end
949:
# Catch-all: anything not matched above is a number or identifier.
950: @OP.def_rule("") do
951: |op, io|
952: printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
953: if peek(0) =~ /[0-9]/
954: t = identify_number("")
955: elsif peek(0) =~ /[\w_]/
956: t = identify_identifier
957: end
958: printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
959: t
960: end
961:
962: p @OP if RubyLex.debug?
963: end
# File parsers/parse_rb.rb, line 511
# Returns whether the unread input begins with +str+, without
# consuming anything (delegates to the BufferedReader).
511: def peek_equal?(str)
512: @reader.peek_equal(str)
513: end
# File parsers/parse_rb.rb, line 1293
# Reads the body of a backslash escape (the "\\" itself has already
# been consumed) and returns the consumed text: octal (\nnn), hex
# (\xNN), meta (\M-...), and control (\C-..., \c..., \^...) forms,
# recursing for nested escapes.  Single characters pass through.
1293: def read_escape
1294: res = ""
1295: case ch = getc
# Up to three octal digits.
1296: when /[0-7]/
1297: ungetc ch
1298: 3.times do
1299: case ch = getc
1300: when /[0-7]/
1301: when nil
1302: break
1303: else
1304: ungetc
1305: break
1306: end
1307: res << ch
1308: end
1309:
# "x" plus up to two hex digits.
1310: when "x"
1311: res << ch
1312: 2.times do
1313: case ch = getc
1314: when /[0-9a-fA-F]/
1315: when nil
1316: break
1317: else
1318: ungetc
1319: break
1320: end
1321: res << ch
1322: end
1323:
# Meta escape: \M-<char> (possibly itself escaped).
1324: when "M"
1325: res << ch
1326: if (ch = getc) != '-'
1327: ungetc
1328: else
1329: res << ch
1330: if (ch = getc) == "\\" #"
1331: res << ch
1332: res << read_escape
1333: else
1334: res << ch
1335: end
1336: end
1337:
# Control escapes: \C-x, \cx, \^x.
# NOTE(review): for "c" and "^" the `ch == "C"` test short-circuits,
# so the else branch re-appends the same ch already pushed at 1339
# before reading the controlled character — confirm this duplication
# is intended.
1338: when "C", "c", "^"
1339: res << ch
1340: if ch == "C" and (ch = getc) != "-"
1341: ungetc
1342: else
1343: res << ch
1344: if (ch = getc) == "\\" #"
1345: res << ch
1346: res << read_escape
1347: else
1348: res << ch
1349: end
1350: end
1351: else
1352: res << ch
1353: end
1354: res
1355: end
# File parsers/parse_rb.rb, line 1255
# Consumes a #{...} interpolation body through its balanced closing
# brace and returns the consumed text, closing "}" included.
def skip_inner_expression
  buf = ""
  depth = 0
  while (ch = getc)
    buf << ch
    case ch
    when "}"
      break if depth.zero?
      depth -= 1
    when "{"
      depth += 1
    end
  end
  buf
end
# File parsers/parse_rb.rb, line 533
# Fetches the next token via the operator table, optionally skipping
# whitespace tokens (@skip_space) and discarding buffered source text
# (@read_auto_clean_up).  A SyntaxError becomes a TkError token, or
# aborts when @exception_on_syntax_error is set.
def token
  set_token_position(line_no, char_no)
  tk = nil
  loop do
    begin
      tk = @OP.match(self)
      @space_seen = tk.kind_of?(TkSPACE)
    rescue SyntaxError
      abort if @exception_on_syntax_error
      tk = TkError.new(line_no, char_no)
    end
    break unless @skip_space and tk.kind_of?(TkSPACE)
  end
  get_read if @read_auto_clean_up
  p tk if $DEBUG
  tk
end