編譯之前對源代碼進行字符集映射,續行符處理,注釋處理,詞法分析等過程,具體是怎樣的?
源代碼 Parser/Tokenizer.c 進行詞法分析。首先是依照編碼進行解碼。然后通過tok_get 函數進行詞法分析。
首先處理縮進;
然后是消除空白字符、注釋以及空白行;
再然后是標識符、數字、字符串;
再然后是續行符('\'+直接輸入換行);
最后是操作符。
/* Scan the next token from the tokenizer state `tok` and return its type.
 *
 * On return, *p_start and *p_end delimit the token text (p_end is one past
 * the last character).  Both are reset to NULL on entry and stay NULL for
 * the INDENT, DEDENT and ENDMARKER returns and for the explicit ERRORTOKEN
 * returns in this function.
 *
 * Where this function detects an error itself it stores an E_* code in
 * tok->done before returning ERRORTOKEN; errors reported by helpers
 * (indenterror, verify_identifier, tok_decimal_tail) are passed through
 * as ERRORTOKEN without touching tok->done here.
 *
 * Scanning order:
 *   1. at the beginning of a line, measure indentation and queue
 *      INDENT/DEDENT tokens in tok->pendin;
 *   2. emit one queued INDENT/DEDENT per call until the queue is empty;
 *   3. skip blanks and a trailing comment, then check for EOF;
 *   4. identifier / string-prefix letters (with the async/await special
 *      case), newline, period/ellipsis, number, string, backslash line
 *      continuation, two- and three-character operators, and finally
 *      single-character punctuation (tracking bracket depth).
 */
static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
int c;
int blankline, nonascii;
*p_start = *p_end = NULL;
/* Re-entered from the newline branch when the line just finished produced
   no NEWLINE token (blank/comment-only line, or inside brackets). */
nextline:
tok->start = NULL;
blankline = 0;
/* Get indentation level */
/* atbol is set by the newline branch below; clearing it here means the
   indentation is measured once per line. */
if (tok->atbol) {
/* The column is computed twice, with tok->tabsize and tok->alttabsize.
   Both results are compared with their own stacks further down; a
   disagreement between the two is reported through indenterror(). */
int col = 0;
int altcol = 0;
tok->atbol = 0;
for (;;) {
c = tok_nextc(tok);
if (c == ' ') {
col++, altcol++;
}
else if (c == '\t') {
/* Advance to the next multiple of the tab size. */
col = (col/tok->tabsize + 1) * tok->tabsize;
altcol = (altcol/tok->alttabsize + 1)
* tok->alttabsize;
}
else if (c == '\014') {/* Control-L (formfeed) */
col = altcol = 0; /* For Emacs users */
}
else {
break;
}
}
/* Push back the first character that is not indentation. */
tok_backup(tok, c);
if (c == '#' || c == '\n') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
except *totally* empty lines in interactive
mode, which signal the end of a command group. */
if (col == 0 && c == '\n' && tok->prompt != NULL) {
blankline = 0; /* Let it through */
}
else {
blankline = 1; /* Ignore completely */
}
/* We can't jump back right here since we still
may need to skip to the end of a comment */
}
/* tok->level is the bracket nesting depth maintained at the end of this
   function; indentation is only significant at depth 0. */
if (!blankline && tok->level == 0) {
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
if (indenterror(tok)) {
return ERRORTOKEN;
}
}
}
else if (col > tok->indstack[tok->indent]) {
/* Indent -- always one */
if (tok->indent+1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
}
if (altcol <= tok->altindstack[tok->indent]) {
if (indenterror(tok)) {
return ERRORTOKEN;
}
}
/* Queue one INDENT and push the new level on both stacks. */
tok->pendin++;
tok->indstack[++tok->indent] = col;
tok->altindstack[tok->indent] = altcol;
}
else /* col < tok->indstack[tok->indent] */ {
/* Dedent -- any number, must be consistent */
/* Pop levels, queueing one DEDENT per popped level. */
while (tok->indent > 0 &&
col < tok->indstack[tok->indent]) {
tok->pendin--;
tok->indent--;
}
/* The new column must match an enclosing level exactly. */
if (col != tok->indstack[tok->indent]) {
tok->done = E_DEDENT;
tok->cur = tok->inp;
return ERRORTOKEN;
}
if (altcol != tok->altindstack[tok->indent]) {
if (indenterror(tok)) {
return ERRORTOKEN;
}
}
}
}
}
tok->start = tok->cur;
/* Return pending indents/dedents */
/* pendin > 0 means INDENTs are queued, pendin < 0 means DEDENTs are
   queued; one is emitted per call and the counter moves toward zero. */
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
return DEDENT;
}
else {
tok->pendin--;
return INDENT;
}
}
/* Leave "inside an async def" mode once a non-blank line at bracket
   depth 0 is back at (or above) the indentation of the 'async def'. */
if (tok->async_def
&& !blankline
&& tok->level == 0
/* There was a NEWLINE after ASYNC DEF,
so we're past the signature. */
&& tok->async_def_nl
/* Current indentation level is no deeper than where
the async function was defined */
&& tok->async_def_indent >= tok->indent)
{
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
}
/* Re-entered after a backslash line continuation: scanning resumes on the
   following line without any indentation processing. */
again:
tok->start = NULL;
/* Skip spaces */
do {
c = tok_nextc(tok);
} while (c == ' ' || c == '\t' || c == '\014');
/* Set start of current token */
/* tok->cur is taken to be one past `c` here (the newline branch below
   relies on the same offset), so step back one character. */
tok->start = tok->cur - 1;
/* Skip comment */
/* The loop stops with c holding '\n' or EOF, so the EOF and newline
   handling below still sees the end of the comment line. */
if (c == '#') {
while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}
}
/* Check for EOF and errors now */
if (c == EOF) {
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
}
/* Identifier (most frequent token!) */
nonascii = 0;
if (is_potential_identifier_start(c)) {
/* Process the various legal combinations of b"", r"", u"", and f"". */
/* Each pass accepts one more prefix letter if it is compatible with the
   letters seen so far.  If a quote follows, the text is a prefixed
   string literal; otherwise the first incompatible character ends the
   loop and the text is scanned as an ordinary identifier. */
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
while (1) {
if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
saw_b = 1;
/* Since this is a backwards compatibility support literal we don't
want to support it in arbitrary order like byte literals. */
else if (!(saw_b || saw_u || saw_r || saw_f)
&& (c == 'u'|| c == 'U')) {
saw_u = 1;
}
/* ur"" and ru"" are not supported */
else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
saw_r = 1;
}
else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
saw_f = 1;
}
else {
break;
}
c = tok_nextc(tok);
if (c == '"' || c == '\'') {
goto letter_quote;
}
}
/* Consume the rest of the identifier, noting any byte >= 128 so the
   identifier can be validated by verify_identifier() afterwards. */
while (is_potential_identifier_char(c)) {
if (c >= 128) {
nonascii = 1;
}
c = tok_nextc(tok);
}
tok_backup(tok, c);
if (nonascii && !verify_identifier(tok)) {
return ERRORTOKEN;
}
*p_start = tok->start;
*p_end = tok->cur;
/* async/await parsing block. */
/* Both keywords are five characters long, so the length test filters
   out every other identifier before the memcmp calls. */
if (tok->cur - tok->start == 5) {
/* Current token length is 5. */
if (tok->async_def) {
/* We're inside an 'async def' function. */
if (memcmp(tok->start, "async", 5) == 0) {
return ASYNC;
}
if (memcmp(tok->start, "await", 5) == 0) {
return AWAIT;
}
}
else if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'.
Look ahead one token.*/
/* The lookahead runs on a shallow copy of the state, so the fields
   of `tok` itself are not advanced by it.
   NOTE(review): anything reached through pointers shared with the
   copy is not protected by this -- confirm against tok_nextc. */
struct tok_state ahead_tok;
char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
&ahead_tok_end);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
&& memcmp(ahead_tok.start, "def", 3) == 0)
{
/* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
return ASYNC;
}
}
}
return NAME;
}
/* Newline */
if (c == '\n') {
tok->atbol = 1;
/* No NEWLINE token for ignored lines or for line breaks inside
   brackets; start over on the next line instead. */
if (blankline || tok->level > 0) {
goto nextline;
}
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
return NEWLINE;
}
/* Period or number starting with period? */
if (c == '.') {
c = tok_nextc(tok);
if (isdigit(c)) {
/* ".5"-style literal: continue in the fraction scanner. */
goto fraction;
} else if (c == '.') {
c = tok_nextc(tok);
if (c == '.') {
*p_start = tok->start;
*p_end = tok->cur;
return ELLIPSIS;
}
else {
tok_backup(tok, c);
}
/* Only two dots: push the second one back too and return a single
   DOT for the first. */
tok_backup(tok, '.');
}
else {
tok_backup(tok, c);
}
*p_start = tok->start;
*p_end = tok->cur;
return DOT;
}
/* Number */
if (isdigit(c)) {
if (c == '0') {
/* Hex, octal or binary -- maybe. */
c = tok_nextc(tok);
/* In the three prefixed forms below, each outer-loop pass accepts an
   optional single '_' followed by at least one digit of the base;
   the loop repeats while the digit run is followed by another '_'. */
if (c == 'x' || c == 'X') {
/* Hex */
c = tok_nextc(tok);
do {
if (c == '_') {
c = tok_nextc(tok);
}
if (!isxdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
do {
c = tok_nextc(tok);
} while (isxdigit(c));
} while (c == '_');
}
else if (c == 'o' || c == 'O') {
/* Octal */
c = tok_nextc(tok);
do {
if (c == '_') {
c = tok_nextc(tok);
}
if (c < '0' || c >= '8') {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
do {
c = tok_nextc(tok);
} while ('0' <= c && c < '8');
} while (c == '_');
}
else if (c == 'b' || c == 'B') {
/* Binary */
c = tok_nextc(tok);
do {
if (c == '_') {
c = tok_nextc(tok);
}
if (c != '0' && c != '1') {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
do {
c = tok_nextc(tok);
} while (c == '0' || c == '1');
} while (c == '_');
}
else {
int nonzero = 0;
/* maybe old-style octal; c is first char of it */
/* in any case, allow '0' as a literal */
/* Consume a run of zeros, allowing '_' separators as long as each
   one is followed by a digit. */
while (1) {
if (c == '_') {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
}
if (c != '0') {
break;
}
c = tok_nextc(tok);
}
/* A non-zero digit after leading zeros is only legal if the literal
   turns out to be a float or imaginary number (checked below).
   A 0 result from tok_decimal_tail() is treated as an error. */
if (isdigit(c)) {
nonzero = 1;
c = tok_decimal_tail(tok);
if (c == 0) {
return ERRORTOKEN;
}
}
if (c == '.') {
c = tok_nextc(tok);
goto fraction;
}
else if (c == 'e' || c == 'E') {
goto exponent;
}
else if (c == 'j' || c == 'J') {
goto imaginary;
}
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
}
}
else {
/* Decimal */
c = tok_decimal_tail(tok);
if (c == 0) {
return ERRORTOKEN;
}
{
/* Accept floating point numbers. */
if (c == '.') {
c = tok_nextc(tok);
/* Also entered from the leading-period and leading-zero paths, with c
   holding the character that follows the '.'. */
fraction:
/* Fraction */
if (isdigit(c)) {
c = tok_decimal_tail(tok);
if (c == 0) {
return ERRORTOKEN;
}
}
}
if (c == 'e' || c == 'E') {
int e;
/* Also entered from the leading-zero path with c holding 'e' or 'E'. */
exponent:
/* Remember which letter was read so it can be pushed back below. */
e = c;
/* Exponent part */
c = tok_nextc(tok);
if (c == '+' || c == '-') {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
} else if (!isdigit(c)) {
/* The 'e' does not start an exponent: push back both characters and
   return the number scanned so far. */
tok_backup(tok, c);
tok_backup(tok, e);
*p_start = tok->start;
*p_end = tok->cur;
return NUMBER;
}
c = tok_decimal_tail(tok);
if (c == 0) {
return ERRORTOKEN;
}
}
if (c == 'j' || c == 'J') {
/* Imaginary part */
/* Also entered from the leading-zero path.  Reads the character after
   the 'j' so the shared tok_backup() below leaves 'j' in the token. */
imaginary:
c = tok_nextc(tok);
}
}
}
/* c is the first character after the number; give it back. */
tok_backup(tok, c);
*p_start = tok->start;
*p_end = tok->cur;
return NUMBER;
}
/* Jump target for prefixed strings (b"", r"", u"", f"" ...); unprefixed
   strings reach the test below by falling through. */
letter_quote:
/* String */
if (c == '\'' || c == '"') {
int quote = c;
int quote_size = 1; /* 1 or 3 */
/* Number of consecutive closing-quote characters seen so far. */
int end_quote_size = 0;
/* Find the quote size and start of string */
c = tok_nextc(tok);
if (c == quote) {
c = tok_nextc(tok);
if (c == quote) {
quote_size = 3;
}
else {
end_quote_size = 1; /* empty string found */
}
}
/* Unless a triple quote was just completed, c belongs to whatever
   follows (string body, or the text after an empty string), so it is
   pushed back. */
if (c != quote) {
tok_backup(tok, c);
}
/* Get rest of string */
while (end_quote_size != quote_size) {
c = tok_nextc(tok);
if (c == EOF) {
if (quote_size == 3) {
tok->done = E_EOFS;
}
else {
tok->done = E_EOLS;
}
tok->cur = tok->inp;
return ERRORTOKEN;
}
/* A raw newline is only allowed inside triple-quoted strings. */
if (quote_size == 1 && c == '\n') {
tok->done = E_EOLS;
tok->cur = tok->inp;
return ERRORTOKEN;
}
if (c == quote) {
end_quote_size += 1;
}
else {
/* Any other character breaks a run of closing quotes. */
end_quote_size = 0;
if (c == '\\') {
tok_nextc(tok); /* skip escaped char */
}
}
}
*p_start = tok->start;
*p_end = tok->cur;
return STRING;
}
/* Line continuation */
/* A backslash must be immediately followed by '\n'; anything else is an
   error. */
if (c == '\\') {
c = tok_nextc(tok);
if (c != '\n') {
tok->done = E_LINECONT;
tok->cur = tok->inp;
return ERRORTOKEN;
}
tok->cont_line = 1;
goto again; /* Read next line */
}
/* Check for two-character token */
/* A result other than OP from PyToken_TwoChars() is taken as a recognised
   two-character operator; it is then extended to three characters if
   PyToken_ThreeChars() also recognises it, otherwise the third character
   is pushed back. */
{
int c2 = tok_nextc(tok);
int token = PyToken_TwoChars(c, c2);
if (token != OP) {
int c3 = tok_nextc(tok);
int token3 = PyToken_ThreeChars(c, c2, c3);
if (token3 != OP) {
token = token3;
}
else {
tok_backup(tok, c3);
}
*p_start = tok->start;
*p_end = tok->cur;
return token;
}
tok_backup(tok, c2);
}
/* Keep track of parentheses nesting level */
/* tok->level gates the indentation and NEWLINE handling above.  The
   decrement is not guarded here, so an unmatched closing bracket makes
   the level negative. */
switch (c) {
case '(':
case '[':
case '{':
tok->level++;
break;
case ')':
case ']':
case '}':
tok->level--;
break;
}
/* Punctuation character */
*p_start = tok->start;
*p_end = tok->cur;
return PyToken_OneChar(c);
}
北大青鳥APTECH成立于1999年。依托北京大學優質雄厚的教育資源和背景,秉承“教育改變生活”的發展理念,致力于培養中國IT技能型緊缺人才,是大數據專業的國家
達內教育集團成立于2002年,是一家由留學海歸創辦的高端職業教育培訓機構,是中國一站式人才培養平臺、一站式人才輸送平臺。2014年4月3日在美國成功上市,融資1
北大課工場是北京大學校辦產業為響應國家深化產教融合/校企合作的政策,積極推進“中國制造2025”,實現中華民族偉大復興的升級產業鏈。利用北京大學優質教育資源及背
博為峰,中國職業人才培訓領域的先行者
曾工作于聯想擔任系統開發工程師,曾在博彥科技股份有限公司擔任項目經理從事移動互聯網管理及研發工作,曾創辦藍懿科技有限責任公司從事總經理職務負責iOS教學及管理工作。
浪潮集團項目經理。精通Java與.NET 技術, 熟練的跨平臺面向對象開發經驗,技術功底深厚。 授課風格 授課風格清新自然、條理清晰、主次分明、重點難點突出、引人入勝。
精通HTML5和CSS3;Javascript及主流js庫,具有快速界面開發的能力,對瀏覽器兼容性、前端性能優化等有深入理解。精通網頁制作和網頁游戲開發。
具有10 年的Java 企業應用開發經驗。曾經歷任德國Software AG 技術顧問,美國Dachieve 系統架構師,美國AngelEngineers Inc. 系統架構師。