南京大学教育研究院 孙继祖
2016-05-03
来源:BLOG@CACM - Python is Now the Most Popular Introductory Teaching Language at Top U.S. Universities
A good engineering decision made by Google:
“Python where we can, C++ where we must.”
# Register the locally downloaded Ubuntu Trusty cloud image as a box named "ubuntu-trusty"
vagrant box add ubuntu-trusty /path/to/trusty-server-cloudimg-amd64-vagrant-disk1.box
# Create a Vagrantfile in the current directory that uses that box
vagrant init ubuntu-trusty
# Create and boot the virtual machine
vagrant up
# Open an SSH session into the running machine
vagrant ssh
# NOTE(review): the remaining commands are presumably typed inside the guest after `vagrant ssh` — confirm
# Keep a backup of the stock APT source list
sudo mv /etc/apt/sources.list /etc/apt/sources.list.bak
# Download the 163.com mirror's source list for trusty and save it as the active source list
sudo wget http://mirrors.163.com/.help/sources.list.trusty -O /etc/apt/sources.list
# Refresh the package index from the new sources
sudo apt-get update
# Install the python-pip package
sudo apt-get install python-pip
// Prints "Hello, World" on its own line, then the digits 0 through 9 with no separators.
// NOTE(review): the cramped layout looks deliberate (it is the same program as the
// formatted version that follows) — confirm before reformatting.
public class Main{
public static void main(String[] args)
{System.out.println("Hello, World");
for(int i=0; i<10; i++){System.out.print(i);}
}
}
/**
 * Prints "Hello, World" on its own line, then the digits 0 through 9.
 */
public class Main {
    /**
     * Program entry point.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        System.out.println("Hello, World");
        for (int i = 0; i < 10; i++) {
            // print (not println) keeps all digits on a single line
            System.out.print(i);
        }
    }
}
def say_hello():
    """Ask for the user's name on standard input and print a greeting.

    Prompts with 'What is your name? ' and then prints 'Hello, ' followed by
    whatever was entered. Returns None.
    """
    # NOTE(review): on Python 2, input() evaluates the typed text as an
    # expression; raw_input() is the string-returning equivalent there.
    name = input('What is your name? ')
    print('Hello, ' + name)
# Report how some_var compares to 10; some_var must already be defined.
# Single-argument print(...) prints the same text on Python 2 and Python 3.
if some_var > 10:
    print("some_var is totally bigger than 10.")
elif some_var < 10:
    print("some_var is smaller than 10.")
else:
    print("some_var is indeed 10.")
# Single line comments start with a number symbol.
""" Multiline strings can be written
using three "s, and are often used
as comments
"""
# Boolean operators
# Note: these keywords and literals are case-sensitive
True and False #=> False
False or True #=> True
# Negation
not True # => False
not False # => True
# Equality comparison uses ==
1 == 1 # => True
2 == 1 # => False
# Inequality comparison uses !=
1 != 1 # => False
2 != 1 # => True
# String formatting with %
x = 'apple'
y = 'lemon'
z = "The items in the basket are %s and %s" % (x,y)
# z is now "The items in the basket are apple and lemon"
# Print to the screen (print statement, Python 2 syntax)
print "I'm Python. Nice to meet you!" # => I'm Python. Nice to meet you!
# Read from the terminal (raw_input is the Python 2 name)
input_string_var = raw_input("Enter some data: ") # stores the entered text as a string
# No declaration is needed before assignment
some_var = 5 # variable names are usually lowercase, with words separated by underscores
some_var # => 5
# A list is an ordered collection that grows and shrinks automatically
li = []
# A list can also be initialized with pre-filled contents
other_li = [4, 5, 6]
# Use append to add an element to the end of the list
li.append(1) # li is now [1]
li.append(2) # li is now [1, 2]
li.append(4) # li is now [1, 2, 4]
li.append(3) # li is now [1, 2, 4, 3]
# Use pop to remove an element from the end of the list
li.pop() # => 3 and li is now [1, 2, 4]
# Access list elements the same way as array elements
li[0] # => 1
# Use = to assign a new value to an element
li[0] = 42
li[0] # => 42
# Access the last element
li[-1] # => 4
# Slices are half-open ranges: the start is included, the end is excluded
li[1:3] # => [2, 4]
li[2:] # => [4]
li[:3] # => [42, 2, 4]
# Check whether an element is in the list
1 in li # => False (li[0] was overwritten with 42 above)
# Use "len()" to get the length
len(li) # => 3
# Tuples are like lists, but immutable
tup = (1, 2, 3)
tup[0] # => 1
tup[0] = 3 # raises TypeError: tuples do not support item assignment
# A dictionary stores mappings (key-value pairs)
empty_dict = {}
# A pre-filled dictionary
filled_dict = {"one": 1, "two": 2, "three": 3}
# Use [] to look up the value stored under a key
filled_dict["one"] # => 1
# Use "get()" to avoid an error when the key does not exist
filled_dict.get("four") # => None
# Use "keys()" to get all keys (a list on Python 2; the order is not guaranteed)
filled_dict.keys() # => ["three", "two", "one"]
# Use "values()" to get all values (a list on Python 2; the order is not guaranteed)
filled_dict.values() # => [3, 2, 1]
# if statement; note that indentation is significant in Python.
# This prints "some_var is smaller than 10."
# Single-argument print(...) prints the same text on Python 2 and Python 3.
some_var = 5
if some_var > 10:
    print("some_var is totally bigger than 10.")
elif some_var < 10:  # the elif clause is optional
    print("some_var is smaller than 10.")
else:  # the else clause is optional
    print("some_var is indeed 10.")
"""
使用循环对list进行遍历
将会打印:
dog is a mammal
cat is a mammal
mouse is a mammal
"""
# Iterate over a list with a for loop; the loop body must be indented.
for animal in ["dog", "cat", "mouse"]:
    # str.format is another way to format strings
    print("{0} is a mammal".format(animal))
"""
"range(number)" 返回从0到指定数的list
将会打印:
0
1
2
3
"""
# range(4) produces 0, 1, 2, 3 (the upper bound is excluded).
for i in range(4):
    print(i)
"""
"range(lower, upper)" 返回从lower到upper(不包含)的list,左闭右开区间
将会打印:
4
5
6
7
"""
# range(4, 8) produces 4, 5, 6, 7 (lower bound included, upper bound excluded).
for i in range(4, 8):
    print(i)
# Use "def" to create a function
def add(x, y):
    """Print both arguments, then return their sum.

    Args:
        x: left operand.
        y: right operand.

    Returns:
        The result of ``x + y``.
    """
    # Single-argument print(...) prints the same text on Python 2 and Python 3.
    print("x is {0} and y is {1}".format(x, y))
    return x + y  # the return value
# Call the function with positional arguments
add(5, 6) # => prints "x is 5 and y is 6" and returns 11
# Call the function with keyword arguments
add(y=6, x=5) # keyword arguments may be given in any order
# Import a module
import math
print math.sqrt(16) # => 4.0
# Import specific functions from a module
from math import ceil, floor
print ceil(3.7) # => 4.0
print floor(3.7) # => 3.0
爬虫:按照一定的规则,自动地、批量地获取网络信息的脚本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, bs4
import re
import os
# Site root; the relative links found in the pages are appended to it
root_url = "http://qhzk.lib.tsinghua.edu.cn:8080"
index_url = root_url + "/Tsinghua_Journal/year.html"
# Create the download folder if it does not exist yet
download_dir = 'tsinghua_journal_downloads'
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
def get_journal_page_urls():
    """Fetch the index page and collect metadata for every linked issue.

    Requests ``index_url``, selects every ``<a>`` inside a ``<table>`` and, for
    each anchor that has a non-empty ``href``, records its text and its link
    prefixed with ``root_url``. Each record is printed as it is collected.

    Returns:
        list of dict: one ``{'title': ..., 'link': ...}`` entry per issue, in
        the order the anchors were selected from the page.
    """
    response = requests.get(index_url)
    # The encoding requests picks by default here is ISO-8859-1; force UTF-8.
    # https://github.com/kennethreitz/requests/issues/1604
    response.encoding = 'utf-8'
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    # Metadata of every issue found on the index page
    journals = []
    for link in soup.select('table a'):
        url = link.get('href')
        if not url:
            # Anchors without a target cannot be followed; skip them.
            continue
        one_journal = {
            'title': link.getText(),
            'link': root_url + url,
        }
        journals.append(one_journal)
        # One formatted argument keeps the output identical on Python 2 and 3.
        print('%s %s' % (one_journal['title'], one_journal['link']))
    return journals
def get_page_count(journal):
    """Return the number of pages of one issue.

    Args:
        journal: dict with a ``'link'`` key holding the URL of the issue page.

    Returns:
        int: the first run of digits found in the ``href`` of the last link
        inside ``div.command-bar``.

    Raises:
        IndexError: if the page has no ``div.command-bar`` link, or the
            ``href`` contains no digits.
    """
    response = requests.get(journal['link'])
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    last_href = soup.select('div.command-bar a')[-1].get('href')
    # The href looks like 'javascript:gotoPage(17)'; extract the number.
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return int(re.findall(r'\d+', last_href)[0])
# Download a limited number of pages from a limited number of issues
def get_pages_in_range(page_limit=None, journal_limit=None):
    """Download page images and save them as PNG files in ``download_dir``.

    Args:
        page_limit: number of pages to request per issue. When falsy (None or
            0) the page count is looked up with ``get_page_count``. The value
            is used as given; it is not checked against the issue's real
            page count.
        journal_limit: only the first ``journal_limit`` issues returned by
            ``get_journal_page_urls`` are processed; None means all of them.

    Returns:
        None. Each page is written to ``<title>-第<n>页.png`` and a line is
        printed per saved file.
    """
    journals = get_journal_page_urls()
    # Only walk the issues inside the requested limit
    for journal in journals[:journal_limit]:
        count = page_limit or get_page_count(journal)
        # The image URL is the same for every page of an issue, so build it once.
        showimage_url = journal['link'].replace('turnPage', 'showImage')
        for page_no in range(1, count + 1):
            # POST request: turn to the wanted page
            data = {
                'action': 'image',
                'jumpPage': page_no,
            }
            requests.post(journal['link'], data=data)
            # GET request: fetch the page image
            # NOTE(review): the POST and the GET are independent requests that
            # share no session or cookies; if the server tracks the current
            # page per session, confirm the image really matches page_no.
            showimage_payload = {
                'rand': 'aaa'
            }
            image_response = requests.get(showimage_url, params=showimage_payload)
            # Save the image locally
            filename = u'%s-第%d页.png' % (journal['title'], page_no)
            filepath = os.path.join(download_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(image_response.content)
            # One formatted argument keeps the output identical on Python 2 and 3.
            print('%s saved at %s' % (filename, filepath))
# When run as a script, request 3 pages from each of the first 5 issues.
if __name__ == '__main__':
    get_pages_in_range(page_limit=3, journal_limit=5)