Python Programming 2¶

NumPy arrays¶

NumPy arrays are like vectors, matrices, and arrays in R.

In [2]:
import numpy as np
In [18]:
# Build a 1-D integer array from a Python list.
x = np.array([1,5,2,8,9,0,-1,3]) # this is like a numeric vector
# NOTE(review): this value is discarded -- only the last expression of a cell is
# echoed, so wrap the call in print() if the type should actually be shown.
type(x)
print(x)
# Arithmetic is applied element-wise: every entry is doubled.
print(x*2)
[ 1  5  2  8  9  0 -1  3]
[ 2 10  4 16 18  0 -2  6]
In [19]:
# Start and stop of the sequence.
a = -1
b = 1
# Step of 1/4 from a towards b. The stop value is excluded: the printed
# sequence ends at 0.75, not at b = 1.
seq = np.arange(a,b,1/4)  # kind of like the seq() function in R
print(seq)
[-1.   -0.75 -0.5  -0.25  0.    0.25  0.5   0.75]

Get the shape and the number of dimensions of a NumPy array:

In [20]:
# Shape is a tuple with one length per axis; the 8-element 1-D array gives (8,).
np.shape(seq)
Out[20]:
(8,)
In [21]:
# Number of axes of the array; a flat sequence has exactly one.
np.ndim(seq)
Out[21]:
1
In [3]:
# A flat array of nine integers, reshaped into a matrix in the next cell.
A = np.array([3,6,8,2,4,6,7,0,3])
print(A)
[3 6 8 2 4 6 7 0 3]
In [4]:
# Rearrange the nine values into a 3x3 matrix; the values are laid out row by row.
B = A.reshape((3,3))
print(B)
[[3 6 8]
 [2 4 6]
 [7 0 3]]
In [5]:
# Element-wise test: E has the same shape as B and is True wherever the entry is even.
E = (B % 2 == 0 ) # this is my Boolean mask
print(E)
[[False  True  True]
 [ True  True  True]
 [False  True False]]
In [6]:
# Indexing with the Boolean mask returns the selected (even) entries as a flat 1-D array.
print(B[E])
[6 8 2 4 6 0]
In [7]:
# Assigning through the mask overwrites every even entry of B in place.
B[E] = -99
print(B)
[[  3 -99 -99]
 [-99 -99 -99]
 [  7 -99   3]]
In [8]:
# Total number of elements, via the function form and the equivalent attribute.
print(np.size(B))
print(B.size)
9
9
In [9]:
# Length along each axis (rows, columns), via the function form and the attribute.
print(np.shape(B))
print(B.shape)
(3, 3)
(3, 3)
In [10]:
# Number of axes: 2 for a matrix, via the function form and the attribute.
print(np.ndim(B))
print(B.ndim)
2
2
In [11]:
# The single data type shared by every element of B.
B.dtype
Out[11]:
dtype('int64')

NumPy arrays must have values all of one type:¶

In [22]:
# Mixing integers and strings: the integers are coerced to strings so that
# every element has the same type (note the quotes in the printed output).
A = np.array([[1,2,3],['a','b','c']])
print(A)
[['1' '2' '3']
 ['a' 'b' 'c']]
In [23]:
# The same happens with Booleans: they end up as the strings 'False' / 'True'.
A = np.array([[False,False,True],['a','b','c']]) # coercion to character strings
print(A)
[['False' 'False' 'True']
 ['a' 'b' 'c']]
In [24]:
# A 4x2 array filled with zeros; the entries are floats (printed as "0.").
Z = np.zeros((4,2))
print(Z)
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
In [25]:
# Arrays of any shape are instances of the same class, numpy.ndarray.
type(Z)
Out[25]:
numpy.ndarray
In [26]:
# Z is reassigned here: it is now a 3x4 array filled with (float) ones.
Z = np.ones((3,4))
print(Z)
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

`np.linspace` is especially useful: it returns a chosen number of evenly spaced values between two endpoints.

In [27]:
# 21 evenly spaced values from 0 to 1; both endpoints are included, so the spacing is 0.05.
x = np.linspace(0,1,21) #  This one is also like the seq() function in R
print(x)
[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]
In [28]:
?np.linspace
Signature:      
np.linspace(
    start,
    stop,
    num=50,
    endpoint=True,
    retstep=False,
    dtype=None,
    axis=0,
    *,
    device=None,
)
Call signature:  np.linspace(*args, **kwargs)
Type:            _ArrayFunctionDispatcher
String form:     <function linspace at 0x111c1e200>
File:            /opt/anaconda3/lib/python3.13/site-packages/numpy/_core/function_base.py
Docstring:      
Return evenly spaced numbers over a specified interval.

Returns `num` evenly spaced samples, calculated over the
interval [`start`, `stop`].

The endpoint of the interval can optionally be excluded.

.. versionchanged:: 1.16.0
    Non-scalar `start` and `stop` are now supported.

.. versionchanged:: 1.20.0
    Values are rounded towards ``-inf`` instead of ``0`` when an
    integer ``dtype`` is specified. The old behavior can
    still be obtained with ``np.linspace(start, stop, num).astype(int)``

Parameters
----------
start : array_like
    The starting value of the sequence.
stop : array_like
    The end value of the sequence, unless `endpoint` is set to False.
    In that case, the sequence consists of all but the last of ``num + 1``
    evenly spaced samples, so that `stop` is excluded.  Note that the step
    size changes when `endpoint` is False.
num : int, optional
    Number of samples to generate. Default is 50. Must be non-negative.
endpoint : bool, optional
    If True, `stop` is the last sample. Otherwise, it is not included.
    Default is True.
retstep : bool, optional
    If True, return (`samples`, `step`), where `step` is the spacing
    between samples.
dtype : dtype, optional
    The type of the output array.  If `dtype` is not given, the data type
    is inferred from `start` and `stop`. The inferred dtype will never be
    an integer; `float` is chosen even if the arguments would produce an
    array of integers.

    .. versionadded:: 1.9.0
axis : int, optional
    The axis in the result to store the samples.  Relevant only if start
    or stop are array-like.  By default (0), the samples will be along a
    new axis inserted at the beginning. Use -1 to get an axis at the end.

    .. versionadded:: 1.16.0
device : str, optional
    The device on which to place the created array. Default: None.
    For Array-API interoperability only, so must be ``"cpu"`` if passed.

    .. versionadded:: 2.0.0

Returns
-------
samples : ndarray
    There are `num` equally spaced samples in the closed interval
    ``[start, stop]`` or the half-open interval ``[start, stop)``
    (depending on whether `endpoint` is True or False).
step : float, optional
    Only returned if `retstep` is True

    Size of spacing between samples.


See Also
--------
arange : Similar to `linspace`, but uses a step size (instead of the
         number of samples).
geomspace : Similar to `linspace`, but with numbers spaced evenly on a log
            scale (a geometric progression).
logspace : Similar to `geomspace`, but with the end points specified as
           logarithms.
:ref:`how-to-partition`

Examples
--------
>>> import numpy as np
>>> np.linspace(2.0, 3.0, num=5)
array([2.  , 2.25, 2.5 , 2.75, 3.  ])
>>> np.linspace(2.0, 3.0, num=5, endpoint=False)
array([2. ,  2.2,  2.4,  2.6,  2.8])
>>> np.linspace(2.0, 3.0, num=5, retstep=True)
(array([2.  ,  2.25,  2.5 ,  2.75,  3.  ]), 0.25)

Graphical illustration:

>>> import matplotlib.pyplot as plt
>>> N = 8
>>> y = np.zeros(N)
>>> x1 = np.linspace(0, 10, N, endpoint=True)
>>> x2 = np.linspace(0, 10, N, endpoint=False)
>>> plt.plot(x1, y, 'o')
[<matplotlib.lines.Line2D object at 0x...>]
>>> plt.plot(x2, y + 0.5, 'o')
[<matplotlib.lines.Line2D object at 0x...>]
>>> plt.ylim([-0.5, 1])
(-0.5, 1)
>>> plt.show()
Class docstring:
Class to wrap functions with checks for __array_function__ overrides.

All arguments are required, and can only be passed by position.

Parameters
----------
dispatcher : function or None
    The dispatcher function that returns a single sequence-like object
    of all arguments relevant.  It must have the same signature (except
    the default values) as the actual implementation.
    If ``None``, this is a ``like=`` dispatcher and the
    ``_ArrayFunctionDispatcher`` must be called with ``like`` as the
    first (additional and positional) argument.
implementation : function
    Function that implements the operation on NumPy arrays without
    overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``
    will be forwarded to this (and the ``dispatcher``) as if using
    ``*args, **kwargs``.

Attributes
----------
_implementation : function
    The original implementation passed in.
In [29]:
# Two integer ranges of different length; the stop value is excluded in both.
x = np.arange(1,4)
y = np.arange(1,6)
print(x)
print(y)
# The result has one row per entry of x and one column per entry of y.
print(np.outer(x,y)) # gives a matrix with products of all combinations of x and y entries
[1 2 3]
[1 2 3 4 5]
[[ 1  2  3  4  5]
 [ 2  4  6  8 10]
 [ 3  6  9 12 15]]

Accessing entries of a NumPy array¶

In [39]:
# A 4x3 matrix used to demonstrate indexing; indices are 0-based, written [row, column].
M=np.array([[1,2,4],[5,0,-1],[3,10, 12], [4, 99, 41]])
print(M)
print(M[0,0]) # single entry: first row, first column
print(M[0,:]) # take first row
print(M[:,0]) # take first column
print(M[-1,1]) # can use negative indexing to come from "the end"
print(M[1,1]) # second row, second column
# Indexing on the left-hand side overwrites the entry in place.
M[0,2] = 99
print(M)
[[ 1  2  4]
 [ 5  0 -1]
 [ 3 10 12]
 [ 4 99 41]]
1
[1 2 4]
[1 5 3 4]
99
0
[[ 1  2 99]
 [ 5  0 -1]
 [ 3 10 12]
 [ 4 99 41]]

Accessing "methods" for Python objects¶

In [41]:
# The same operation is available as a NumPy function and as a method of the array.
# Transposing swaps rows and columns: the 4x3 matrix becomes 3x4.
print(M)
print(np.transpose(M))
print(M.transpose()) # another way to apply the np.transpose function
[[ 1  2 99]
 [ 5  0 -1]
 [ 3 10 12]
 [ 4 99 41]]
[[ 1  5  3  4]
 [ 2  0 10 99]
 [99 -1 12 41]]
[[ 1  5  3  4]
 [ 2  0 10 99]
 [99 -1 12 41]]
In [42]:
# Without an axis argument, sum() adds up every entry of the matrix.
print(M)
print(np.sum(M))
print(M.sum())
# `axis` names the axis that is collapsed by the sum.
print(M.sum(axis = 1)) # collapse the columns: one sum per row (4 values)
print(M.sum(axis = 0)) # collapse the rows: one sum per column (3 values)
[[ 1  2 99]
 [ 5  0 -1]
 [ 3 10 12]
 [ 4 99 41]]
275
275
[102   4  25 144]
[ 13 111 151]

Making copies of a NumPy array¶

In [13]:
# The 20 multiples of 5 from 0 to 95, arranged as 2 rows of 10.
M = np.reshape(np.arange(0,100,5),(2,10))
print(M)
[[ 0  5 10 15 20 25 30 35 40 45]
 [50 55 60 65 70 75 80 85 90 95]]
In [14]:
# Slicing does not copy the data: M2 refers to the same underlying values as M.
M2 = M[:2,:2] # take the first two rows and columns
print(M2)  # note that the above only creates a "view" of M
[[ 0  5]
 [50 55]]
In [15]:
# Writing to the view writes through to M, because both share the same data.
M2[0,0] = 8
print(M2)

print(M)  # the same entry changed also in the original array.
[[ 8  5]
 [50 55]]
[[ 8  5 10 15 20 25 30 35 40 45]
 [50 55 60 65 70 75 80 85 90 95]]
In [16]:
# .copy() makes an independent array with its own data.
M2_copy = M[:2,:2].copy()
print(M2_copy)

# Changing the copy ...
M2_copy[1,1] = 999
print(M2_copy)

# ... leaves the original untouched: M[1,1] is still 55.
print(M)
[[ 8  5]
 [50 55]]
[[  8   5]
 [ 50 999]]
[[ 8  5 10 15 20 25 30 35 40 45]
 [50 55 60 65 70 75 80 85 90 95]]

Generating random numbers in NumPy¶

Initializing a random number generator (RNG):

In [17]:
# Fix the seed of NumPy's global (legacy) random state so that the np.random.*
# calls in the following cells produce the same numbers on every
# Restart & Run All.
RANDOM_STATE_SEED = 42
np.random.seed(RANDOM_STATE_SEED)
In [53]:
# Dimensions of the simulated data set: one row per sample, one column per variant.
n_samples = 5
n_variants = 10
# Fill an n_samples x n_variants array with random values between 0 and 1.
dat=np.random.rand(n_samples, n_variants)
dat
Out[53]:
array([[0.28227303, 0.8984957 , 0.98304609, 0.0810617 , 0.39764763,
        0.30184106, 0.35199013, 0.26535489, 0.38266486, 0.14141459],
       [0.27981828, 0.74781788, 0.34404653, 0.2078349 , 0.10844404,
        0.34800415, 0.64340909, 0.25330487, 0.42244569, 0.95239858],
       [0.55236285, 0.53193862, 0.40994331, 0.55683998, 0.98456727,
        0.25380691, 0.94192789, 0.14907098, 0.11110536, 0.6546695 ],
       [0.25117902, 0.44439231, 0.72965477, 0.5348415 , 0.01374935,
        0.30584323, 0.06468667, 0.92611455, 0.79464859, 0.98384168],
       [0.88510951, 0.47533202, 0.88259752, 0.67243861, 0.52459966,
        0.37064605, 0.68096618, 0.59013342, 0.61418498, 0.39983133]])
In [48]:
# A 3x4 array of random integers from 1 to 100 (the upper bound 101 is excluded).
# NOTE(review): this reuses the name `b`, which earlier held the scalar stop
# value of the np.arange example.
b = np.random.randint(1, 101, size=(3, 4))
print(b)
[[46 75 44 45]
 [75 49  4 80]
 [57 56 38  6]]
In [49]:
# The first positional argument is the mean (loc), not the number of draws:
# this returns a single scalar drawn from a normal distribution centred at 10
# (the standard deviation keeps its default). Pass size=... to get an array.
c= np.random.normal(10)
c
Out[49]:
11.05255124665024
In [51]:
import numpy as np
import matplotlib.pyplot as plt

# 1. Generate random numbers from a normal distribution
# Parameters: loc (mean), scale (standard deviation), size (number of samples)
mean = 0
std_dev = 1
num_samples = 10000

# Using the recommended Generator instance for modern NumPy.
# It is seeded with the notebook's RANDOM_STATE_SEED so that the sample, and
# therefore the figure, is identical on every run; an unseeded generator
# would produce different data each time the cell is executed.
rng = np.random.default_rng(RANDOM_STATE_SEED)
normal_data = rng.normal(loc=mean, scale=std_dev, size=num_samples)

# 2. Plot a histogram of the generated data
# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(4, 3))  # Optional: adjust figure size
ax.hist(normal_data, bins=50, density=True, alpha=0.6, color='blue')

# Add labels and a title. The title is built from the parameters above so it
# cannot go stale when `mean` or `std_dev` is changed.
ax.set_title(f"Histogram of a Normal Distribution (Mean={mean}, Std Dev={std_dev})")
ax.set_xlabel("Value")
ax.set_ylabel("Density")  # density=True makes the y-axis represent probability density
ax.grid(True)

# 3. Display the plot
plt.show()
No description has been provided for this image
In [72]:
def _genotype_probs(p):
    """Return the probabilities [(1-p)^2, 2p(1-p), p^2] of genotypes 0, 1 and 2.

    `p` is presumably the frequency of the counted allele; the three values
    sum to 1 for any p.
    """
    return [ (1-p)**2, 2*p*(1-p), p**2 ]

# Size of the simulated genotype matrix (overrides the values set earlier).
n_samples, n_variants = 10, 2

# One parameter per group of samples, and the genotype probabilities it implies.
pA, pB = 0.2, 0.5
probsA = _genotype_probs(pA)
probsB = _genotype_probs(pB)

for probs in (probsA, probsB):
    print(probs)
[0.6400000000000001, 0.32000000000000006, 0.04000000000000001]
[0.25, 0.5, 0.25]
In [81]:
# Simulate genotype codes (0, 1 or 2) for every sample and variant.
# The first half of the samples is drawn with probabilities probsA, the
# remaining samples with probsB.
n_first = n_samples // 2   # size of the first group; the second group gets the rest
genotype_codes = [0, 1, 2]

# Start from NaN so that any entry left unfilled would be easy to spot.
genotypes = np.full((n_samples, n_variants), np.nan)

# Loop over the variants instead of hard-coding columns 0 and 1, so the cell
# keeps working when n_variants changes. For n_variants = 2 the random draws
# happen in the same order as four explicit assignments would.
for j in range(n_variants):
    genotypes[:n_first, j] = np.random.choice(genotype_codes, size=n_first, p=probsA)
    genotypes[n_first:, j] = np.random.choice(genotype_codes, size=n_samples - n_first, p=probsB)
print(genotypes)
[[0. 0.]
 [0. 1.]
 [0. 0.]
 [1. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [2. 2.]]

Missing values in NumPy arrays¶

In [55]:
# Treat very small values as missing: flag them with a Boolean mask and
# overwrite them with NaN. `dat` is modified in place on purpose -- the next
# cell compares plain and NaN-aware reductions on it.
MISSING_THRESHOLD = 0.02
mask_missing = dat < MISSING_THRESHOLD
dat[mask_missing] = np.nan
# End the cell with the bare name so the array is shown through the notebook's
# rich Out[...] display (an array(...) repr) rather than plain print() text.
dat
Out[55]:
array([[0.28227303, 0.8984957 , 0.98304609, 0.0810617 , 0.39764763,
        0.30184106, 0.35199013, 0.26535489, 0.38266486, 0.14141459],
       [0.27981828, 0.74781788, 0.34404653, 0.2078349 , 0.10844404,
        0.34800415, 0.64340909, 0.25330487, 0.42244569, 0.95239858],
       [0.55236285, 0.53193862, 0.40994331, 0.55683998, 0.98456727,
        0.25380691, 0.94192789, 0.14907098, 0.11110536, 0.6546695 ],
       [0.25117902, 0.44439231, 0.72965477, 0.5348415 ,        nan,
        0.30584323, 0.06468667, 0.92611455, 0.79464859, 0.98384168],
       [0.88510951, 0.47533202, 0.88259752, 0.67243861, 0.52459966,
        0.37064605, 0.68096618, 0.59013342, 0.61418498, 0.39983133]])
In [56]:
# Ordinary reductions propagate missing values: a single NaN makes the result NaN.
print(dat.sum())
print(dat.max())
# The nan-prefixed variants skip the NaN entries and reduce over the rest.
print(np.nansum(dat))
print(np.nanmax(dat))
nan
nan
24.67058797743921
0.9845672743971778
In [ ]: