* added log tab
* added references text box
* added options to choose embedding models
This commit is contained in:
parent
eedbb1b81a
commit
18f35b28c2
42 changed files with 911 additions and 441 deletions
13
backend/embeddings_manager.py
Normal file
@@ -0,0 +1,13 @@
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
import torch

# dict : huggingface url -> max token length (will be chunk size)
MODELS_DICT = {"intfloat/multilingual-e5-large": 512,
               "intfloat/multilingual-e5-large-instruct": 512}


def get_embedding_model(name: str):
    if name in MODELS_DICT:
        return HuggingFaceEmbeddings(model_name=name, model_kwargs={'device': 'cuda'} if torch.cuda.is_available() else {})
    else:
        raise ValueError(f"Model {name} not found in the list of available models")
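The dictionary doubles as the source of chunk sizes for the indexing step. A minimal usage sketch, illustrative only and not part of this commit (it assumes the module is importable from the calling code):

```python
# Hypothetical caller: the embedding model picked in the new options dropdown
# drives both the embedding function and the chunk size used for splitting.
from embeddings_manager import MODELS_DICT, get_embedding_model

model_name = "intfloat/multilingual-e5-large"
embeddings = get_embedding_model(model_name)   # HuggingFaceEmbeddings, on CUDA if available
chunk_size = MODELS_DICT[model_name]           # 512 tokens for both e5 models

vector = embeddings.embed_query("example query")
print(len(vector), chunk_size)
```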
@@ -15,22 +15,22 @@ class InferenceInstance:
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved
        flush_relevant_content()

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[Dict[str, str]]:
        is_pdf = doc_name.endswith(".pdf")
        print(f"doc_name: {doc_name}")
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        self._update_history(input_user, search_results, is_pdf)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results, is_pdf)
        pages_info = f"pages used : p" + " p".join(pages)
        print(f"history: {self.history}")

        completion = self._get_completion()

        for chunk in completion:
            if chunk.choices[0].delta.content:
                new_assistant_message["content"] += chunk.choices[0].delta.content
                yield pages_info + "\n\n " + new_assistant_message["content"]
                yield new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
@@ -38,12 +38,21 @@ class InferenceInstance:
        return vector_db.similarity_search(input_user, k=4)

    def _update_history(self, input_user: str, search_results, is_pdf):
        references_textbox_content = ""
        some_context = ""
        pages = []
        for result in search_results:
            if is_pdf:
                pages.append(str(result.metadata['page']))
            some_context += result.page_content + "\n\n"
            pages_info = f'on page {result.metadata["page"]}' if is_pdf else 'in the document'
            references_textbox_content += f"**Relevant content viewed {pages_info}**: \n\n" \
                                          f" \n\n {result.page_content}\n\n" \
                                          "-----------------------------------\n\n"

        with open("../temp_file/relevant_content.mmd", "w") as f:
            f.write(references_textbox_content)

        self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages
@@ -56,3 +65,12 @@ class InferenceInstance:
            stream=True,
        )


def read_relevant_content():
    with open("../temp_file/relevant_content.mmd", "r") as f:
        return f.read()


def flush_relevant_content():
    with open("../temp_file/relevant_content.mmd", "w") as f:
        f.write("")
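Together with the new references text box, the intended flow is: `flush_relevant_content()` clears the file when an `InferenceInstance` is created, `_update_history()` rewrites it on every question, and the frontend reads it back. A small sketch of that read side (the wrapper name is illustrative, not from this commit):

```python
# Sketch: what a frontend callback might do to populate the references text box.
def get_references_markdown() -> str:
    """Return whatever _update_history last wrote to relevant_content.mmd."""
    try:
        return read_relevant_content()
    except FileNotFoundError:
        return ""  # nothing has been retrieved yet
```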
33
backend/logger.py
Normal file
@@ -0,0 +1,33 @@
import gradio as gr
import sys


class Logger:
    """
    Logger class to redirect the output to a file.
    Will be used to feed the log textbox in the frontend.

    Adapted from: https://github.com/gradio-app/gradio/issues/2362#issuecomment-1424446778
    """
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        return False


def read_logs():
    sys.stdout.flush()
    with open("../temp_file/output.log", "r") as f:
        return f.read()
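A minimal sketch of how this `Logger` is typically hooked up (the linked gradio issue does it the same way); the path matches the one `read_logs()` polls for the new log tab:

```python
# Redirect stdout so every print() in the backend also lands in the file
# that read_logs() reads back for the log tab.
import sys
from logger import Logger

sys.stdout = Logger("../temp_file/output.log")
print("this goes to the terminal and to ../temp_file/output.log")
```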
@@ -1,6 +1,7 @@
import subprocess
from pathlib import Path
import time
from gradio import Info


def pdf_to_mmd(path_input: str):
@@ -10,6 +11,13 @@ def pdf_to_mmd(path_input: str):

    stream stderr to the front end
    """
    text = f"Converting {path_input} to LaTeX; " \
           f"it can take some time, especially for big documents. Check progress in your terminal. " \
           f"Wait until the conversion is done before asking questions to the models."

    print(text)
    Info(text)

    output_dir = "../documents/mmds"
    command = ['nougat', path_input, "-o", output_dir]
    subprocess.run(command)
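The docstring above leaves streaming nougat's stderr to the frontend as a TODO. One possible direction, shown purely as a sketch (a Popen-based replacement for the subprocess.run call; nothing here is in the commit):

```python
# Hypothetical helper: forward nougat's progress output line by line instead of
# blocking silently until the conversion finishes.
import subprocess

def run_and_stream(command: list[str]) -> int:
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, text=True)
    for line in proc.stderr:   # nougat reports its progress on stderr
        print(line, end="")    # picked up by the Logger / log tab as well
    return proc.wait()
```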
@@ -17,7 +25,6 @@ def pdf_to_mmd(path_input: str):
    # Change the math delimiter to the common delimiter used in MMD
    with open(f"{output_dir}/{str(Path(path_input).stem)}.mmd", "r+") as doc:
        content = doc.read()
        print(content)

        content = content.replace(r"\[", "$$").replace(r"\]", "$$")
        content = content.replace(r"\(", "$").replace(r"\)", "$")
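For reference, the delimiter rewrite above does the following to Nougat's output (a tiny illustration, not part of the commit):

```python
sample = r"inline \(a^2+b^2=c^2\) and display \[E=mc^2\]"
sample = sample.replace(r"\[", "$$").replace(r"\]", "$$")
sample = sample.replace(r"\(", "$").replace(r"\)", "$")
print(sample)  # inline $a^2+b^2=c^2$ and display $$E=mc^2$$
```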
488
documents/mmds/Chapter 1. The Euclidean Space.2016-2.mmd
Normal file
|
@@ -0,0 +1,488 @@
|
|||
## Chapter 1 The Euclidean Space
|
||||
|
||||
The objects of study in advanced calculus are differentiable functions of several variables. To set the stage for the study, the Euclidean space as a vector space endowed with the dot product is defined in Section 1.1. To aid visualizing points in the Euclidean space, the notion of a vector is introduced in Section 1.2. In Section 1.3 Euclidean motions, mappings preserving the Euclidean distance, are briefly discussed. The last Section 1.4 contains a discussion on the cross product which is only defined for vectors in the three dimensional Euclidean space, that is, our physical space.
|
||||
|
||||
### 1.1 The Dot Product
|
||||
|
||||
An $n$-tuple is given by
|
||||
|
||||
$${\bf x}=(x_{1},x_{2},\cdots,x_{n})\,\quad x_{j}\in\mathbb{R},\ \ j=1,\cdots,n\.$$
|
||||
|
||||
It is called an ordered pair when $n=2$. Denote by $\mathbb{R}^{n}$ the collection of all $n$-tuples. The zero $n$-tuple, $(0,0,\cdots,0)$, will be written as ${\bf 0}$ from time to time. There are two algebraic operations defined on $\mathbb{R}^{n}$, namely, the addition
|
||||
|
||||
$${\bf x}+{\bf y}\ =(x_{1}+y_{1},x_{2}+y_{2},\cdots,x_{n}+y_{n})\,$$
|
||||
|
||||
and the scalar multiplication
|
||||
|
||||
$$\alpha{\bf x}\ =(\alpha x_{1},\alpha x_{2},\cdots,\alpha x_{n})\,\quad\alpha \in\mathbb{R}\,$$
|
||||
|
||||
where
|
||||
|
||||
$${\bf x}=(x_{1},x_{2},\cdots,x_{n}),\quad{\bf y}=(y_{1},y_{2},\cdots,y_{n})\.$$
|
||||
|
||||
Recall that the ordinary multiplication assigns a number as the product of two numbers, so it can be regarded as a map from $\mathbb{R}\times\mathbb{R}$ to $\mathbb{R}$. One may expect a multiplication on
|
||||
|
||||
[MISSING_PAGE_EMPTY:2]
|
||||
|
||||
One has no difficulty in verifying the dot product satisfies these three axioms. Alternatively one may use $\langle\textbf{x},\textbf{y}\rangle$ to denote $\textbf{x}\cdot\textbf{y}$. We will do this occasionally to avoid confusion. Note that $\textbf{x}\cdot\textbf{y}=\textbf{y}\cdot\textbf{x}$, so the dot product between **x** and **y** is the same as the dot product between **y** and **x**. Things would be very different when we study the cross product later.
|
||||
|
||||
At this point let us make a digression to establish a fundamental inequality. Some of you may have already learned this inequality, but its interesting proof is worth going through once more.
|
||||
|
||||
**Theorem 1.1** (**Cauchy-Schwarz Inequality**).: _For $\textbf{x},\textbf{y}\in\mathbb{R}^{n}$,_
|
||||
|
||||
$$\left|\sum_{j=1}^{n}x_{j}y_{j}\right|\leq\sqrt{\sum_{j=1}^{n}x_{j}^{2}}\sqrt{ \sum_{j=1}^{n}y_{j}^{2}}\.$$
|
||||
|
||||
_Furthermore, equality sign holds if and only if either one of $\textbf{x},\textbf{y}$ is zero $n$-tuple or there is some $\alpha\neq 0$ such that $\textbf{y}=\alpha\textbf{x}.$_
|
||||
|
||||
The condition $\textbf{x}=\alpha\textbf{y}$ means **x** and **y** are proportional to each other. Using the language of linear algebra, the equality condition is simply **x** and **y** are linearly dependent.
|
||||
|
||||
Proof.: First assume not all $x_{j}$'s are zero in **x**. Consider the expression
|
||||
|
||||
$$\sum_{j=1}^{n}(x_{j}t-y_{j})^{2}\,$$
|
||||
|
||||
which is a sum of squares and so must be non-negative for all $t\in\mathbb{R}$. We can express it as a quadratic polynomial in $t$ as
|
||||
|
||||
$$p(t)\equiv at^{2}-2bt+c\,$$
|
||||
|
||||
where
|
||||
|
||||
$$a=\sum_{j=1}^{n}x_{j}^{2}\,\quad b=\sum_{j=1}^{n}x_{j}y_{j},\quad c=\sum_{j=1} ^{n}y_{j}^{2}\.$$
|
||||
|
||||
Since $a>0$, $p(t)$ tends to $\infty$ as $t\rightarrow\pm\infty$. Therefore, it is non-negative if and only if its discriminant is non-positive, that is, $4b^{2}-4ac\leq 0$, which yields $|b|\leq\sqrt{ac}$ after taking square root. Our inequality follows. Moreover, the equality sign holds if and only if $4b^{2}-4ac=0$. In this case the quadratic equation $at^{2}-2bt+c=0$ has a (double) root, say, $t_{1}$. Going back to the original expression, we have
|
||||
|
||||
$$\sum_{j=1}^{n}(x_{j}t_{1}-y_{j})^{2}=0\,$$which forces $t_{1}x_{j}=y_{j}$ for all $j=1,\cdots,n$. So we can take $\alpha=t_{1}$ in case $c=\sum_{j}y_{j}^{2}>0$.
|
||||
|
||||
When all $x_{j}$'s vanish but not all $y_{j}$'s, we exchange $\mathbf{x}$ and $\mathbf{y}$ to get the same conclusion.
|
||||
|
||||
Finally, when all $x_{j}$'s and $y_{j}$'s vanish, the inequality clearly holds.
|
||||
|
||||
Accompanying with the notion of the inner product are those of the norm and the distance. Indeed, the **Euclidean norm** of an $n$-tuple is defined to be
|
||||
|
||||
$$|\mathbf{x}| = (\mathbf{x}\cdot\mathbf{x})^{1/2}$$ $$= \left(\sum_{j=1}^{n}x_{j}^{2}\right)^{1/2}$$ $$= \sqrt{x_{1}^{2}+x_{2}^{2}+\cdots x_{n}^{2}}\.$$
|
||||
|
||||
The **Euclidean distance** between $\mathbf{x}$ and $\mathbf{y}$ is defined by
|
||||
|
||||
$$|\mathbf{x}-\mathbf{y}|=\sqrt{(x_{1}-y_{1})^{2}+(x_{2}-y_{2})^{2}+\cdots+(x_{ n}-y_{n})^{2}}\.$$
|
||||
|
||||
In terms of these notions, Cauchy-Schwarz Inequality can be rewritten in a compact form
|
||||
|
||||
$$|\mathbf{x}\cdot\mathbf{y}|\leq|\mathbf{x}||\mathbf{y}|\.$$
|
||||
|
||||
In mathematics, a distance is a rule to assign a non-negative number to any pair of elements in a set under consideration. The rule consists of three "axioms": For $a,b,c$ in this set,
|
||||
|
||||
* $d(a,b)\geq 0\,$ and equal to 0 iff $a=b$,
|
||||
* $d(a,b)=d(b,a)$, and
|
||||
* $d(a,b)\leq d(a,c)+d(c,b)$.
|
||||
|
||||
Now, taking $d(\mathbf{x},\mathbf{y})=|\mathbf{x}-\mathbf{y}|$, we see that it satisfies all these three axioms: $\mathbf{x},\mathbf{y},\mathbf{z}\in\mathbb{R}^{n}$,
|
||||
|
||||
* $|\mathbf{x}-\mathbf{y}|\geq 0$ and equal to 0 if and only if $\mathbf{x}=\mathbf{y}$,
|
||||
* $|\mathbf{x}-\mathbf{y}|=|\mathbf{y}-\mathbf{x}|$,
|
||||
* $|\mathbf{x}-\mathbf{y}|\leq|\mathbf{x}-\mathbf{z}|+|\mathbf{z}-\mathbf{y}|$.
|
||||
|
||||
### 1.1 The Dot Product
|
||||
|
||||
Indeed, (a) and (b) are obvious. To prove (c), write ${\bf u}={\bf x}-{\bf z},{\bf v}={\bf z}-{\bf y}$ to get
|
||||
|
||||
$$|{\bf u}+{\bf v}|\leq|{\bf u}|+|{\bf v}|\,$$
|
||||
|
||||
which holds after squaring both sides and then applying the Cauchy-Schwarz Inequality. I let you work out the details in the exercise. Note that $|{\bf x}|=|{\bf x}-{\bf 0}|$, so the norm of ${\bf x}$ is its distance to the zero $n$-tuple. In these notes, norm and distance refer to the Euclidean norm and the Euclidean distance without further specification.
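(For the record, the computation left to the exercise is short: by the Cauchy-Schwarz Inequality,

$$|{\bf u}+{\bf v}|^{2}=|{\bf u}|^{2}+2\,{\bf u}\cdot{\bf v}+|{\bf v}|^{2}\leq|{\bf u}|^{2}+2|{\bf u}||{\bf v}|+|{\bf v}|^{2}=\left(|{\bf u}|+|{\bf v}|\right)^{2}\,$$

and taking square roots gives $|{\bf u}+{\bf v}|\leq|{\bf u}|+|{\bf v}|$.)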
|
||||
|
||||
Note that the same notation $|x|$ stands for the absolute value of $x$ when $x$ is a real number. The norm of ${\bf x}$ is the same as its absolute value when $n=1$. When $n\geq 2$, the notation $|{\bf x}|$ stands for the norm only, as there is no such definition of absolute value for an $n$-tuple. The notation $\|{\bf x}\|$ is also used to denote the norm of ${\bf x}$, but it will not be used here.
|
||||
|
||||
Recall that the cosine function $\cos t$ is strictly decreasing from $1$ to $-1$ as $t$ goes from $0$ to $\pi$. Keeping this in mind, we are going to define the angle between two non-zero $n$-tuples. By Cauchy-Schwarz Inequality, the absolute value of the expression ${\bf x}\cdot{\bf y}/|{\bf x}||{\bf y}|$ lies in the interval $[-1,1]$. Therefore, by what we just said, there exists a unique $\theta\in[0,\pi]$ satisfying $\cos\theta={\bf x}\cdot{\bf y}/|{\bf x}||{\bf y}|$, that is,
|
||||
|
||||
$${\bf x}\cdot{\bf y}=|{\bf x}||{\bf y}|\cos\theta\.$$
|
||||
|
||||
We define the **angle** between two non-zero $n$-tuples ${\bf x}$ and ${\bf y}$ to be this $\theta$. The angle between two $n$-tuples makes no sense when one of them is zero. By definition this angle must belong to $[0,\pi]$. Moreover, it is symmetric, that is, the angle between ${\bf x}$ and ${\bf y}$ is the same as the angle between ${\bf y}$ and ${\bf x}$. At this stage, the notion of an angle is defined purely in an analytical manner and does not bear any geometric meaning. We will link it to geometry in the next section.
|
||||
|
||||
Two $n$-tuples ${\bf x}$ and ${\bf y}$ are **perpendicular** or **orthogonal** to each other if ${\bf x}\cdot{\bf y}=0$. In terms of the angle, they are perpendicular if and only if their angle is $\pi/2$. The zero $n$-tuple is perpendicular to all $n$-tuples. By Cauchy-Schwarz Inequality, we also know that two non-zero ${\bf x}$ and ${\bf y}$ satisfy ${\bf x}=c{\bf y}$ for some $c>0$ when their angle $\theta=0$ and satisfy ${\bf x}=c{\bf y},c<0$ when $\theta=\pi$.
|
||||
|
||||
**Example 1.1**.: Find all $n$-tuples ${\bf x}$ that are perpendicular to $(1,-1,2)$ and $(-1,0,3)$. These points satisfy
|
||||
|
||||
$$(1,-1,2)\cdot{\bf x}=0\,\quad(-1,0,3)\cdot{\bf x}=0\,$$
|
||||
|
||||
that is, the linear system
|
||||
|
||||
$$\begin{cases}x-y+2z&=0\,\\ -x+3z&=0\.\end{cases}$$ We solve this system (see Comments at the end of this chapter) to get ${\bf x}=(x,y,z)=a(3,5,1),\ a\in\mathbb{R}$. By varying $a$, we obtain infinitely many solutions.
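Concretely, the second equation gives $x=3z$, and substituting into the first gives $y=x+2z=5z$, so that

$$(x,y,z)=(3z,5z,z)=z\,(3,5,1)\,\quad z\in\mathbb{R}\,$$

which is the family $a(3,5,1)$ above with $a=z$.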
|
||||
|
||||
Summing up, we have defined the Euclidean space $(\mathbb{R}^{n},+,\cdot,\langle\cdot,\cdot\rangle)$ which is the set of all $n$-tuples with two algebraic operations-- the addition and the scalar multiplication-- as well as the dot product. From now on it will be abbreviated in a single symbol $\mathbb{R}^{n}$.
|
||||
|
||||
### 1.2 Vector Representation
|
||||
|
||||
Visualizing $n$-tuples for $n=2,3$ as vectors has been used widely in physics and engineering. In this section we discuss how to do it. However, despite its convenience and usefulness, it should be understood that the notion of a vector is only an auxiliary tool. Analysis on the Euclidean space can be carried out without referring to this notion.
|
||||
|
||||
To start off, imagine that the coordinates axes have been introduced in the plane. The $x$-axis consists of all ordered pairs of the form $(x,0),\ x\in\mathbb{R}$, and the $y$-axis all ordered pairs of the form $(0,y),\ y\in\mathbb{R}$. So every ordered pair $(x,y)$ can be written as $x(1,0)+y(0,1)$ and $(x,y)$ is a point in the coordinate plane. Points in the plane are in one-to-one correspondence with ordered pairs. The same idea applies to all other dimensions. However, since the two dimensional case is easy to see and the three dimensional case can be seen with some imagination, in the following we will focus on these two spaces. It will be apparent that most of our discussions can be extended to all dimensions.
|
||||
|
||||
Every point $(x,y)$ in the plane is associated to a **vector**, namely the arrow pointing from the base $(0,0)$ to the tip $(x,y)$. The vector degenerates into a point when it is the zero ordered pair $(0,0)$. We call it the **zero vector** and denote it by $(0,0)$ or simply ${\bf 0}$.
|
||||
|
||||
After the introduction of the vector for an ordered pair, we interpret the algebraic operations of $\mathbb{R}^{2}$ as follows. Indeed, by drawing pictures, it is not hard to convince oneself that the addition of two ordered pairs is accomplished by the parallelogram law. Specifically, first form the parallelogram using the two vectors as line segments corresponding to the two points. Then the diagonal of this resulting parallelogram, regarded as a vector pointing from the origin to the other end, is the sum of these two vectors. The same situation holds in $n\geq 3$, as one can always restrict to the plane containing these two vectors provided they are linearly independent. When they are linearly dependent, the geometric interpretation is apparent.
|
||||
|
||||
The scalar multiplication $(x,y)\mapsto\alpha(x,y),\alpha>0$, means changing the vector by a scale of $\alpha$ along the same direction. It is a prolongation if $\alpha>1$ and a shortening if $\alpha\in(0,1)$
|
||||
|
||||
### 1.2 Vector Representation
|
||||
|
||||
On the other hand, when $\alpha<0$, the resulting vector points in the opposite direction of the original vector with size changes equal to $|\alpha|$.
|
||||
|
||||
How about subtraction of two vectors? Let ${\bf w}={\bf v}-{\bf u}$. Then ${\bf w}$ can be obtained by first drawing the triangle with vertices at $(0,0),{\bf u}$ and ${\bf v}$ and then translating the side from ${\bf u}$ to ${\bf v}$ so that its base is located at the origin. The translated side is ${\bf w}$.
|
||||
|
||||
We may also find the midpoint of two ordered pairs. For ${\bf u},{\bf v}$, its midpoint is given by $({\bf u}+{\bf v})/2$. Regarding as a vector, this midpoint can be described in the following way. First draw the parallelogram formed by ${\bf u}$ and ${\bf v}$. Then the intersection point of the two diagonal lines of this parallelogram is the tip of the midpoint (vector).
|
||||
|
||||
When we regard an $n$-tuple ${\bf x}$ as a vector, it is more convenient to call its norm the **magnitude** of the vector. It is a **unit vector** if its magnitude is equal to 1. Likewise, the distance between two points may be called the **length** of the line segment connecting ${\bf x}$ and ${\bf y}$. It is consistent with the classical Pythagoras theorem. In fact, the definition of the Euclidean norm and distance were inspired by this classical theorem.
|
||||
|
||||
Next we show that the angle defined in the last section, which purely depends on analytical terms, is the same as the "geometric angle". To see it, let ${\bf x}=(a,b)$ and ${\bf y}=(c,d)$ be two non-zero vectors in the plane. By the Law of Cosines in trigonometry (see Comments at the end of this chapter),
|
||||
|
||||
$$(c-a)^{2}+(d-b)^{2}=(a^{2}+b^{2})+(c^{2}+d^{2})-2\sqrt{c^{2}+d^{2}}\sqrt{a^{2} +b^{2}}\cos\phi\,$$
|
||||
|
||||
where $\phi\in[0,\pi]$ is the "geometric angle" between ${\bf x}$ and ${\bf y}$. Simplifying, we have
|
||||
|
||||
$$-2(ac+db)=-2\sqrt{c^{2}+d^{2}}\sqrt{a^{2}+b^{2}}\cos\phi\,$$
|
||||
|
||||
which is equal to
|
||||
|
||||
$${\bf x}\cdot{\bf y}=|{\bf x}||{\bf y}|\cos\phi\.$$
|
||||
|
||||
Comparing with the definition of $\theta$, we have $\cos\phi=\cos\theta$ so that $\phi=\theta$. In other words, the geometric angle coincides with the analytical angle. The same argument works in higher dimensions as we can restrict to the plane containing any two given vectors.
|
||||
|
||||
A vector is uniquely determined by its magnitude and direction. To be more precise we fix them in definition. Any vector with unit length is called a **direction.** Each non-zero vector ${\bf x}$ can be written as
|
||||
|
||||
$${\bf x}=|{\bf x}|\ \mathbf{\xi},$$
|
||||
|
||||
where $|{\bf x}|$ is its magnitude and
|
||||
|
||||
$$\mathbf{\xi}=\frac{{\bf x}}{|{\bf x}|}$$its direction. Every direction $\boldsymbol{\xi}=(\xi_{1},\xi_{2},\cdots,\xi_{n})$ can further be expressed as
|
||||
|
||||
$$\boldsymbol{\xi}=(\cos\alpha_{1},\cos\alpha_{2},\cdots,\cos\alpha_{n})\,$$
|
||||
|
||||
where $\alpha_{k}\in[0,\pi]$ are called the **direction angles** of $\boldsymbol{\xi}$. From $\boldsymbol{\xi}\cdot\mathbf{e}_{k}=\cos\alpha_{k}$ we see that $\alpha_{k}$ is the angle between $\boldsymbol{\xi}$ and the $e_{k}$-axis. These $\cos\alpha_{k}$'s are called the **direction cosines** of $\mathbf{x}$.
|
||||
|
||||
**Example 1.2**.: Find the magnitude and direction of $(1,2,-7)$ and determine the vector $(2,a,6)$ that is perpendicular to $(1,2,-7)$. The magnitude of $(1,2,-7)$ is
|
||||
|
||||
$$|(1,2,-7)|=\sqrt{1^{2}+2^{2}+(-7)^{2}}=\sqrt{54}\,$$
|
||||
|
||||
and its direction is $(1,2,-7)/\sqrt{54}$. By orthogonality,
|
||||
|
||||
$$0=(1,2,-7)\cdot(2,a,6)=2+2a-42=0\,$$
|
||||
|
||||
which implies $a=20$. The vector $(2,20,6)$ is perpendicular to $(1,2,-7)$.
|
||||
|
||||
One may also consider the **vector from the initial point $\mathbf{x}$ to the terminal point $\mathbf{y}$,** or **the vector based at some point**. Unlike a vector, a vector from $\mathbf{x}$ to $\mathbf{y}$ is an arrow whose base and tip are $\mathbf{x}$ and $\mathbf{y}$ respectively. Obviously such a "vector" is parallel to the position vector of $\mathbf{y}-\mathbf{x}$ whose base is now at the origin. The length and direction of a vector from $\mathbf{x}$ to $\mathbf{y}$ are defined as the respective length and direction of $\mathbf{y}-\mathbf{x}$.
|
||||
|
||||
**Example 1.3**.: Consider the triangle with vertices at $(1,2),(3,4),(0,-1)$. Find the direction of the vector pointing at the midpoint of the side connecting $(1,2)$ and $(3,4)$ from $(0,-1)$. Well, first we translate $(0,-1)$ to the origin so that the triangle is congruent to the one whose vertices are $(1,2)-(0,-1),\ (3,4)-(0,-1),\ (0,-1)-(0,-1)$, that is, $(1,3),(3,5),(0,0)$. The midpoint of the side connecting $(1,3)$ and $(3,5)$ is given by
|
||||
|
||||
$$\frac{1}{2}\left((1,3)+(3,5)\right)=(2,4)\,$$
|
||||
|
||||
and its direction is given by
|
||||
|
||||
$$\frac{(2,4)}{\sqrt{2^{2}+4^{2}}}=\frac{(2,4)}{\sqrt{20}}=\frac{(1,2)}{\sqrt{5 }}\.$$
|
||||
|
||||
(No need to simplify further.)
|
||||
|
||||
**Example 1.4**.: (a) Find the magnitude and direction of the vector from $(1,-1)$ to $(-2,5)$. (b) Find all directions that are perpendicular to the vector in (a).
|
||||
|
||||
The magnitude and direction of the vector from $(1,-1)$ to $(-2,5)$ are the same as those of the position vector $(-2,5)-(1,-1)=(-3,6)$. Its magnitude is given by
|
||||
|
||||
$$|(-3,6)|=\sqrt{(-3)^{2}+6^{2}}=3\sqrt{5}\,$$
|
||||
|
||||
and the direction is given by
|
||||
|
||||
$$\frac{(-3,6)}{3\sqrt{5}}=\frac{(-1,2)}{\sqrt{5}}\.$$
|
||||
|
||||
(No need to simplify further.)
|
||||
|
||||
A vector $(a,b)$ perpendicular to $(-3,6)$ satisfies
|
||||
|
||||
$$(-3,6)\cdot(a,b)=-3a+6b=0\.$$
|
||||
|
||||
By varying $a$ and $b$ according to this relation, there are infinitely many vectors $(a,b)$ satisfying this requirement. For instance, we may take $a=2,b=1$, so $(2,1)$ is one choice. However, to be a direction there is another condition, namely, the length of the vector has to be equal to one. There are two such vectors:
|
||||
|
||||
$$\frac{(2,1)}{\sqrt{5}}\,\quad-\frac{(2,1)}{\sqrt{5}}\.$$
|
||||
|
||||
(Again no need to simplify.)
|
||||
|
||||
### 1.3 Euclidean Motions
|
||||
|
||||
A **Euclidean motion** is a map from $\mathbb{R}^{n}$ to itself of the form
|
||||
|
||||
$$T\mathbf{x}=A\mathbf{x}+\mathbf{b}\,$$
|
||||
|
||||
where $\mathbf{b}\in\mathbb{R}^{n}$ and $A$ is an $n\times n$-matrix, that preserves the distance between two points, that is, for $\mathbf{x},\mathbf{y}\in\mathbb{R}^{n}$,
|
||||
|
||||
$$|T\mathbf{x}-T\mathbf{y}|=|\mathbf{x}-\mathbf{y}|\.$$
|
||||
|
||||
Here in $A\mathbf{x}$ the vector $\mathbf{x}$ should be understood as a column vector.
|
||||
|
||||
Recall that a square matrix $R$ is called an orthogonal matrix if $R^{\prime}R=RR^{\prime}=I$, where $R^{\prime}$ is the transpose of $R$ and $I$ is the identity matrix.
|
||||
|
||||
**Proposition 1.2**.: _A map $T\mathbf{x}=A\mathbf{x}+\mathbf{b}$ is a Euclidean motion if and only if $A$ is an orthogonal matrix._
|
||||
|
||||
Proof.: In the following we use $\langle\mathbf{x},\mathbf{y}\rangle$ instead $\mathbf{x}\cdot\mathbf{y}$ to denote the dot product. First of all, let $T$ be a Euclidean motion. Then it follows from the definition that
|
||||
|
||||
$$|\mathbf{x}-\mathbf{y}|=|T\mathbf{x}-T\mathbf{y}|=|A\mathbf{x}-A\mathbf{y}|= |A(\mathbf{x}-\mathbf{y})|,$$which yields immediately that
|
||||
|
||||
$$|A({\bf x}-{\bf y})|^{2} = \langle A({\bf x}-{\bf y}),A({\bf x}-{\bf y})\rangle$$ $$= |{\bf x}-{\bf y}|^{2}$$ $$= |{\bf x}|^{2}-2\langle{\bf x},{\bf y}\rangle+|{\bf y}|^{2}\.$$
|
||||
|
||||
On the other hand, a direct calculation shows that
|
||||
|
||||
$$\langle A({\bf x}-{\bf y}),A({\bf x}-{\bf y})\rangle = \langle A{\bf x}-A{\bf y},A{\bf x}-A{\bf y}\rangle$$ $$= |A{\bf x}|^{2}-2\langle A{\bf x},A{\bf y}\rangle+|A{\bf y}|^{2}.$$
|
||||
|
||||
By comparing, we see that for all ${\bf x},{\bf y}$,
|
||||
|
||||
$$\langle A^{\prime}A{\bf x},{\bf y}\rangle=\langle A{\bf x},A{\bf y}\rangle= \langle{\bf x},{\bf y}\rangle\,$$
|
||||
|
||||
which implies that $A^{\prime}A=I$. Thus $A$ is an orthogonal matrix. Finally, this relation also shows that $T$ is a Euclidean motion whenever $A$ is orthogonal. $\Box$
|
||||
|
||||
Here we have used the following derivation in linear algebra: For a matrix $(B{\bf x})_{j}=\sum_{k}b_{kj}x_{k}$,
|
||||
|
||||
$$\langle{\bf x},B{\bf y}\rangle=\langle B^{\prime}{\bf x},{\bf y}\rangle\.$$
|
||||
|
||||
Indeed,
|
||||
|
||||
$$\langle{\bf x},B{\bf y}\rangle = \sum_{j}x_{j}\sum_{k}b_{kj}y_{k}$$ $$= \sum_{k}\sum_{j}b_{kj}x_{j}y_{k}$$ $$= \sum_{k}y_{k}\sum_{j}b^{\prime}_{jk}x_{j}=\langle B^{\prime}{ \bf x},{\bf y}\rangle\.$$
|
||||
|
||||
Here are some examples of Euclidean motions.
|
||||
|
||||
1. Take $A$ to be the identity and ${\bf b}$ a nonzero vector. Then $T{\bf x}={\bf x}+{\bf b}$ is a translation. The origin is moved to ${\bf b}$ after the motion.
|
||||
2. When $n=2$, the Euclidean motion $$T{\bf x}=\begin{bmatrix}1&0\\ 0&-1\end{bmatrix}\begin{bmatrix}x_{1}\\ x_{2}\end{bmatrix}$$ is the reflection with respect to the $x$-axis and $$T{\bf x}=\begin{bmatrix}-1&0\\ 0&1\end{bmatrix}\begin{bmatrix}x_{1}\\ x_{2}\end{bmatrix}$$is the reflection with respect to the $y$-axis. (In matrix form the vector $\mathbf{x}$ is understood as a column vector.) On the other hand, given any plane in $\mathbb{R}^{3}$, one may consider the reflection with respect to this plane. For instance, $$T\mathbf{x}=\begin{bmatrix}1&0&0\\ 0&1&0\\ 0&0&-1\end{bmatrix}\begin{bmatrix}x_{1}\\ x_{2}\\ x_{3}\end{bmatrix}$$ is the reflection with respect to the $xy$-plane in $\mathbb{R}^{3}$. The reflection with respect to any straight line in $\mathbb{R}^{2}$ or with respect to any plane in $\mathbb{R}^{3}$ can be defined similarly.
|
||||
3. The (counterclockwise) rotation of angle $\theta$ in $\mathbb{R}^{2}$ is given by the Euclidean motion $$T\mathbf{x}=\begin{bmatrix}\cos\theta&-\sin\theta\\ \sin\theta&\cos\theta\end{bmatrix}\begin{bmatrix}x_{1}\\ x_{2}\end{bmatrix}\,\quad\theta\in(0,2\pi)\.$$ In $\mathbb{R}^{3}$, one can perform a rotation around a fixed axis. For instance, the rotation $$T\mathbf{x}=\begin{bmatrix}\cos\theta&-\sin\theta&0\\ \sin\theta&\cos\theta&0\\ 0&0&1\end{bmatrix}\begin{bmatrix}x_{1}\\ x_{2}\\ x_{3}\end{bmatrix}$$ takes the $z$-axis as its axis of rotation.
|
||||
|
||||
Let us verify that Euclidean motions are closed under compositions. Let $T\mathbf{x}=A\mathbf{x}+\mathbf{b}$ and $S\mathbf{x}=B\mathbf{x}+\mathbf{c}$ be two Euclidean motions. Its composition is given by
|
||||
|
||||
$$ST\mathbf{x}=B(A\mathbf{x}+\mathbf{b})+\mathbf{c}=C\mathbf{x}+\mathbf{d}\, \quad C\equiv BA,\quad\mathbf{d}=B\mathbf{b}+\mathbf{c}\.$$
|
||||
|
||||
As
|
||||
|
||||
$$C^{\prime}C = (BA)^{\prime}BA$$ $$= A^{\prime}B^{\prime}BA$$ $$= A^{\prime}IA$$ $$= I,$$
|
||||
|
||||
we conclude that $ST$ is again a Euclidean motion. Furthermore, we claim that each Euclidean motion admits an inverse. Indeed, letting $U\mathbf{x}=A^{-1}\mathbf{x}-A^{-1}\mathbf{b}$, which is obviously a Euclidean motion, we have
|
||||
|
||||
$$UT\mathbf{x}=A^{-1}(A\mathbf{x}+\mathbf{b})-A^{-1}\mathbf{b}=\mathbf{x}\.$$
|
||||
|
||||
Summing up, the collection of all Euclidean motions forms a group called the Euclidean group of $\mathbb{R}^{n}$. (It is alright if you have not learned what a group is. You will learn it in MATH2070.)
|
||||
|
||||
In the following we study the structure of Euclidean motions for $n=2,3$. Apparently it suffices to look at the orthogonal matrix $A$.
|
||||
|
||||
**Theorem 1.3**.: _In $\mathbb{R}^{2}$, every orthogonal matrix can be written as_
|
||||
|
||||
1. $$\begin{bmatrix}\cos\theta&-\sin\theta\\ \sin\theta&\cos\theta\end{bmatrix}\,$$
|
||||
2. $$\begin{bmatrix}\cos\theta&-\sin\theta\\ \sin\theta&\cos\theta\end{bmatrix}\begin{bmatrix}1&0\\ 0&-1\end{bmatrix}\,\quad\theta\in[0,2\pi)\.$$
|
||||
|
||||
_Case (a) is a genuine rotation for $\theta\in(0,2\pi)$ and reduces to the identity at $\theta=0$. Case (b) is the reflection with respect to the $x$-axis and then followed by a rotation._
|
||||
|
||||
Proof.: Let
|
||||
|
||||
$$A=\begin{bmatrix}a&b\\ c&d\end{bmatrix}\.$$
|
||||
|
||||
By orthogonality $A^{\prime}A=I$ we have
|
||||
|
||||
$$a^{2}+c^{2}=1\,\quad b^{2}+d^{2}=1\,\quad ab+cd=0\.$$
|
||||
|
||||
Since $a$ is a number between $-1$ and $1$, we can find a unique $\theta\in[0,2\pi)$ such that $a=\cos\theta,\ c=\sin\theta$. Then either $b=-\sin\theta,\ d=\cos\theta$ or $b=\sin\theta,d=-\cos\theta$, so (a) or (b) must hold.
|
||||
|
||||
In the following we consider the three dimensional case. Denote by $R_{z}(\theta)$ the rotation around the $z$-axis by an angle $\theta$:
|
||||
|
||||
$$R_{z}(\theta)=\begin{bmatrix}\cos\theta&-\sin\theta&0\\ \sin\theta&\cos\theta&0\\ 0&0&1\end{bmatrix}\.$$
|
||||
|
||||
Similarly we define
|
||||
|
||||
$$R_{x}(\theta)=\begin{bmatrix}1&0&0\\ 0&\cos\theta&-\sin\theta\\ 0&\sin\theta&\cos\theta\end{bmatrix}\,$$
|
||||
|
||||
and
|
||||
|
||||
$$R_{y}(\theta)=\begin{bmatrix}\cos\theta&0&-\sin\theta\\ 0&1&0\\ \sin\theta&0&\cos\theta\end{bmatrix}\.$$
|
||||
|
||||
Also denote by $L_{z}$ the reflection with respect to the $xy$-plane:
|
||||
|
||||
$$L_{z}=\begin{bmatrix}1&0&0\\ 0&1&0\\ 0&0&-1\end{bmatrix}\.$$Similarly
|
||||
|
||||
$$L_{x}=\begin{bmatrix}-1&0&0\\ 0&1&0\\ 0&0&1\end{bmatrix}\,$$
|
||||
|
||||
and
|
||||
|
||||
$$L_{y}=\begin{bmatrix}1&0&0\\ 0&-1&0\\ 0&0&1\end{bmatrix}\.$$
|
||||
|
||||
**Theorem 1.4**.: _In $\mathbb{R}^{3}$, every orthogonal matrix can be written as_
|
||||
|
||||
1. $R_{z}(\alpha)R_{x}(\beta)R_{z}(\gamma)$__, _or_
|
||||
2. $R_{z}(\alpha)R_{x}(\beta)R_{z}(\gamma)L_{z}$__,_
|
||||
|
||||
_for some $\alpha,\beta,$ and $\gamma$._
|
||||
|
||||
Proof.: Let $A=\left(a_{ij}\right),\ i,j=1,2,3,$ be orthogonal. We have
|
||||
|
||||
$$\begin{bmatrix}\cos\theta&-\sin\theta&0\\ \sin\theta&\cos\theta&0\\ 0&0&1\end{bmatrix}\begin{bmatrix}a_{11}&a_{12}&a_{13}\\ a_{21}&a_{22}&a_{23}\\ a_{31}&a_{32}&a_{33}\end{bmatrix}$$
|
||||
|
||||
$$=\begin{bmatrix}\cos\theta a_{11}-\sin\theta a_{21}&\cos\theta a_{12}-\sin \theta a_{22}&\cos\theta a_{13}-\sin\theta a_{23}\\ \sin\theta a_{11}+\cos\theta a_{21}&\sin\theta a_{12}+\cos\theta a_{22}&\sin \theta a_{13}+\cos\theta a_{23}\\ a_{31}&a_{32}&a_{33}\end{bmatrix}\.$$
|
||||
|
||||
Choose $\theta$ so that $\cos\theta a_{13}-\sin\theta a_{23}=0$ and write the resulting matrix as
|
||||
|
||||
$$\begin{bmatrix}b_{11}&b_{12}&0\\ b_{21}&b_{22}&b_{23}\\ b_{31}&b_{32}&b_{33}\end{bmatrix}\.$$
|
||||
|
||||
We further have
|
||||
|
||||
$$R_{x}(\varphi)\begin{bmatrix}b_{11}&b_{12}&0\\ b_{21}&b_{22}&b_{23}\\ b_{31}&b_{32}&b_{33}\end{bmatrix}=\begin{bmatrix}b_{11}&b_{12}&0\\ \cos\varphi b_{21}-\sin\varphi b_{31}&\cos\varphi b_{22}-\sin\varphi b_{32}&\cos\varphi b_{23}-\sin\varphi b_{33}\\ \sin\varphi b_{21}+\cos\varphi b_{31}&\sin\varphi b_{22}+\cos\varphi b_{32}&\sin\varphi b_{23}+\cos\varphi b_{33}\end{bmatrix}\.$$
|
||||
|
||||
Choose $\varphi$ so that $\cos\varphi b_{23}-\sin\varphi b_{33}=0$ and write the resulting matrix as
|
||||
|
||||
$$\begin{bmatrix}c_{11}&c_{12}&0\\ c_{21}&c_{22}&0\\ c_{31}&c_{32}&c_{33}\end{bmatrix}\.$$This matrix is the product of three orthogonal matrices, again it is orthogonal. Therefore, $c_{33}=\pm 1$. Moreover, from
|
||||
|
||||
$$c_{11}\times 0+c_{21}\times 0+c_{31}\times c_{33}=0\,$$
|
||||
|
||||
we deduce $c_{31}=0$. Similarly, $c_{32}=0$. The matrix is in fact of the form
|
||||
|
||||
$$\begin{bmatrix}c_{11}&c_{12}&0\\ c_{21}&c_{22}&0\\ 0&0&\pm 1\end{bmatrix}\,$$
|
||||
|
||||
where the $2\times 2$-matrix is orthogonal. It can be written as $R_{z}(\gamma)$ or $R_{z}(\gamma)L_{z}$ for some $\gamma$ according to Theorem 1.3. We conclude that
|
||||
|
||||
$$R_{x}(\varphi)R_{z}(\theta)A=R_{z}(\gamma)\,$$
|
||||
|
||||
or
|
||||
|
||||
$$R_{x}(\varphi)R_{z}(\theta)A=R_{z}(\gamma)L_{z}\,$$
|
||||
|
||||
that is,
|
||||
|
||||
$$A=R_{z}(-\theta)R_{x}(-\varphi)R_{z}(\gamma)\,$$
|
||||
|
||||
or
|
||||
|
||||
$$A=R_{z}(-\theta)R_{x}(-\varphi)R_{z}(\gamma)L_{z}\.$$
|
||||
|
||||
The desired result follows by taking $\alpha=-\theta$ and $\beta=-\varphi$.
|
||||
|
||||
### 1.4 The Cross Product in $\mathbb{R}^{3}$
|
||||
|
||||
The cross product assigns a 3-vector to two given 3-vectors. There is no such product in the general dimension. Somehow it shows how special our physical space is. The cross product is important due to its relevance in physics and engineering.
|
||||
|
||||
Notations like $\mathbf{x},\mathbf{y},\mathbf{u}$ and $\mathbf{v}$ are common for vectors. We have used $\mathbf{x},\mathbf{y}$ in the previous sections. Now we use $\mathbf{u},\mathbf{v}$ in this one.
|
||||
|
||||
First the definition. Let $\mathbf{u},\mathbf{v}\in\mathbb{R}^{3}$, define the cross product of $\mathbf{u}=(u_{1},u_{2},u_{3})$ and $\mathbf{v}=(v_{1},v_{2},v_{3})$ to be
|
||||
|
||||
$$\mathbf{u}\times\mathbf{v}=\left(u_{2}v_{3}-u_{3}v_{2},-(u_{1}v_{3}-u_{3}v_{1 }),u_{1}v_{2}-u_{2}v_{1}\right)\.$$
|
||||
|
||||
In particular, we have
|
||||
|
||||
$$\mathbf{e}_{1}\times\mathbf{e}_{2}=\mathbf{e}_{3},\quad\mathbf{e}_{2}\times \mathbf{e}_{3}=\mathbf{e}_{1},\quad\mathbf{e}_{3}\times\mathbf{e}_{1}= \mathbf{e}_{2}\.$$To aid memorizing, we can formally express it as the determinant
|
||||
|
||||
$$\left|\begin{array}{ccc}\mathbf{e}_{1}&\mathbf{e}_{2}&\mathbf{e}_{3}\\ u_{1}&u_{2}&u_{3}\\ v_{1}&v_{2}&v_{3}\end{array}\right|\.$$
|
||||
|
||||
Expanding the determinant along the first row yields the formula above. Here it is formal because it does not make sense to put the unit vectors $\mathbf{e}_{1},\mathbf{e}_{2},\mathbf{e}_{3}$ as entries in a matrix.
|
||||
|
||||
The cross product is in many respects in sharp contrast with an ordinary product. Some of its properties are listed below:
|
||||
|
||||
**Theorem 1.5**.: _For $\boldsymbol{u},\boldsymbol{v},\boldsymbol{w}\in\mathbb{R}^{3}$,_
|
||||
|
||||
1. $(\alpha\boldsymbol{u}+\beta\boldsymbol{v})\times\boldsymbol{w}=\alpha \boldsymbol{u}\times\boldsymbol{w}+\beta\boldsymbol{v}\times\boldsymbol{w}\,\quad\forall \alpha,\beta\in\mathbb{R}$_._
|
||||
2. $\boldsymbol{u}\times\boldsymbol{v}=-\boldsymbol{v}\times\boldsymbol{u}$ _. In particular,_ $\boldsymbol{u}\times\boldsymbol{u}=0$ _._
|
||||
3. $(\boldsymbol{u}\times\boldsymbol{v})\times\boldsymbol{w}=\boldsymbol{u}\times (\boldsymbol{v}\times\boldsymbol{w})$ _is not always true._
|
||||
|
||||
The proofs of (a) and (b) are straightforward from the definition. As for (c), which asserts that the associative law does not hold, some examples suffice:
|
||||
|
||||
$$(\mathbf{e}_{1}\times\mathbf{e}_{2})\times\mathbf{e}_{2}=-\mathbf{e}_{1}\, \quad\mathbf{e}_{1}\times(\mathbf{e}_{2}\times\mathbf{e}_{2})=\boldsymbol{0}\,$$
|
||||
|
||||
and
|
||||
|
||||
$$(1,1,1)\times\big{(}(1,0,-1)\times(2,1,1)\big{)}=(4,0,-4)\,\quad\big{(}(1,1,1)\times(1,0,-1)\big{)}\times(2,1,1)=(3,-1,-5).$$
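These values can be checked numerically; a small NumPy sketch (not part of the text):

```python
import numpy as np

u, v, w = np.array([1, 1, 1]), np.array([1, 0, -1]), np.array([2, 1, 1])
print(np.cross(u, np.cross(v, w)))   # [ 4  0 -4]
print(np.cross(np.cross(u, v), w))   # [ 3 -1 -5]
```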
|
||||
|
||||
As a vector is completely determined by its magnitude and direction, let us consider the magnitude and direction of the cross product. First of all, using the definition of the cross product, one can verify directly that
|
||||
|
||||
$$\mathbf{u}\cdot(\mathbf{u}\times\mathbf{v})=0\,\quad\mathbf{v}\cdot(\mathbf{u} \times\mathbf{v})=0\,$$
|
||||
|
||||
so
|
||||
|
||||
$$(\alpha\mathbf{u}+\beta\mathbf{v})\cdot\mathbf{u}\times\mathbf{v}=0\,$$
|
||||
|
||||
that is, it is perpendicular to the two dimensional subspace spanned by the vectors $\mathbf{u}$ and $\mathbf{v}$. After the definition of a plane is introduced in the next chapter, we can say that the cross product of two linearly independent vectors points in the normal direction of the plane spanned by $\mathbf{u}$ and $\mathbf{v}$. When they are linearly dependent, their cross product is the zero vector, which does not form a normal direction. There are two normal directions, pointing upward or downward, so to speak. The choice of the direction of the cross product is determined by the right hand rule. That is, with the thumb making a right angle with the other four fingers of your right hand, first point the four fingers along the direction of $\mathbf{u}$ and then move them to $\mathbf{v}$ through an angle less than $\pi$. The direction of $\mathbf{u}\times\mathbf{v}$ is where your thumb points. To see this, one identifies the direction of $\mathbf{u}$ with $\mathbf{e}_{1}$. If $\mathbf{v}$ lies in the first or second quadrants, $\mathbf{v}=\alpha\mathbf{e}_{1}+\beta\mathbf{e}_{2},\ \beta>0$, and $\mathbf{u}\times\mathbf{v}$ points to $\mathbf{e}_{3}$. If $\mathbf{v}$ lies in the third or fourth quadrants, $\mathbf{v}=\alpha\mathbf{e}_{1}+\beta\mathbf{e}_{2},\ \beta<0$, and $\mathbf{u}\times\mathbf{v}$ points to $-\mathbf{e}_{3}$. This is the right hand rule.
|
||||
|
||||
We have described the direction of the cross product. How about its magnitude? We have
|
||||
|
||||
**Theorem 1.6**.: _For $\textbf{u},\textbf{v}\in\mathbb{R}^{3}$,_
|
||||
|
||||
$$|\textbf{u}\times\textbf{v}|=|\textbf{u}||\textbf{v}|\sin\theta,\quad\theta \in[0,\pi]\,$$
|
||||
|
||||
_where $\theta$ is the angle between **u** and **v**._
|
||||
|
||||
Proof.: The proof depends on the identity
|
||||
|
||||
$$|\textbf{u}\times\textbf{v}|^{2}=|\textbf{u}|^{2}|\textbf{v}|^{2}-(\textbf{u} \cdot\textbf{v})^{2}\.$$
|
||||
|
||||
Indeed, by brute force
|
||||
|
||||
$$|\textbf{u}\times\textbf{v}|^{2} = (u_{2}v_{3}-u_{3}v_{2})^{2}+(u_{1}v_{3}-u_{3}v_{1})^{2}+(u_{1}v_{ 2}-u_{2}v_{1})^{2}$$ $$= u_{2}^{2}v_{3}^{2}+u_{3}^{2}v_{2}^{2}+u_{1}^{2}v_{3}^{2}+u_{3}^{ 2}v_{1}^{2}+u_{1}^{2}v_{2}^{2}+u_{2}^{2}v_{1}^{2}-2u_{2}v_{3}u_{3}v_{2}-2u_{1}v _{3}u_{3}v_{1}-2u_{1}v_{2}u_{2}v_{1}\.$$
|
||||
|
||||
On the other hand,
|
||||
|
||||
$$|\textbf{u}|^{2}|\textbf{v}|^{2}-(\textbf{u}\cdot\textbf{v})^{2}$$ $$= (u_{1}^{2}+u_{2}^{2}+u_{3}^{2})(v_{1}^{2}+v_{2}^{2}+v_{3}^{2})-(u_ {1}v_{1}+u_{2}v_{2}+u_{3}v_{3})^{2}$$ $$= u_{2}^{2}v_{3}^{2}+u_{3}^{2}v_{2}^{2}+u_{1}^{2}v_{3}^{2}+u_{3}^{ 2}v_{1}^{2}+u_{1}^{2}v_{2}^{2}+u_{2}^{2}v_{1}^{2}-2u_{2}v_{3}u_{3}v_{2}-2u_{1} v_{3}u_{3}v_{1}-2u_{1}v_{2}u_{2}v_{1}\,$$
|
||||
|
||||
whence the identity holds. Now, by the Cosine Law,
|
||||
|
||||
$$|\textbf{u}\times\textbf{v}| = \sqrt{|\textbf{u}|^{2}|\textbf{v}|^{2}-|\textbf{u}|^{2}|\textbf{v}|^{2}\cos^{2}\theta} = |\textbf{u}||\textbf{v}||\sin\theta| = |\textbf{u}||\textbf{v}|\sin\theta\,$$
|
||||
|
||||
as $\sin\theta\geq 0$ on $[0,\pi]$.
|
||||
|
||||
In conclusion, the decomposition of the cross product into magnitude and direction is given by
|
||||
|
||||
$$\textbf{u}\times\textbf{v}=|\textbf{u}||\textbf{v}|\sin\theta\ \textbf{n}\,$$
|
||||
|
||||
where **n** is the unit vector determined by the right hand rule (when **u** and **v** are linearly independent, that is, when $\sin\theta\neq 0$).
|
||||
|
||||
**Corollary 1.7**.:
|
||||
1. _The area of the parallelogram spanned by_ **u** _and_ **v** _is equal to_ $|\textbf{u}\times\textbf{v}|$_._
|
||||
2. _The area of the triangle with two sides given by_ $u$ _and_ $v$ _is equal to_ $1/2|\textbf{u}\times\textbf{v}|$_._
|
||||
3. _The volume of the parallelepiped spanned by_ $u$,_ $v$ _and_ $w$ _is equal to_ $$V=|\textbf{w}\cdot(\textbf{u}\times\textbf{v})|\ \.$$
|
||||
|
||||
Proof.: (a) follows immediately from Theorem 1.6 and (b) from (a). To prove (c), we may assume **u** and **v** lie on the $xy$-plane after a rotation. The volume of the parallelepiped is given by the product of the area of the parallelogram spanned by **u** and **v** with its height. Now $|\textbf{u}\times\textbf{v}|$ is equal to the area of this parallelogram. On the other hand, its height is given by $|\textbf{w}\cdot\textbf{e}_{3}|$. Therefore, letting $\alpha$ be the angle between **w** and the $z$-axis,
|
||||
|
||||
$$|\textbf{w}\cdot(\textbf{u}\times\textbf{v})| = |\textbf{w}||\textbf{u}\times\textbf{v}||\cos\alpha|$$ $$= |\textbf{u}\times\textbf{v}|\ |\textbf{w}\cdot\textbf{e}_{3}|$$ $$= V\.$$
|
||||
|
||||
**Example 1.5**.: Determine if the four points
|
||||
|
||||
$$(1,0,1),\ (2,4,-6),\ (3,-1,5),\ (1,-9,19)\,$$
|
||||
|
||||
lie on the same plane in $\mathbb{R}^{3}$. Well, they lie on the same plane if and only if the parallelepiped formed by these vectors has zero volume. We compute the volume using this corollary after subtracting the first vector from the last three vectors (to make sure that the vectors are based at the origin):
|
||||
|
||||
$$(1,4,-7)\cdot\big{(}(2,-1,4)\times(0,-9,18)\big{)} = (1,4,-7)\cdot(18,-36,-18)$$ $$= 0\,$$
|
||||
|
||||
so they lie on the same plane.
|
||||
|
||||
**Comments on Chapter 1.**
|
||||
|
||||
**1.1.** A helping hand from linear algebra. In several occasions we need to solve homogeneous systems of linear equations. Let us review it by looking at some examples. First, consider the single equation
|
||||
|
||||
$$x-2y+6z=0,\quad(x,y,z)\in\mathbb{R}^{3}\.$$
|
||||
|
||||
To solve this equation means to find all possible $(x,y,z)$ satisfying this relation. Obviously $(0,0,0)$ is a solution, but there are many others, for instance, $(-6,0,1)$ and $(2,1,0)$ are also solutions. To find all solutions, we set $y=a$ and $z=b$. Then $x=2a-6b$, so $(2a-6b,a,b)=a(2,1,0)+b(-6,0,1)$ gives all solutions. A solution is obtained whenever values to $a$ and $b$ are assigned. We may say the solution is described by two parameters. Next, consider the system
|
||||
|
||||
$$\begin{cases}x-y+5z&=0\,\\ x+y-3z&=0\.\end{cases}$$
|
||||
|
||||
Setting $z=a$, the system becomes
|
||||
|
||||
$$\begin{cases}x-y&=-5a\,\\ x+y&=3a\,\end{cases}$$
|
||||
|
||||
which is readily solved to yield $x=-a$ and $y=4a$, so the general solution is given by $(x,y,z)=a(-1,4,1)$ where $a$ is the only parameter. From these two examples, we see there are three principles governing homogeneous linear systems.
|
||||
|
||||
* $\mathbf{0}$ is always a solution (the trivial solution).
|
||||
* The general solution consists of several free parameters. In most cases, the number of parameters is equal to $n-m$ where $n$ is the number of unknowns and $m$ is the number of equations.
|
||||
* In some exceptional cases, the number of parameters is greater than $n-m$.
|
||||
|
||||
Exceptional cases come up when the linear system is kind of cheating us. For instance, look at
|
||||
|
||||
$$\begin{cases}x-y+5z&=0\,\\ 2x-2y+10z&=0\.\end{cases}$$
|
||||
|
||||
The second equation in this system comes from multiplying the first equation by 2, so essentially there is only one equation in this system. Its general solution contains two parameters instead of one. This situation occurs in the study of standard forms in Section 2.4.
|
||||
|
||||
**1.2.** The Law of Cosines states the following: let $\Delta ABC$ be a triangle with $a=\overline{CB},\ b=\overline{AC}$, $c=\overline{BA}$, and $\phi=\angle ACB$. Then
|
||||
|
||||
$$c^{2}=a^{2}+b^{2}-2ab\cos\phi\.$$
|
||||
|
||||
To prove it let $h=\overline{AH}$ be the height of the triangle from $A$ and $s$ be $\overline{CH}$. By Pythagoras Theorem,
|
||||
|
||||
$$c^{2}=h^{2}+(a-s)^{2}=b^{2}-s^{2}+(a-s)^{2}=b^{2}-2as+a^{2}\,$$
|
||||
|
||||
implies $s=(a^{2}+b^{2}-c^{2})/2a$, and the Cosine Law follows after noting $s=b\cos\phi$.
|
||||
|
||||
**1.3.** You may look up Wikipedia under "Euclidean motion" and "orthogonal matrix" to find further information on these interesting topics. How the cross product is used in physics can also be found in Wikipedia under "cross product". From the same source you will see how the cross product arises from the Lie algebra of the orthogonal group.
|
||||
|
||||
**Supplementary Reading**
|
||||
|
||||
1.1 and 1.2 in chapter 1, [Au].
|
161
documents/mmds/linear_algebra_for_nn.mmd
Normal file
|
@@ -0,0 +1,161 @@
|
|||
from:
|
||||
|
||||
**N.J., Smelter, & P.B., Baltes (Eds.) (2001).**
|
||||
|
||||
**Encyclopedia of the Social and Behavioral Sciences.**
|
||||
|
||||
**London: Elsevier Science.**
|
||||
|
||||
**Article Title: Linear Algebra for Neural Networks**
|
||||
|
||||
**By: Herve Abdi**
|
||||
|
||||
**Author Address:** Herve Abdi, School of Human Development, MS: Gr.4.1, The University of Texas at Dallas, Richardson, TX 750833-0688, USA
|
||||
|
||||
**Phone:** 972 883 2065, **fax:** 972 883 2491 **Date:** June 1, 2001
|
||||
|
||||
**E-mail:** herve@utdallas.edu
|
||||
|
||||
**Abstract**
|
||||
|
||||
Neural networks are quantitative models which learn to associate input and output patterns adaptively with the use of learning algorithms. We expose four main concepts from linear algebra which are essential for analyzing these models: 1) the projection of a vector, 2) the eigen and singular value decomposition, 3) the gradient vector and Hessian matrix of a vector function, and 4) the Taylor expansion of a vector function. We illustrate these concepts by the analysis of the Hebbian and Widrow-Hoff rules and some basic neural network architectures (i.e., the linear autoassociator, the linear heteroassociator, and the error backpropagation network). We show also that neural networks are equivalent to iterative versions of standard statistical and optimization models such as multiple regression analysis and principal component analysis.
|
||||
|
||||
## 1 Introduction
|
||||
|
||||
Linear algebra is particularly well suited to analyze the class of neural networks called _associators_. These quantitative models learn to associate input and out
|
||||
|
||||
[MISSING_PAGE_FAIL:2]
|
||||
|
||||
### Projection of one vector onto another vector
|
||||
|
||||
The (orthogonal) projection of vector $\boldsymbol{x}$ on vector $\boldsymbol{w}$ is defined as
|
||||
|
||||
$$\mathsf{proj}_{\boldsymbol{w}}\boldsymbol{x}=\frac{\boldsymbol{x}^{\mskip-1.5mu \mathsf{T}} \boldsymbol{w}}{\boldsymbol{w}^{\mskip-1.5mu \mathsf{T}}\boldsymbol{w}} \boldsymbol{w}=\mathsf{cos}(\boldsymbol{x},\boldsymbol{w})\times\frac{\| \boldsymbol{x}\|}{\|\boldsymbol{w}\|}\boldsymbol{w}. \tag{3}$$
|
||||
|
||||
The norm of $\mathsf{proj}_{\boldsymbol{w}}\boldsymbol{x}$ is its distance to the origin of the space. It is equal to
|
||||
|
||||
$$\|\mathsf{proj}_{\boldsymbol{w}}\boldsymbol{x}\|=\frac{|\boldsymbol{x}^{\mskip-1.5mu \mathsf{T}}\boldsymbol{w}|}{\|\boldsymbol{w}\|}=|\mathsf{cos}(\boldsymbol{x},\boldsymbol{w})|\times\|\boldsymbol{x}\|\enspace. \tag{4}$$
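A small numerical sketch of Equations 3 and 4 (NumPy, for illustration only):

```python
import numpy as np

x = np.array([3.0, 1.0])
w = np.array([2.0, 0.0])

proj = (x @ w) / (w @ w) * w                   # Equation 3
norm_proj = abs(x @ w) / np.linalg.norm(w)     # Equation 4
print(proj, norm_proj)                         # [3. 0.] 3.0
```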
|
||||
|
||||
### Illustration: Hebbian and Widrow-Hoff learning rules
|
||||
|
||||
A neural network consists of cells connected to other cells via modifiable weighted _connections_ called _synapses_. Consider a network made of $I$ input cells and only one output cell. The information is transmitted via the synapses, from a set of external input cells to the output cell which gives a response corresponding to its state of activation. If the input pattern and the set of synaptic weights are given by $I$-dimensional vectors denoted $\boldsymbol{x}$, and $\boldsymbol{w}$, the activation of the output cell is obtained as
|
||||
|
||||
$$a=\boldsymbol{x}^{\mskip-1.5mu \mathsf{T}}\boldsymbol{w}. \tag{5}$$
|
||||
|
||||
So the activation is proportional to the norm of the projection of the input vector onto the weight vector. The _response_ or _output_ of the cell is denoted $o$. For a _linear cell_, it is proportional to the activation (for convenience, assume that the proportionality constant is equal to $1$). _Linear heteroassociators_ and _autoassociators_ are made of linear cells. In general, the output of a cell is a _function_ (often, but not necessarily, continuous), called the _transfer function_, of its activation
|
||||
|
||||
$$o=f\left(a\right)\enspace. \tag{6}$$
|
||||
|
||||
For example, in _backpropagation networks_, the (nonlinear) transfer function is usually the logistic function
|
||||
|
||||
$$o=f\left(a\right)=\operatorname{logist}\boldsymbol{w}^{\mskip-1.5mu \mathsf{T}} \boldsymbol{x}=\frac{1}{1+\exp\{-a\}}\enspace. \tag{7}$$
|
||||
|
||||
Often, a neural network is designed to associate, to a given input, a specific response called the target, denoted $t$. _Learning_ is equivalent to defining a rule that specifies how to add a small quantity to each synaptic weight at each learning iteration. Learning makes the output of the network closer to the target.
|
||||
|
||||
Learning rules come in two main flavors: _supervised_ (_e.g.,_ Widrow-Hoff) which take into account the error or distance between the response of the neuron and the target and _unsupervised_ (_e.g.,_ Hebbian) which do not require such "feedback." The Hebbian learning rule modifies the weight vector at iteration $n+1$ as
|
||||
|
||||
$$\boldsymbol{w}_{[n+1]}=\boldsymbol{w}_{[n]}+\eta t\boldsymbol{x}\, \tag{8}$$
|
||||
|
||||
[MISSING_PAGE_FAIL:4]
|
||||
|
||||
if $\lambda\neq 1$). There are, in general, several eigenvectors for a given matrix (at most as many as the dimension of $\mathbf{W}$). They are in general ordered by decreasing order of their eigenvalue. So, the first eigenvector, $\mathbf{u}_{1}$ has the largest eigenvalue $\lambda_{1}$. The number of eigenvectors with a non-zero eigenvalue is the _rank_ of the matrix.
|
||||
|
||||
The eigenvalues of positive semi-definite matrices are always positive or zero (a matrix with strictly positive eigenvalues, is _positive definite_). Also, any two eigenvectors with different eigenvalues are orthogonal, i.e.
|
||||
|
||||
$$\mathbf{u}_{\ell}^{\mathsf{T}}\mathbf{u}_{\ell^{\prime}}=0\qquad\forall\quad\ell\neq \ell^{\prime}. \tag{15}$$
|
||||
|
||||
In addition, the set of eigenvectors of a matrix constitutes an orthogonal basis for its rows and columns. This is expressed by defining two matrices: the eigenvector matrix $\mathbf{U}$, and the diagonal matrix of the eigenvalues: $\mathbf{\Lambda}$. The eigendecomposition of $\mathbf{W}$ (with rank $L$) is
|
||||
|
||||
$$\mathbf{W}=\mathbf{U}\mathbf{\Lambda}\mathbf{U}^{\mathsf{T}}=\sum_{\ell}^{L}\lambda_{\ell}\bm {u}_{\ell}\mathbf{u}_{\ell}^{\mathsf{T}},\ \text{or equivalently:}\quad\mathbf{\Lambda}=\mathbf{U}^{ \mathsf{T}}\mathbf{W}\mathbf{U}. \tag{16}$$
|
||||
|
||||
The _singular value decomposition_ (svd) generalizes the eigendecomposition to rectangular matrices. If $\mathbf{X}$ is an $I\times K$ matrix, its svd is defined as
|
||||
|
||||
$$\mathbf{X}=\mathbf{U}\mathbf{\Delta}\mathbf{V}^{\mathsf{T}}\ \text{with}\ \mathbf{U}^{\mathsf{T}}\mathbf{U}=\mathbf{V}^{ \mathsf{T}}\mathbf{V}=\mathbf{I}\ \text{and}\ \mathbf{\Delta}\ \text{being a diagonal matrix}\ . \tag{17}$$
|
||||
|
||||
($\mathbf{I}$ being the _identity_ matrix). The diagonal elements of $\mathbf{\Delta}$ are real positive numbers called the _singular values_ of $\mathbf{X}$. The matrices $\mathbf{U}$ and $\mathbf{V}$ are the left and right matrices of singular vectors (which are also eigenvectors, see below). The svd is closely related to the eigendecomposition because $\mathbf{U}$, $\mathbf{V}$, and $\mathbf{\Delta}$ can be obtained from the eigendecomposition of matrices $\mathbf{X}^{\mathsf{T}}\mathbf{X}$ and $\mathbf{X}\mathbf{X}^{\mathsf{T}}$ as
|
||||
|
||||
$$\mathbf{X}\mathbf{X}^{\mathsf{T}}=\mathbf{U}\mathbf{\Lambda}\mathbf{U}^{\mathsf{T}},\quad\mathbf{X}^{ \mathsf{T}}\mathbf{X}=\mathbf{V}\mathbf{\Lambda}\mathbf{V}^{\mathsf{T}},\ \text{and}\ \mathbf{\Delta}=\mathbf{\Lambda}^{\frac{1}{2}} \tag{18}$$
|
||||
|
||||
(note that $\mathbf{X}^{\mathsf{T}}\mathbf{X}$ and
|
||||
|
||||
[MISSING_PAGE_EMPTY:6]
|
||||
|
||||
The derivative of $f\left(\mathbf{w}\right)$ with respect to the $I\times 1$ vector $\mathbf{w}$ is denoted by $\mathbf{\nabla}_{\mathbf{f}\left(\mathbf{w}\right)}$. It is also called the _gradient_ of $f$, i.e.,
|
||||
|
||||
$$\mathbf{\nabla}_{\mathbf{f}\left(\mathbf{w}\right)}=\frac{\partial f}{\partial\mathbf{w}}= \left[\frac{\partial f}{\partial w_{1}},\ldots,\frac{\partial f}{\partial w_{ i}},\ldots,\frac{\partial f}{\partial w_{I}}\right]^{\mathsf{T}}. \tag{26}$$
|
||||
|
||||
For example, the derivative of the output of a linear neuron is
|
||||
|
||||
$$\frac{\partial f}{\partial\mathbf{w}}=\left[\frac{\partial\mathbf{w}^{\mathsf{T}}\mathbf{x }}{\partial w_{1}},\ldots,\frac{\partial\mathbf{w}^{\mathsf{T}}\mathbf{x}}{\partial w_ {i}},\ldots,\frac{\partial\mathbf{w}^{\mathsf{T}}\mathbf{x}}{\partial w_{I}}\right]^{ \mathsf{T}}=\left[x_{1},\ldots,x_{i},\ldots,x_{I}\right]^{\mathsf{T}}=\mathbf{x}. \tag{27}$$
|
||||
|
||||
When a function is twice differentiable, the second order derivatives are stored in a matrix called the _Hessian_ matrix of the function. It is often denoted by $\mathbf{H}$ or $\mathbf{\nabla}_{\mathbf{f}}^{\mathsf{2}}$ and is formally defined as
|
||||
|
||||
$$\mathbf{H}=\mathbf{\nabla}_{\mathbf{f}}^{\mathsf{2}}=\begin{bmatrix}\frac{\partial^{2}f}{ \partial w_{1}^{2}}&\frac{\partial^{2}f}{\partial w_{1}w_{2}}&\ldots&\frac{ \partial^{2}f}{\partial w_{1}w_{I}}\\ \frac{\partial^{2}f}{\partial w_{2}w_{1}}&\frac{\partial^{2}f}{\partial w_{2} ^{2}}&\ldots&\frac{\partial^{2}f}{\partial w_{2}w_{I}}\\ \vdots&\vdots&\ddots&\vdots\\ \frac{\partial^{2}f}{\partial w_{I}w_{1}}&\frac{\partial^{2}f}{\partial w_{I} w_{2}}&\ldots&\frac{\partial^{2}f}{\partial w_{I}^{2}}\end{bmatrix}. \tag{28}$$
|
||||
|
||||
### Conditions for minimum
|
||||
|
||||
A standard problem is to show that a given learning rule finds an optimum solution in the sense that a function of the weight vector (or matrix) called the _error function_ reaches its minimum value when learning has converged. Often, the error function is defined as the sum of the squared error over all patterns.
|
||||
|
||||
When the gradient of the error function can be evaluated, a necessary condition for optimality (_i.e.,_ either minimum or maximum) is to find a weight vector $\widetilde{\mathbf{w}}$ such that
|
||||
|
||||
$$\mathbf{\nabla}_{\mathbf{f}\left(\widetilde{\mathbf{w}}\right)}=\mathbf{0}. \tag{29}$$
|
||||
|
||||
This condition is also sufficient provided $\mathbf{H}$ is positive definite (cf. Haykin, 1999).
|
||||
|
||||
### Taylor expansion
|
||||
|
||||
The Taylor expansion is the standard technique used to obtain a linear or a quadratic approximation of a function of one variable. Recall that the Taylor expansion of a continuous function $f(x)$ is
|
||||
|
||||
$$f(x) =f(a)+(x-a)\frac{f^{{}^{\prime}}(a)}{1!}+(x-a)^{2}\frac{f^{{}^{ \prime\prime}}(a)}{2!}+\ldots(x-a)^{n}\frac{f^{[n]}(a)}{n!}+\ldots$$ $$=f(a)+(x-a)\frac{f^{{}^{\prime}}(a)}{1!}+(x-a)^{2}\frac{f^{{}^{ \prime\prime}}(a)}{2!}+\mathcal{R}_{2}. \tag{30}$$(where $\mathcal{R}_{2}$ represents all the terms of higher order than 2, and $a$ is a "convenient" value at which to evaluate $f$).
|
||||
|
||||
This technique can be extended to matrix and vector functions. It involves the notion of gradient and Hessian. Now a vector function $f\left(\boldsymbol{x}\right)$ is expressed as:
|
||||
|
||||
$$f\left(\boldsymbol{x}\right)=f\left(\boldsymbol{a}\right)+\left(\boldsymbol{x}-\boldsymbol{a}\right)^{\mathsf{T}}\boldsymbol{\nabla}_{\boldsymbol{f(a)}}+\left(\boldsymbol{x}-\boldsymbol{a}\right)^{\mathsf{T}}\boldsymbol{\nabla}_{\boldsymbol{f(a)}}^{2}\left(\boldsymbol{x}-\boldsymbol{a}\right)+\mathcal{R}_{2}. \tag{31}$$
|
||||
|
||||
### Iterative minimization
|
||||
|
||||
A learning rule can be shown to converge to an optimum if it diminishes the value of the error function at each iteration. When the gradient of the error function can be evaluated, the _gradient_ technique (or _steepest descent_) adjusts the weight vector by moving it in the direction opposite to the gradient of the error function. Formally, the correction for the $(n+1)$-th iteration is
|
||||
|
||||
$$\boldsymbol{w}_{[n+1]}=\boldsymbol{w}_{[n]}+\boldsymbol{\Delta}=\boldsymbol{w }_{[n]}-\eta\boldsymbol{\nabla}_{\boldsymbol{f(w)}} \tag{32}$$
|
||||
|
||||
(where $\boldsymbol{\nabla}_{\boldsymbol{f(w)}}$ is computed for $\boldsymbol{w}_{[n]}$).
|
||||
|
||||
As an example, let us show that for a linear heteroassociator, the Widrow-Hoff learning rule minimizes iteratively the squared error between target and output. The error function is
|
||||
|
||||
$$e^{2}=(t-o)^{2}=t^{2}+o^{2}-2to=t^{2}+\boldsymbol{x}^{\mathsf{T}}\boldsymbol{ w}\boldsymbol{w}^{\mathsf{T}}\boldsymbol{x}-2t\boldsymbol{w}^{\mathsf{T}} \boldsymbol{x}. \tag{33}$$
|
||||
|
||||
The gradient of the error function is
|
||||
|
||||
$$\frac{\partial e}{\partial\boldsymbol{w}}=2(\boldsymbol{w}^{\mathsf{T}} \boldsymbol{x})\boldsymbol{x}-2t\boldsymbol{x}=-2(t-\boldsymbol{w}^{\mathsf{ T}}\boldsymbol{x})\boldsymbol{x}. \tag{34}$$
|
||||
|
||||
The weight vector is corrected by moving it in the opposite direction of the gradient. This is obtained by adding a small vector denoted $\boldsymbol{\Delta}_{\boldsymbol{w}}$ opposite to the gradient. This gives the following correction for iteration $n+1$:
|
||||
|
||||
$$\boldsymbol{w}_{[n+1]}=\boldsymbol{w}_{[n]}+\boldsymbol{\Delta}_{\boldsymbol{ w}}=\boldsymbol{w}_{[n]}-\eta\frac{\partial e}{\partial\boldsymbol{w}}= \boldsymbol{w}_{[n]}+\eta(t-\boldsymbol{w}^{\mathsf{T}}\boldsymbol{x}) \boldsymbol{x}=\boldsymbol{w}_{[n]}+\eta(t-o)\boldsymbol{x}. \tag{35}$$
|
||||
|
||||
This gives the rule defined by Equation 9.
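A minimal sketch of this learning rule in NumPy, assuming noiseless targets produced by a hypothetical "teacher" weight vector; the learning rate and the data are illustrative, not taken from the text:

```python
import numpy as np

rng = np.random.default_rng(0)
I = 5                              # number of input cells
w_true = rng.normal(size=I)        # hypothetical teacher weights producing the targets
w = np.zeros(I)                    # weight vector to be learned
eta = 0.05                         # learning rate (illustrative value)

for _ in range(2000):
    x = rng.normal(size=I)         # input pattern
    t = w_true @ x                 # target
    o = w @ x                      # output of the linear cell
    w = w + eta * (t - o) * x      # Widrow-Hoff correction, Equation 35

print(np.linalg.norm(w - w_true))  # close to zero: the rule has converged
```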
|
||||
|
||||
The gradient method works because the gradient of $\boldsymbol{w}_{[n]}$ is a first order Taylor approximation of the gradient of the optimal weight vector $\widetilde{\boldsymbol{w}}$. It is a favorite technique in neural networks because the popular error backpropagation is a gradient technique.
|
||||
|
||||
_Newton's method_ is a second order Taylor approximation, it uses the inverse of the Hessian of $\boldsymbol{w}$ (supposing it exists). It gives a better numerical approximation but necessitates more computation. Here the correction for iteration $n+1$ is
|
||||
|
||||
$$\boldsymbol{w}_{[n+1]}=\boldsymbol{w}_{[n]}+\boldsymbol{\Delta}=\boldsymbol{w} _{[n]}-(\boldsymbol{H}^{-1})(\boldsymbol{\nabla}_{\boldsymbol{f(w)}}) \tag{36}$$
|
||||
|
||||
(where $\boldsymbol{\nabla}_{\boldsymbol{f(w)}}$ is computed for $\boldsymbol{w}_{[n]}$).
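For a quadratic error the Hessian is constant and a single Newton step of Equation 36 lands exactly on the optimum; the sketch below illustrates this on an assumed toy function (not an example from the text):

```python
import numpy as np

# Assumed quadratic error f(w) = 0.5 * w^T H w - b^T w, so grad f(w) = H w - b
H = np.array([[3.0, 1.0],
              [1.0, 2.0]])
b = np.array([1.0, -1.0])

w = np.zeros(2)
w_newton = w - np.linalg.inv(H) @ (H @ w - b)   # one Newton step, Equation 36

print(np.allclose(H @ w_newton - b, 0.0))       # gradient vanishes: optimum reached in one step
```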
|
||||
|
||||
## Useful References
|
||||
|
||||
Linear algebra at the level of this presentation is available in the following recent books: Abdi _et al._ (1999), Bishop (1995), Ellacott and Bose (1996), Hagan, Demuth, and Beale (1996), Haykin (1999), Reed and Marks (1999), Ripley (1996), and Rojas (1996).
|
||||
|
||||
_See also:_ Artificial neural networks: neurocomputation; Backpropagation; Hebb, Donald Olding (1904-1985); Statistical pattern recognition.
|
||||
|
||||
## References
|
||||
|
||||
* [1] Abdi, H. (1994a) _Les réseaux de neurones_. Grenoble, France: PUG.
* [2] Abdi, H., Valentin, D., & Edelman, B. (1999) _Neural networks_. Thousand Oaks, CA: Sage.
* [3] Bishop, C. M. (1995) _Neural networks for pattern recognition_. Oxford, UK: Oxford University Press.
* [4] Ellacott, S., & Bose, D. (1996) _Neural networks: Deterministic methods of analysis_. London: ITC.
* [5] Hagan, M. T., Demuth, H. B., & Beale, M. (1996) _Neural network design_. Boston: PWS.
* [6] Haykin, S. (1999) _Neural networks: A comprehensive foundation_ (2nd ed.). New York: Prentice Hall.
* [7] Reed, R. D., & Marks, R. J. (1999) _Neural smithing_. Cambridge, MA: MIT Press.
* [8] Ripley, B. D. (1996) _Pattern recognition and neural networks_. Cambridge, UK: Cambridge University Press.
* [9] Rojas, R. (1996) _Neural networks_. New York: Springer-Verlag.
|
|
@ -1,381 +0,0 @@
|
|||
Orthogonal Matrices and the Singular Value Decomposition
|
||||
|
||||
Carlo Tomasi
|
||||
|
||||
The first Section below extends to $m\times n$ matrices the results on orthogonality and projection we have previously seen for vectors. The Sections thereafter use these concepts to introduce the Singular Value Decomposition (SVD) of a matrix, the pseudo-inverse, and its use for the solution of linear systems.
|
||||
|
||||
## 1 Orthogonal Matrices
|
||||
|
||||
Let ${\cal S}$ be an $n$-dimensional subspace of ${\bf R}^{m}$ (so that we necessarily have $n\leq m$), and let ${\bf v}_{1},\ldots,{\bf v}_{n}$ be an orthonormal basis for ${\cal S}$. Consider a point $P$ in ${\cal S}$. If the coordinates of $P$ in ${\bf R}^{m}$ are collected in an $m$-dimensional vector
|
||||
|
||||
$${\bf p}=\left[\begin{array}{c}p_{1}\\ \vdots\\ p_{m}\end{array}\right]\,$$
|
||||
|
||||
and since $P$ is in ${\cal S}$, it must be possible to write ${\bf p}$ as a linear combination of the ${\bf v}_{j}$s. In other words, there must exist coefficients
|
||||
|
||||
$${\bf q}=\left[\begin{array}{c}q_{1}\\ \vdots\\ q_{n}\end{array}\right]$$
|
||||
|
||||
such that
|
||||
|
||||
$${\bf p}=q_{1}{\bf v}_{1}+\ldots+q_{n}{\bf v}_{n}=V{\bf q}$$
|
||||
|
||||
where
|
||||
|
||||
$$V=\left[\begin{array}{ccc}{\bf v}_{1}&\cdots&{\bf v}_{n}\end{array}\right]$$
|
||||
|
||||
is an $m\times n$ matrix that collects the basis for ${\cal S}$ as its columns. Then for any $i=1,\ldots,n$ we have
|
||||
|
||||
$${\bf v}_{i}^{T}{\bf p}={\bf v}_{i}^{T}\sum_{j=1}^{n}q_{j}{\bf v}_{j}=\sum_{j=1 }^{n}q_{j}{\bf v}_{i}^{T}{\bf v}_{j}=q_{i}\,$$
|
||||
|
||||
since the ${\bf v}_{j}$ are orthonormal. This is important, and may need emphasis:
|
||||
|
||||
_If_
|
||||
|
||||
$${\bf p}=\sum_{j=1}^{n}q_{j}{\bf v}_{j}$$
|
||||
|
||||
_and the vectors of the basis ${\bf v}_{1},\ldots,{\bf v}_{n}$ are orthonormal, then the coefficients $q_{j}$ are the signed magnitudes of the projections of ${\bf p}$ onto the basis vectors:_
|
||||
|
||||
$$q_{j}={\bf v}_{j}^{T}{\bf p}. \tag{1}$$
In matrix form,
|
||||
|
||||
$${\bf q}=V^{T}{\bf p}\;. \tag{2}$$
|
||||
|
||||
Also, we can collect the $n^{2}$ equations
|
||||
|
||||
$${\bf v}_{i}^{T}{\bf v}_{j}=\left\{\begin{array}{ll}1&\mbox{ if }i=j\\ 0&\mbox{ otherwise}\end{array}\right.$$
|
||||
|
||||
into the following matrix equation:
|
||||
|
||||
$$V^{T}V=I \tag{3}$$
|
||||
|
||||
where $I$ is the $n\times n$ identity matrix. A matrix $V$ that satisfies equation (3) is said to be _orthogonal_. Thus, a matrix is orthogonal if its columns are orthonormal. Since the _left inverse_ of a matrix $V$ is defined as the matrix $L$ such that
|
||||
|
||||
$$LV=I\;, \tag{4}$$
|
||||
|
||||
comparison with equation (3) shows that the left inverse of an orthogonal matrix $V$ exists, and is equal to the transpose of $V$.
|
||||
|
||||
Of course, this argument requires $V$ to be full rank, so that the solution $L$ to equation (4) is unique. However, $V$ is certainly full rank, because it is made of orthonormal columns.
|
||||
|
||||
Notice that $VR=I$ cannot possibly have a solution when $m>n$, because the $m\times m$ identity matrix has $m$ linearly independent 1 columns, while the columns of $VR$ are linear combinations of the $n$ columns of $V$, so $VR$ can have at most $n$ linearly independent columns.
|
||||
|
||||
Footnote 1: Nay, orthonormal.
|
||||
|
||||
Of course, this result is still valid when $V$ is $m\times m$ and has orthonormal columns, since equation (3) still holds. However, for square, full-rank matrices ($r=m=n$), the distinction between left and right inverse vanishes, as we saw in class. Since the matrix $VV^{T}$ contains the inner products between the _rows_ of $V$ (just as $V^{T}V$ is formed by the inner products of its _columns_), the argument above shows that the rows of a _square_ orthogonal matrix are orthonormal as well. We can summarize this discussion as follows:
|
||||
|
||||
**Theorem 1.1**: _The left inverse of an orthogonal $m\times n$ matrix $V$ with $m\geq n$ exists and is equal to the transpose of $V$:_
|
||||
|
||||
$$V^{T}V=I\;.$$
|
||||
|
||||
_In particular, if $m=n$, the matrix $V^{-1}=V^{T}$ is also the right inverse of $V$:_
|
||||
|
||||
$$V\mbox{ square}\quad\Rightarrow\;\;V^{-1}V=V^{T}V=VV^{-1}=VV^{T}=I\;.$$
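A small numerical check of Theorem 1.1, assuming a matrix with orthonormal columns built with a QR factorization (an arbitrary construction chosen for the example):

```python
import numpy as np

rng = np.random.default_rng(0)
m, n = 5, 3
V, _ = np.linalg.qr(rng.normal(size=(m, n)))   # m x n matrix with orthonormal columns

print(np.allclose(V.T @ V, np.eye(n)))         # V^T is a left inverse of V
print(np.allclose(V @ V.T, np.eye(m)))         # False: no right inverse when m > n
```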
|
||||
|
||||
Sometimes, when $m=n$, the geometric interpretation of equation (2) causes confusion, because two interpretations of it are possible. In the interpretation given above, the point $P$ remains the same, and the underlying reference frame is changed from the elementary vectors ${\bf e}_{j}$ (that is, from the columns of $I$) to the vectors ${\bf v}_{j}$ (that is, to the columns of $V$). Alternatively, equation (2) can be seen as a transformation, in a fixed reference system, of point $P$ with coordinates ${\bf p}$ into a different point $Q$ with coordinates ${\bf q}$. This, however, is relativity, and should not be surprising: If you spin clockwise on your feet, or if you stand still and the whole universe spins counterclockwise around you, the result is the same.2
|
||||
|
||||
Footnote 2: At least geometrically. One solution may be more efficient than the other in other ways.
|
||||
|
||||
Consistently with either of these geometric interpretations, we have the following result:
|
||||
|
||||
**Theorem 1.2**: _The norm of a vector ${\bf x}$ is not changed by multiplication by an orthogonal matrix $V$:_
|
||||
|
||||
$$\|V{\bf x}\|=\|{\bf x}\|\.$$
|
||||
|
||||
**Proof.**
|
||||
|
||||
$$\|V{\bf x}\|^{2}={\bf x}^{T}V^{T}V{\bf x}={\bf x}^{T}{\bf x}=\|{\bf x}\|^{2}\.$$
|
||||
|
||||
$\Delta$
|
||||
|
||||
We conclude this section with an obvious but useful consequence of orthogonality. In an earlier note, we defined the projection ${\bf p}$ of a vector ${\bf b}$ onto another vector ${\bf c}$ as the point on the line through ${\bf c}$ that is closest to ${\bf b}$. This notion of projection can be extended from lines to vector spaces by the following definition: The _projection_${\bf p}$ of a point ${\bf b}\in{\bf R}^{n}$_onto a subspace_$C$ is the point in $C$ that is closest to ${\bf b}$.
|
||||
|
||||
Also, for _unit_ vectors ${\bf c}$, the projection matrix is ${\bf c}{\bf c}^{T}$, and the vector ${\bf b}-{\bf p}$ is orthogonal to ${\bf c}$. An analogous result holds for subspace projection, as the following theorem shows.
|
||||
|
||||
**Theorem 1.3**: _Let $U$ be an orthogonal matrix. Then the matrix $UU^{T}$ projects any vector ${\bf b}$ onto ${\rm range}(U)$. Furthermore, the difference vector between ${\bf b}$ and its projection ${\bf p}$ onto ${\rm range}(U)$ is orthogonal to ${\rm range}(U)$:_
|
||||
|
||||
$$U^{T}({\bf b}-{\bf p})={\bf 0}\.$$
|
||||
|
||||
**Proof.** A point ${\bf p}$ in ${\rm range}(U)$ is a linear combination of the columns of $U$:
|
||||
|
||||
$${\bf p}=U{\bf x}$$
|
||||
|
||||
where ${\bf x}$ is the vector of coefficients (as many coefficients as there are columns in $U$). The squared distance between ${\bf b}$ and ${\bf p}$ is
|
||||
|
||||
$$\|{\bf b}-{\bf p}\|^{2}=({\bf b}-{\bf p})^{T}({\bf b}-{\bf p})={\bf b}^{T}{\bf b }+{\bf p}^{T}{\bf p}-2{\bf b}^{T}{\bf p}={\bf b}^{T}{\bf b}+{\bf x}^{T}U^{T}U{ \bf x}-2{\bf b}^{T}U{\bf x}\.$$
|
||||
|
||||
Because of orthogonality, $U^{T}U$ is the identity matrix, so
|
||||
|
||||
$$\|{\bf b}-{\bf p}\|^{2}={\bf b}^{T}{\bf b}+{\bf x}^{T}{\bf x}-2{\bf b}^{T}U{\bf x }\.$$
|
||||
|
||||
The derivative of this squared distance with respect to ${\bf x}$ is the vector
|
||||
|
||||
$$2{\bf x}-2U^{T}{\bf b}$$
|
||||
|
||||
which is zero iff
|
||||
|
||||
$${\bf x}=U^{T}{\bf b}\,$$
|
||||
|
||||
that is, when
|
||||
|
||||
$${\bf p}=U{\bf x}=UU^{T}{\bf b}$$
|
||||
|
||||
as promised.
|
||||
|
||||
For this value of ${\bf p}$ the difference vector ${\bf b}-{\bf p}$ is orthogonal to ${\rm range}(U)$, in the sense that
|
||||
|
||||
$$U^{T}({\bf b}-{\bf p})=U^{T}({\bf b}-UU^{T}{\bf b})=U^{T}{\bf b}-U^{T}{\bf b}= {\bf 0}\.$$
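The following sketch checks Theorem 1.3 numerically for an assumed two-dimensional subspace of ${\bf R}^{6}$; the basis is generated at random, purely for illustration:

```python
import numpy as np

rng = np.random.default_rng(1)
m, n = 6, 2
U, _ = np.linalg.qr(rng.normal(size=(m, n)))   # orthonormal basis for a 2-D subspace of R^6
b = rng.normal(size=m)

p = U @ (U.T @ b)                              # projection of b onto range(U)
print(np.allclose(U.T @ (b - p), 0.0))         # b - p is orthogonal to range(U)
```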
|
||||
|
||||
## 2 The Singular Value Decomposition
|
||||
|
||||
Here is the main intuition captured by the Singular Value Decomposition (SVD) of a matrix:
|
||||
|
||||
An $m\times n$ matrix $A$ of rank $r$ maps the $r$-dimensional unit hypersphere in $\text{rowspace}(A)$ into an $r$-dimensional hyperellipse in $\text{range}(A)$.
|
||||
|
||||
Thus, a hypersphere is stretched or compressed into a hyperellipse, which is a quadratic hypersurface that generalizes the two-dimensional notion of ellipse to an arbitrary number of dimensions. In three dimensions, the hyperellipse is an ellipsoid, in one dimension it is a pair of points. In all cases, the hyperellipse in question is centered at the origin.
|
||||
|
||||
For instance, the rank-2 matrix
|
||||
|
||||
$$A=\frac{1}{\sqrt{2}}\left[\begin{array}{ccc}\sqrt{3}&\sqrt{3}\\ -3&3\\ 1&1\end{array}\right] \tag{5}$$
|
||||
|
||||
transforms the unit circle on the plane into an ellipse embedded in three-dimensional space. Figure 1 shows the map
|
||||
|
||||
$$\mathbf{b}=A\mathbf{x}\.$$
|
||||
|
||||
Two diametrically opposite points on the unit circle are mapped into the two endpoints of the major axis of the ellipse, and two other diametrically opposite points on the unit circle are mapped into the two endpoints of the minor axis of the ellipse. The lines through these two pairs of points on the unit circle are always orthogonal. This result can be generalized to any $m\times n$ matrix.
|
||||
|
||||
Simple and fundamental as this geometric fact may be, its proof by geometric means is cumbersome. Instead, we will prove it algebraically by first introducing the existence of the SVD and then using the latter to prove that matrices map hyperspheres into hyperellipses.
|
||||
|
||||
Figure 1: The matrix in equation (5) maps a circle on the plane into an ellipse in space. The two small boxes are corresponding points.
|
||||
|
||||
**Theorem 2.1**: _If $A$ is a real $m\times n$ matrix then there exist orthogonal matrices_
|
||||
|
||||
$$\begin{array}{rcl}U&=&\left[\begin{array}{ccc}{\bf u}_{1}&\cdots&{\bf u}_{m} \end{array}\right]\in{\cal R}^{m\times m}\\ V&=&\left[\begin{array}{ccc}{\bf v}_{1}&\cdots&{\bf v}_{n}\end{array}\right] \in{\cal R}^{n\times n}\end{array}$$
|
||||
|
||||
_such that_
|
||||
|
||||
$$U^{T}AV=\Sigma={\rm diag}(\sigma_{1},\ldots,\sigma_{p})\in{\cal R}^{m\times n}$$
|
||||
|
||||
_where $p=\min(m,n)$ and $\sigma_{1}\geq\ldots\geq\sigma_{p}\geq 0$. Equivalently,_
|
||||
|
||||
$$A=U\Sigma V^{T}\.$$
|
||||
|
||||
The columns of $V$ are the _right singular vectors_ of $A$, and those of $U$ are its _left singular vectors_. The diagonal entries of $\Sigma$ are the _singular values_ of $A$. The ratio
|
||||
|
||||
$$\kappa(A)=\sigma_{1}/\sigma_{p} \tag{6}$$
|
||||
|
||||
is the _condition number_ of $A$, and is possibly infinite.
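As a numerical illustration, the sketch below computes the SVD of the matrix of equation (5) with NumPy; for this matrix the singular values work out to 3 and 2, so $\kappa(A)=1.5$:

```python
import numpy as np

A = np.array([[np.sqrt(3), np.sqrt(3)],
              [-3.0,        3.0],
              [ 1.0,        1.0]]) / np.sqrt(2)   # the matrix of equation (5)

U, s, Vt = np.linalg.svd(A)                       # A = U Sigma V^T
print(s)                                          # [3. 2.]
print(s[0] / s[-1])                               # condition number kappa(A) = 1.5
print(np.allclose((U[:, :2] * s) @ Vt, A))        # reconstruction check
```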
|
||||
|
||||
**Proof.** Let ${\bf x}$ and ${\bf y}$ be unit vectors in ${\bf R}^{n}$ and ${\bf R}^{m}$, respectively, and consider the bilinear form
|
||||
|
||||
$$z={\bf y}^{T}A{\bf x}\.$$
|
||||
|
||||
The set
|
||||
|
||||
$${\cal S}\,=\,\{{\bf x},\,{\bf y}\,\,|\,{\bf x}\in{\bf R}^{n},\,\,{\bf y}\in{ \bf R}^{m},\,\,\|{\bf x}\|=\|{\bf y}\|=1\}$$
|
||||
|
||||
is compact, so that the scalar function $z({\bf x},{\bf y})$ must achieve a maximum value on ${\cal S}$, possibly at more than one point 3. Let ${\bf u}_{1},\,{\bf v}_{1}$ be two unit vectors in ${\bf R}^{m}$ and ${\bf R}^{n}$ respectively where this maximum is achieved, and let $\sigma_{1}$ be the corresponding value of $z$:
|
||||
|
||||
Footnote 3: Actually, at least at two points: if ${\bf u}_{1}^{T}A{\bf v}_{1}$ is a maximum, so is $(-{\bf u}_{1})^{T}A(-{\bf v}_{1})$.
|
||||
|
||||
$$\max_{\|{\bf x}\|=\|{\bf y}\|=1}{\bf y}^{T}A{\bf x}={\bf u}_{1}^{T}A{\bf v}_{1 }=\sigma_{1}\.$$
|
||||
|
||||
It is easy to see that ${\bf u}_{1}$ is parallel to the vector $A{\bf v}_{1}$. If this were not the case, their inner product ${\bf u}_{1}^{T}A{\bf v}_{1}$ could be increased by rotating ${\bf u}_{1}$ towards the direction of $A{\bf v}_{1}$, thereby contradicting the fact that ${\bf u}_{1}^{T}A{\bf v}_{1}$ is a maximum. Similarly, by noticing that
|
||||
|
||||
$${\bf u}_{1}^{T}A{\bf v}_{1}={\bf v}_{1}^{T}A^{T}{\bf u}_{1}$$
|
||||
|
||||
and repeating the argument above, we see that ${\bf v}_{1}$ is parallel to $A^{T}{\bf u}_{1}$.
|
||||
|
||||
The vectors ${\bf u}_{1}$ and ${\bf v}_{1}$ can be extended into orthonormal bases for ${\bf R}^{m}$ and ${\bf R}^{n}$, respectively. Collect these orthonormal basis vectors into orthogonal matrices $U_{1}$ and $V_{1}$. Then
|
||||
|
||||
$$U_{1}^{T}AV_{1}=S_{1}=\left[\begin{array}{cc}\sigma_{1}&{\bf 0}^{T}\\ {\bf 0}&A_{1}\end{array}\right]\.$$
|
||||
|
||||
In fact, the first column of $AV_{1}$ is $A{\bf v}_{1}=\sigma_{1}{\bf u}_{1}$, so the first entry of $U_{1}^{T}AV_{1}$ is ${\bf u}_{1}^{T}\sigma_{1}{\bf u}_{1}=\sigma_{1}$, and its other entries are ${\bf u}_{j}^{T}A{\bf v}_{1}=0$ because $A{\bf v}_{1}$ is parallel to ${\bf u}_{1}$ and therefore orthogonal, by construction, to$\mathbf{u}_{2},\ldots,\mathbf{u}_{m}$. A similar argument shows that the entries after the first in the first row of $S_{1}$ are zero: the row vector $\mathbf{u}_{1}^{T}A$ is parallel to $\mathbf{v}_{1}^{T}$, and therefore orthogonal to $\mathbf{v}_{2},\ldots,\mathbf{v}_{n}$, so that $\mathbf{u}_{1}^{T}A\mathbf{v}_{2}=\ldots=\mathbf{u}_{1}^{T}A\mathbf{v}_{n}=0$.
|
||||
|
||||
The matrix $A_{1}$ has one fewer row and column than $A$. We can repeat the same construction on $A_{1}$ and write
|
||||
|
||||
$$U_{2}^{T}A_{1}V_{2}=S_{2}=\left[\begin{array}{cc}\sigma_{2}&\mathbf{0}^{T} \\ \mathbf{0}&A_{2}\end{array}\right]$$
|
||||
|
||||
so that
|
||||
|
||||
$$\left[\begin{array}{cc}1&\mathbf{0}^{T}\\ \mathbf{0}&U_{2}^{T}\end{array}\right]U_{1}^{T}AV_{1}\left[\begin{array}{cc} 1&\mathbf{0}^{T}\\ \mathbf{0}&V_{2}\end{array}\right]=\left[\begin{array}{cc}\sigma_{1}&0& \mathbf{0}^{T}\\ 0&\sigma_{2}&\mathbf{0}^{T}\\ \mathbf{0}&\mathbf{0}&A_{2}\end{array}\right]\.$$
|
||||
|
||||
This procedure can be repeated until $A_{k}$ vanishes (zero rows or zero columns) to obtain
|
||||
|
||||
$$U^{T}AV=\Sigma$$
|
||||
|
||||
where $U^{T}$ and $V$ are orthogonal matrices obtained by multiplying together all the orthogonal matrices used in the procedure, and
|
||||
|
||||
$$\Sigma=\mathrm{diag}(\sigma_{1},\ldots,\sigma_{p})\.$$
|
||||
|
||||
Since matrices $U$ and $V$ are orthogonal, we can premultiply the matrix product in the theorem by $U$ and postmultiply it by $V^{T}$ to obtain
|
||||
|
||||
$$A=U\Sigma V^{T}\,$$
|
||||
|
||||
which is the desired result.
|
||||
|
||||
It only remains to show that the elements on the diagonal of $\Sigma$ are nonnegative and arranged in non-increasing order. To see that $\sigma_{1}\geq\ldots\geq\sigma_{p}$ (where $p=\min(m,n)$), we can observe that the successive maximization problems that yield $\sigma_{1},\ldots,\sigma_{p}$ are performed on a sequence of sets each of which contains the next. To show this, we just need to show that $\sigma_{2}\leq\sigma_{1}$, and induction will do the rest. We have
|
||||
|
||||
$$\sigma_{2}=\max_{\|\hat{\mathbf{x}}\|=\|\hat{\mathbf{y}}\|=1}\hat{\mathbf{y}}^{T}A_{1}\hat{\mathbf{x}}=\max_{\|\hat{\mathbf{x}}\|=\|\hat{\mathbf{y}}\|=1}\left[\begin{array}{c}0\\ \hat{\mathbf{y}}\end{array}\right]^{T}S_{1}\left[\begin{array}{c}0\\ \hat{\mathbf{x}}\end{array}\right]=\max_{\|\hat{\mathbf{x}}\|=\|\hat{\mathbf{y}}\|=1}\left[\begin{array}{c}0\\ \hat{\mathbf{y}}\end{array}\right]^{T}U_{1}^{T}AV_{1}\left[\begin{array}{c}0\\ \hat{\mathbf{x}}\end{array}\right]=\max_{\substack{\|\mathbf{x}\|=\|\mathbf{y}\|=1\\ \mathbf{x}^{T}\mathbf{v}_{1}=\mathbf{y}^{T}\mathbf{u}_{1}=0}}\mathbf{y}^{T}A\mathbf{x}\leq\sigma_{1}\;.$$
|
||||
|
||||
To explain the last equality above, consider the vectors
|
||||
|
||||
$$\mathbf{x}=V_{1}\left[\begin{array}{c}0\\ \hat{\mathbf{x}}\end{array}\right]\quad\mbox{and}\quad\mathbf{y}=U_{1}\left[ \begin{array}{c}0\\ \hat{\mathbf{y}}\end{array}\right]\.$$
|
||||
|
||||
The vector $\mathbf{x}$ is equal to the unit vector $[0\ \hat{\mathbf{x}}]^{T}$ transformed by the orthogonal matrix $V_{1}$, and is therefore itself a unit vector. In addition, it is a linear combination of $\mathbf{v}_{2},\ldots,\mathbf{v}_{n}$, and is therefore orthogonal to $\mathbf{v}_{1}$. A similar argument shows that $\mathbf{y}$ is a unit vector orthogonal to $\mathbf{u}_{1}$. Because $\mathbf{x}$ and $\mathbf{y}$ thus defined belong to subsets (actually sub-spheres) of the unit spheres in $\mathbf{R}^{n}$ and $\mathbf{R}^{m}$, we conclude that $\sigma_{2}\leq\sigma_{1}$.
|
||||
|
||||
The $\sigma_{i}$ are nonnegative because all these maximizations are performed on unit hyper-spheres. The $\sigma_{i}$s are maxima of the function $z(\mathbf{x},\mathbf{y})$ which always assumes both positive and negative values on any hyper-sphere: If $z(\mathbf{x},\mathbf{y})$ is negative, then $z(-\mathbf{x},\mathbf{y})$ is positive, and if $\mathbf{x}$ is on a hyper-sphere, so is $-\mathbf{x}$. $\Delta$
|
||||
|
||||
[MISSING_PAGE_FAIL:7]
|
||||
|
||||
Finally, both the 2-norm and the Frobenius norm
|
||||
|
||||
$$\|A\|_{F}=\sqrt{\sum_{i=1}^{m}\sum_{j=1}^{n}|a_{ij}|^{2}}$$
|
||||
|
||||
and
|
||||
|
||||
$$\|A\|_{2}=\sup_{{\bf x}\neq 0}\frac{\|A{\bf x}\|}{\|{\bf x}\|}$$
|
||||
|
||||
are neatly characterized in terms of the SVD:
|
||||
|
||||
$$\|A\|_{F}^{2} = \sigma_{1}^{2}+\ldots+\sigma_{p}^{2}$$ $$\|A\|_{2} = \sigma_{1}\.$$
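These two identities are easy to check numerically; the sketch below does so for an arbitrary random matrix (an illustrative choice):

```python
import numpy as np

rng = np.random.default_rng(2)
A = rng.normal(size=(4, 3))
s = np.linalg.svd(A, compute_uv=False)                        # singular values only

print(np.isclose(np.linalg.norm(A, 'fro')**2, np.sum(s**2)))  # ||A||_F^2 = sum of sigma_i^2
print(np.isclose(np.linalg.norm(A, 2), s[0]))                 # ||A||_2 = sigma_1
```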
|
||||
|
||||
In the next few sections we introduce fundamental results and applications that testify to the importance of the SVD.
|
||||
|
||||
## 3 The Pseudoinverse
|
||||
|
||||
One of the most important applications of the SVD is the solution of linear systems in the least squares sense. A linear system of the form
|
||||
|
||||
$$A{\bf x}={\bf b} \tag{7}$$
|
||||
|
||||
arising from a real-life application may or may not admit a solution, that is, a vector ${\bf x}$ that satisfies this equation exactly. Often more measurements are available than strictly necessary, because measurements are unreliable. This leads to more equations than unknowns (the number $m$ of rows in $A$ is greater than the number $n$ of columns), and equations are often mutually incompatible because they come from inexact measurements. Even when $m\leq n$ the equations can be incompatible, because of errors in the measurements that produce the entries of $A$. In these cases, it makes more sense to find a vector ${\bf x}$ that minimizes the norm
|
||||
|
||||
$$\|A{\bf x}-{\bf b}\|$$
|
||||
|
||||
of the _residual_ vector
|
||||
|
||||
$${\bf r}=A{\bf x}-{\bf b}\.$$
|
||||
|
||||
where the double bars henceforth refer to the Euclidean norm. Thus, ${\bf x}$ cannot exactly satisfy any of the $m$ equations in the system, but it tries to satisfy all of them as closely as possible, as measured by the sum of the squares of the discrepancies between left- and right-hand sides of the equations.
|
||||
|
||||
In other circumstances, not enough measurements are available. Then, the linear system (7) is under-determined, in the sense that it has fewer independent equations than unknowns (its rank $r$ is less than $n$).
|
||||
|
||||
Incompatibility and under-determinacy can occur together: the system admits no solution, and the least-squares solution is not unique. For instance, the system
|
||||
|
||||
$$x_{1}+x_{2} = 1$$ $$x_{1}+x_{2} = 3$$ $$x_{3} = 2$$
has three unknowns, but rank 2, and its first two equations are incompatible: $x_{1}+x_{2}$ cannot be equal to both 1 and 3. A least-squares solution turns out to be ${\bf x}=[1\ 1\ 2]^{T}$ with residual ${\bf r}=A{\bf x}-{\bf b}=[1\ -1\ 0]^{T}$, which has norm $\sqrt{2}$ (admittedly, this is a rather high residual, but this is the best we can do for this problem, in the least-squares sense). However, any other vector of the form
|
||||
|
||||
$${\bf x}^{\prime}=\left[\begin{array}{c}1\\ 1\\ 2\end{array}\right]+\alpha\left[\begin{array}{c}-1\\ 1\\ 0\end{array}\right]$$
|
||||
|
||||
is as good as ${\bf x}$. For instance, ${\bf x}^{\prime}=[0\ 2\ 2]$, obtained for $\alpha=1$, yields exactly the same residual as ${\bf x}$ (check this).
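This check is a one-liner in NumPy; the sketch below verifies that both candidate solutions give the same residual norm $\sqrt{2}$:

```python
import numpy as np

A = np.array([[1.0, 1.0, 0.0],
              [1.0, 1.0, 0.0],
              [0.0, 0.0, 1.0]])
b = np.array([1.0, 3.0, 2.0])

for x in (np.array([1.0, 1.0, 2.0]), np.array([0.0, 2.0, 2.0])):
    print(x, np.linalg.norm(A @ x - b))   # both residual norms equal sqrt(2)
```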
|
||||
|
||||
In summary, an exact solution to the system (7) may not exist, or may not be unique. An approximate solution, in the least-squares sense, always exists, but may fail to be unique.
|
||||
|
||||
If there are several least-squares solutions, all equally good (or bad), then one of them turns out to be shorter than all the others, that is, its norm $\|{\bf x}\|$ is smallest. One can therefore redefine what it means to "solve" a linear system so that there is always exactly one solution. This minimum norm solution is the subject of the following theorem, which both proves uniqueness and provides a recipe for the computation of the solution.
|
||||
|
||||
**Theorem 3.1**: _The minimum-norm least squares solution to a linear system $A{\bf x}={\bf b}$, that is, the shortest vector ${\bf x}$ that achieves the_
|
||||
|
||||
$$\min_{{\bf x}}\|A{\bf x}-{\bf b}\|\,$$
|
||||
|
||||
_is unique, and is given by_
|
||||
|
||||
$$\hat{{\bf x}}=V\Sigma^{\dagger}U^{T}{\bf b} \tag{8}$$
|
||||
|
||||
_where_
|
||||
|
||||
$$\Sigma^{\dagger}=\left[\begin{array}{cccccc}1/\sigma_{1}&&&0&\cdots&0\\ &\ddots&&\vdots&&\vdots\\ &&1/\sigma_{r}&0&\cdots&0\\ 0&\cdots&0&0&\cdots&0\\ \vdots&&\vdots&\vdots&&\vdots\\ 0&\cdots&0&0&\cdots&0\end{array}\right]$$
|
||||
|
||||
_is an $n\times m$ diagonal matrix._
|
||||
|
||||
The matrix
|
||||
|
||||
$$A^{\dagger}=V\Sigma^{\dagger}U^{T}$$
|
||||
|
||||
is called the _pseudoinverse_ of $A$.
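NumPy's `np.linalg.pinv` computes the Moore-Penrose pseudoinverse via the SVD; applied to the small system used above, it shows that the minimum-norm least-squares solution is ${\bf x}=[1\ 1\ 2]^{T}$, the shortest of all the equally good solutions:

```python
import numpy as np

A = np.array([[1.0, 1.0, 0.0],
              [1.0, 1.0, 0.0],
              [0.0, 0.0, 1.0]])
b = np.array([1.0, 3.0, 2.0])

x_hat = np.linalg.pinv(A) @ b            # minimum-norm least-squares solution, Equation 8
print(x_hat)                             # [1. 1. 2.]
print(np.linalg.norm(A @ x_hat - b))     # residual norm sqrt(2), as before
```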
|
||||
|
||||
**Proof.** The minimum-norm Least Squares solution to
|
||||
|
||||
$$A{\bf x}={\bf b}$$
|
||||
|
||||
is the shortest vector ${\bf x}$ that minimizes
|
||||
|
||||
$$\|A{\bf x}-{\bf b}\|$$
|
||||
|
||||
that is,
|
||||
|
||||
$$\|U\Sigma V^{T}{\bf x}-{\bf b}\|\.$$
|
||||
|
||||
[MISSING_PAGE_FAIL:10]
|
||||
|
||||
as promised. The residual, that is, the norm of $\|A{\bf x}-{\bf b}\|$ when ${\bf x}$ is the solution vector, is the norm of $\Sigma{\bf y}-{\bf c}$, since this vector is related to $A{\bf x}-{\bf b}$ by an orthogonal transformation (see equation (9)). In conclusion, the square of the residual is
|
||||
|
||||
$$\|A{\bf x}-{\bf b}\|^{2}=\|\Sigma{\bf y}-{\bf c}\|^{2}=\sum_{i=r+1}^{m}c_{i}^{2 }=\sum_{i=r+1}^{m}({\bf u}_{i}^{T}{\bf b})^{2}$$
|
||||
|
||||
which is the projection of the right-hand side vector ${\bf b}$ onto the complement of the range of $A$. $\Delta$
|
||||
|
||||
## 4 Least-Squares Solution of a Homogeneous Linear System
|
||||
|
||||
Theorem 3.1 works regardless of the value of the right-hand side vector ${\bf b}$. When ${\bf b}={\bf 0}$, that is, when the system is _homogeneous_, the solution is trivial: the minimum-norm solution to
|
||||
|
||||
$$A{\bf x}={\bf 0} \tag{10}$$
|
||||
|
||||
is
|
||||
|
||||
$${\bf x}=0\,$$
|
||||
|
||||
which happens to be an exact solution. Of course it is not necessarily the only one (any vector in the null space of $A$ is also a solution, by definition), but it is obviously the one with the smallest norm.
|
||||
|
||||
Thus, ${\bf x}=0$ is the minimum-norm solution to any homogeneous linear system. Although correct, this solution is not too interesting. In many applications, what is desired is a _nonzero_ vector ${\bf x}$ that satisfies the system (10) as well as possible. Without any constraints on ${\bf x}$, we would fall back to ${\bf x}=0$ again. For homogeneous linear systems, the meaning of a least-squares solution is therefore usually modified, once more, by imposing the constraint
|
||||
|
||||
$$\|{\bf x}\|=1$$
|
||||
|
||||
on the solution. Unfortunately, the resulting constrained minimization problem does not necessarily admit a _unique_ solution. The following theorem provides a recipe for finding this solution, and shows that there is in general a whole hypersphere of solutions.
|
||||
|
||||
**Theorem 4.1**: _Let_
|
||||
|
||||
$$A=U\Sigma V^{T}$$
|
||||
|
||||
_be the singular value decomposition of $A$. Furthermore, let ${\bf v}_{n-k+1},\ldots,{\bf v}_{n}$ be the $k$ columns of $V$ whose corresponding singular values are equal to the last singular value $\sigma_{n}$, that is, let $k$ be the largest integer such that_
|
||||
|
||||
$$\sigma_{n-k+1}=\ldots=\sigma_{n}\.$$
|
||||
|
||||
_Then, all vectors of the form_
|
||||
|
||||
$${\bf x}=\alpha_{1}{\bf v}_{n-k+1}+\ldots+\alpha_{k}{\bf v}_{n} \tag{11}$$
|
||||
|
||||
_with_
|
||||
|
||||
$$\alpha_{1}^{2}+\ldots+\alpha_{k}^{2}=1 \tag{12}$$_are unit-norm least squares solutions to the homogeneous linear system_
|
||||
|
||||
$$A{\bf x}={\bf 0},$$
|
||||
|
||||
_that is, they achieve the_
|
||||
|
||||
$$\min_{\|{\bf x}\|=1}\|A{\bf x}\|\.$$
|
||||
|
||||
Note: when $\sigma_{n}$ is greater than zero the most common case is $k=1$, since it is very unlikely that different singular values have _exactly_ the same numerical value. When $A$ is rank deficient, on the other hand, it may often have more than one singular value equal to zero. In any event, if $k=1$, then the unit-norm solution is unique up to a sign, ${\bf x}=\pm{\bf v}_{n}$. If $k>1$, the theorem above shows how to express _all_ solutions as a linear combination of the last $k$ columns of $V$.
|
||||
|
||||
**Proof.** The reasoning is very similar to that for the previous theorem. The unit-norm Least Squares solution to
|
||||
|
||||
$$A{\bf x}={\bf 0}$$
|
||||
|
||||
is the vector ${\bf x}$ with $\|{\bf x}\|=1$ that minimizes
|
||||
|
||||
$$\|A{\bf x}\|$$
|
||||
|
||||
that is,
|
||||
|
||||
$$\|U\Sigma V^{T}{\bf x}\|\.$$
|
||||
|
||||
Since orthogonal matrices do not change the norm of vectors they are applied to (theorem 1.2), this norm is the same as
|
||||
|
||||
$$\|\Sigma V^{T}{\bf x}\|$$
|
||||
|
||||
or, with ${\bf y}=V^{T}{\bf x}$,
|
||||
|
||||
$$\|\Sigma{\bf y}\|\.$$
|
||||
|
||||
Since $V$ is orthogonal, $\|{\bf x}\|=1$ translates to $\|{\bf y}\|=1$. We thus look for the unit-norm vector ${\bf y}$ that minimizes the norm (squared) of $\Sigma{\bf y}$, that is,
|
||||
|
||||
$$\sigma_{1}^{2}y_{1}^{2}+\ldots+\sigma_{n}^{2}y_{n}^{2}\.$$
|
||||
|
||||
This is obviously achieved by concentrating all the (unit) mass of ${\bf y}$ where the $\sigma$s are smallest, that is by letting
|
||||
|
||||
$$y_{1}=\ldots=y_{n-k}=0. \tag{13}$$
|
||||
|
||||
From ${\bf y}=V^{T}{\bf x}$ we obtain ${\bf x}=V{\bf y}=y_{1}{\bf v}_{1}+\ldots+y_{n}{\bf v}_{n}$, so that equation (13) is equivalent to equation (11) with $\alpha_{1}=y_{n-k+1},\ldots,\alpha_{k}=y_{n}$, and the unit-norm constraint on ${\bf y}$ yields equation (12). $\Delta$
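The recipe of Theorem 4.1 reduces, in the common case $k=1$, to taking the last right singular vector; the sketch below applies it to an assumed rank-deficient matrix built at random for illustration:

```python
import numpy as np

rng = np.random.default_rng(3)
A = rng.normal(size=(5, 2)) @ rng.normal(size=(2, 3))   # random 5 x 3 matrix of rank 2

U, s, Vt = np.linalg.svd(A)
x = Vt[-1]                               # right singular vector for the smallest sigma (here 0)
print(np.linalg.norm(x))                 # 1.0: unit-norm by construction
print(np.linalg.norm(A @ x))             # essentially 0: best unit-norm solution of A x = 0
```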
|
BIN
documents/pdfs/Chapter 1. The Euclidean Space.2016-2.pdf
Normal file
Binary file not shown.
BIN
documents/pdfs/linear_algebra_for_nn.pdf
Normal file
Binary file not shown.
|
@ -2,12 +2,14 @@ import gradio as gr
|
|||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
|
||||
|
||||
from backend.embeddings_manager import get_embedding_model, MODELS_DICT
|
||||
from backend.vector_db_manager import VectorDbManager
|
||||
from backend.inference import InferenceInstance
|
||||
from backend.inference import InferenceInstance, read_relevant_content
|
||||
from backend.pdf_to_mmd import pdf_to_mmd
|
||||
from backend.logger import Logger, read_logs
|
||||
|
||||
import time
|
||||
|
||||
|
||||
|
@ -35,14 +37,20 @@ def start_server():
|
|||
# Start the server
|
||||
start_server()
|
||||
|
||||
# Create VectorDbManager and Inference instance
|
||||
# Global variable etc...
|
||||
|
||||
embedding_func = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={'device': 'cuda'})
|
||||
base_db_directory = Path(r"../documents/vector_db")
|
||||
vector_db_manager = VectorDbManager(embedding_name="multilingual-e5-large", embedding_function=embedding_func, chunk_size=512, db_directory=base_db_directory)
|
||||
inference_instance = InferenceInstance(vector_db_manager=vector_db_manager, nb_chunks_retrieved=4)
|
||||
BASE_DB_DIRECTORY = Path(r"../documents/vector_db")
|
||||
|
||||
|
||||
def update_embedding(embedding_name):
|
||||
global BASE_DB_DIRECTORY, VECTOR_DB_MANAGER, INFERENCE_INSTANCE
|
||||
embedding_func = get_embedding_model(embedding_name)
|
||||
VECTOR_DB_MANAGER = VectorDbManager(embedding_name=embedding_name, embedding_function=embedding_func, chunk_size=512, db_directory=BASE_DB_DIRECTORY)
|
||||
INFERENCE_INSTANCE = InferenceInstance(vector_db_manager=VECTOR_DB_MANAGER, nb_chunks_retrieved=4)
|
||||
print(f"Updated embedding model to {embedding_name}")
|
||||
|
||||
|
||||
update_embedding("intfloat/multilingual-e5-large")
|
||||
user_message_global = ""
|
||||
|
||||
|
||||
|
@ -59,16 +67,16 @@ def bot(history):
|
|||
print(f"FOUND DOC_PATH {doc_path}")
|
||||
doc_extension = doc_path.split(".")[-1]
|
||||
if doc_extension == "mmd":
|
||||
vector_db_manager.create_vector_store_from_latex(Path(doc_path))
|
||||
VECTOR_DB_MANAGER.create_vector_store_from_latex(Path(doc_path))
|
||||
elif doc_extension == "pdf":
|
||||
vector_db_manager.create_vector_store_from_pdf(doc_path)
|
||||
VECTOR_DB_MANAGER.create_vector_store_from_pdf(doc_path)
|
||||
else:
|
||||
print(f"Unsupported extension: {doc_extension}")
|
||||
else:
|
||||
print("NOT FOUND DOC_PATH")
|
||||
|
||||
doc_name = Path(doc_path).stem + ".mmd" if math_checkbox.value else Path(doc_path).name
|
||||
bot_message = inference_instance.get_next_token(user_message_global, doc_name)
|
||||
bot_message = INFERENCE_INSTANCE.get_next_token(user_message_global, doc_name)
|
||||
history[-1][1] = ""
|
||||
for message in bot_message:
|
||||
history[-1][1] = message
|
||||
|
@ -84,7 +92,6 @@ def update_path(p, checked):
|
|||
stem = Path(p).stem
|
||||
if checked:
|
||||
if not (Path(r"../documents/mmds") / (stem + ".mmd")).exists():
|
||||
print(f"Converting {name} to MMD")
|
||||
pdf_to_mmd(r"../documents/pdfs/" + name)
|
||||
print(f"Selected DOC path: {stem}.mmd")
|
||||
doc_path = r"../documents/mmds/" + stem + ".mmd"
|
||||
|
@ -116,31 +123,46 @@ def pdf_viewer(pdf_file):
|
|||
|
||||
# Define main Gradio tab
|
||||
with gr.Blocks() as main_tab:
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
with gr.Column(scale=12):
|
||||
pdf_output = gr.HTML()
|
||||
with gr.Row():
|
||||
with gr.Column(scale=12):
|
||||
file_input = gr.File(label="Select a PDF file")
|
||||
math_checkbox = gr.Checkbox(label="Interpret as LaTeX (a latex version will be created then given to "
|
||||
"the chatbot, the conversion take some time)")
|
||||
with gr.Row():
|
||||
with gr.Column(scale = 3):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=12):
|
||||
pdf_output = gr.HTML()
|
||||
with gr.Row():
|
||||
with gr.Column(scale=12):
|
||||
file_input = gr.File(label="Select a PDF file")
|
||||
math_checkbox = gr.Checkbox(label="Interpret as LaTeX (a latex version will be created then given to "
|
||||
"the chatbot, the conversion take some time)")
|
||||
|
||||
with gr.Column():
|
||||
with gr.Group():
|
||||
chatbot = gr.Chatbot(scale=2,
|
||||
latex_delimiters=[{"left": "$$", "right": "$$", "display": True},
|
||||
{"left": "$", "right": "$", "display": False}])
|
||||
msg = gr.Textbox(label="User message", scale=2)
|
||||
with gr.Group():
|
||||
chatbot = gr.Chatbot(scale=2,
|
||||
latex_delimiters=[{"left": "$$", "right": "$$", "display": True},
|
||||
{"left": "$", "right": "$", "display": False}])
|
||||
msg = gr.Textbox(label="User message", scale=2)
|
||||
|
||||
msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
|
||||
bot, chatbot, chatbot
|
||||
)
|
||||
msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
|
||||
bot, chatbot, chatbot
|
||||
)
|
||||
|
||||
|
||||
|
||||
with gr.Column():
|
||||
references = gr.Markdown(label="References",
|
||||
latex_delimiters=[{"left": "$$", "right": "$$", "display": True},
|
||||
{"left": "$", "right": "$", "display": False}])
|
||||
main_tab.load(read_relevant_content, None, references, every=1)
|
||||
|
||||
file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
|
||||
file_input.upload(update_path, inputs=[file_input, math_checkbox])
|
||||
|
||||
|
||||
# Define the log tab
|
||||
with gr.Blocks() as log_tab:
|
||||
logs = gr.Textbox(lines=50, interactive=False)
|
||||
sys.stdout = Logger("../temp_file/output.log")
|
||||
log_tab.load(read_logs, None, logs, every=1)
|
||||
|
||||
|
||||
# Define options tab
|
||||
with gr.Blocks() as options_tab:
|
||||
with gr.Column():
|
||||
|
@ -148,6 +170,8 @@ with gr.Blocks() as options_tab:
|
|||
with gr.Column(scale=12):
|
||||
# TODO: Add options for the inference instance
|
||||
gr.Textbox(label="Options", scale=2)
|
||||
embedding_model_dropdown = gr.Dropdown(label="Embedding model", choices=list(MODELS_DICT.keys()), value="intfloat/multilingual-e5-large")
|
||||
embedding_model_dropdown.change(update_embedding, inputs=embedding_model_dropdown)
|
||||
|
||||
|
||||
# Define conversion tab
|
||||
|
@ -156,6 +180,8 @@ with gr.Blocks() as conversion_tab:
|
|||
file_input = gr.File(label="Select a PDF file to convert to MMD")
|
||||
html_output = gr.HTML(label="Output")
|
||||
|
||||
|
||||
|
||||
def upload_func(file_input):
|
||||
name = Path(file_input).name
|
||||
file_path = fr"../documents/pdfs/{name}"
|
||||
|
@ -165,6 +191,8 @@ with gr.Blocks() as conversion_tab:
|
|||
file_input.upload(upload_func, inputs=file_input)
|
||||
|
||||
|
||||
app = gr.TabbedInterface([main_tab, options_tab, conversion_tab], ["Main", "Options", "Conversion"])
|
||||
app = gr.TabbedInterface([main_tab, log_tab, options_tab, conversion_tab],
|
||||
["Main", "Logs", "Options", "Conversion"])
|
||||
app.queue()
|
||||
app.launch()
|
||||
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>PDF Viewer</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<!--
|
||||
Place the following <div> element where you want the PDF to be displayed in your website. You can change the size using the width and height attributes.
|
||||
-->
|
||||
<div>
|
||||
|
||||
<iframe
|
||||
src="C:\Users\CLEME\Pictures\corr_exam.pdf"
|
||||
width="500"
|
||||
height="678"
|
||||
>
|
||||
</iframe>
|
||||
</div>
|
||||
|
||||
</body>
|
|
@ -1,3 +1,47 @@
|
|||
import os
|
||||
print("---")
|
||||
print(os.getcwd())
|
||||
import gradio as gr
|
||||
import sys
|
||||
|
||||
|
||||
class Logger:
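"""Write everything sent to stdout both to the terminal and to a log file."""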
|
||||
def __init__(self, filename):
|
||||
self.terminal = sys.stdout
|
||||
self.log = open(filename, "w")
|
||||
|
||||
def write(self, message):
|
||||
self.terminal.write(message)
|
||||
self.log.write(message)
|
||||
|
||||
def flush(self):
|
||||
self.terminal.flush()
|
||||
self.log.flush()
|
||||
|
||||
def isatty(self):
|
||||
return False
|
||||
|
||||
|
||||
sys.stdout = Logger("../temp_file/output.log")
|
||||
|
||||
|
||||
def test(x):
|
||||
print("This is a test")
|
||||
print(f"Your function is running with input {x}...")
|
||||
return x
|
||||
|
||||
|
||||
def read_logs():
|
||||
sys.stdout.flush()
|
||||
with open("../temp_file/output.log", "r") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
with gr.Row():
|
||||
input = gr.Textbox()
|
||||
output = gr.Textbox()
|
||||
btn = gr.Button("Run")
|
||||
btn.click(test, input, output)
|
||||
|
||||
logs = gr.Textbox()
|
||||
demo.load(read_logs, None, logs, every=1)
|
||||
|
||||
demo.queue().launch()
|
26
temp_file/output.log
Normal file
|
@ -0,0 +1,26 @@
|
|||
Running on local URL: http://127.0.0.1:7860
|
||||
|
||||
To create a public link, set `share=True` in `launch()`.
|
||||
Updating path
|
||||
Selected DOC path: linear_algebra_for_nn.mmd
|
||||
FOUND DOC_PATH ../documents/mmds/linear_algebra_for_nn.mmd
|
||||
..\documents\vector_db\intfloat\multilingual-e5-large\linear_algebra_for_nn.mmd found, not recreating a vector store
|
||||
doc_name: linear_algebra_for_nn.mmd
|
||||
input_user: Explain the projection of one vector onto another vector
|
||||
search results: [Document(page_content='### Projection of one vector onto another vector\n\nThe (orthogonal) projection of vector $\\boldsymbol{x}$ on vector $\\boldsymbol{w}$ is defined as\n\n$$\\mathsf{proj}_{\\boldsymbol{w}}\\boldsymbol{x}=\\frac{\\boldsymbol{x}^{\\mskip-1.5mu \\mathsf{T}} \\boldsymbol{w}}{\\boldsymbol{w}^{\\mskip-1.5mu \\mathsf{T}}\\boldsymbol{w}} \\boldsymbol{w}=\\mathsf{cos}(\\boldsymbol{x},\\boldsymbol{w})\\times\\frac{\\| \\boldsymbol{x}\\|}{\\|\\boldsymbol{w}\\|}\\boldsymbol{w}. \\tag{3}$$'), Document(page_content='This technique can be extended to matrix and vector functions. It involves the notion of gradient and Hessian. Now a vector function $f\\left(\\boldsymbol{x}\\right)$ is expressed as:\n\n$$f\\left(\\boldsymbol{x}\\right)=f\\left(\\boldsymbol{a}\\right)+f\\left(\\boldsymbol{ x-a}\\right)^{\\mathsf{T}}\\boldsymbol{\\nabla}_{\\boldsymbol{f(a)}}+f\\left( \\boldsymbol{x-a}\\right)^{\\mathsf{T}}\\boldsymbol{\\nabla}_{\\boldsymbol{f(a)}}^ {2}f\\left(\\boldsymbol{x-a}\\right)+\\mathcal{R}_{2}. \\tag{31}$$'), Document(page_content='So the activation is proportional to the norm of the projection of the input vector onto the weight vector. The _response_ or _output_ of the cell is denoted $o$. For a _linear cell_, it is proportional to the activation (for convenience, assume that the proportionality constant is equal to $1$). _Linear heteroassociators_ and _autoassociators_ are made of linear cells. In general, the output of a cell is a _function_ (often, but not necessarily, continuous), called the _transfer function_, of its'), Document(page_content='The weight vector is corrected by moving it in the opposite direction of the gradient. This is obtained by adding a small vector denoted $\\boldsymbol{\\Delta}_{\\boldsymbol{w}}$ opposite to the gradient. This gives the following correction for iteration $n+1$:')]
|
||||
FOUND DOC_PATH ../documents/mmds/linear_algebra_for_nn.mmd
|
||||
..\documents\vector_db\intfloat\multilingual-e5-large\linear_algebra_for_nn.mmd found, not recreating a vector store
|
||||
doc_name: linear_algebra_for_nn.mmd
|
||||
input_user: Explain the projwx formula
|
||||
search results: [Document(page_content='The norm of $\\mathsf{proj}_{\\boldsymbol{w}}\\boldsymbol{x}$ is its distance to the origin of the space. It is equal to\n\n$$\\|\\mathsf{proj}_{\\boldsymbol{w}}\\boldsymbol{x}\\|=\\frac{|\\boldsymbol{x}^{ \\mskip-1.5mu \\mathsf{T}}\\boldsymbol{w}|}{\\|\\boldsymbol{w}\\|}=|\\mathsf{ cos}(\\boldsymbol{x},\\boldsymbol{y})|\\times\\|\\boldsymbol{x}\\|\\enspace. \\tag{4}$$'), Document(page_content='$\\boldsymbol{x}$, and $\\boldsymbol{w}$, the activation of the output cell is obtained as'), Document(page_content='$$\\boldsymbol{w}_{[n+1]}=\\boldsymbol{w}_{[n]}+\\boldsymbol{\\Delta}_{\\boldsymbol{ w}}=\\boldsymbol{w}_{[n]}-\\eta\\frac{\\partial e}{\\partial\\boldsymbol{w}}= \\boldsymbol{w}_{[n]}+\\eta(t-\\boldsymbol{w}^{\\mathsf{T}}\\boldsymbol{x}) \\boldsymbol{x}=\\boldsymbol{w}_{[n]}+\\eta(t-o)\\boldsymbol{x}. \\tag{35}$$\n\nThis gives the rule defined by Equation 9.'), Document(page_content='### Projection of one vector onto another vector\n\nThe (orthogonal) projection of vector $\\boldsymbol{x}$ on vector $\\boldsymbol{w}$ is defined as\n\n$$\\mathsf{proj}_{\\boldsymbol{w}}\\boldsymbol{x}=\\frac{\\boldsymbol{x}^{\\mskip-1.5mu \\mathsf{T}} \\boldsymbol{w}}{\\boldsymbol{w}^{\\mskip-1.5mu \\mathsf{T}}\\boldsymbol{w}} \\boldsymbol{w}=\\mathsf{cos}(\\boldsymbol{x},\\boldsymbol{w})\\times\\frac{\\| \\boldsymbol{x}\\|}{\\|\\boldsymbol{w}\\|}\\boldsymbol{w}. \\tag{3}$$')]
|
||||
FOUND DOC_PATH ../documents/mmds/linear_algebra_for_nn.mmd
|
||||
..\documents\vector_db\intfloat\multilingual-e5-large\linear_algebra_for_nn.mmd found, not recreating a vector store
|
||||
doc_name: linear_algebra_for_nn.mmd
|
||||
input_user: Write the Hessian matrix to flex your latex capacities
|
||||
search results: [Document(page_content='This technique can be extended to matrix and vector functions. It involves the notion of gradient and Hessian. Now a vector function $f\\left(\\boldsymbol{x}\\right)$ is expressed as:\n\n$$f\\left(\\boldsymbol{x}\\right)=f\\left(\\boldsymbol{a}\\right)+f\\left(\\boldsymbol{ x-a}\\right)^{\\mathsf{T}}\\boldsymbol{\\nabla}_{\\boldsymbol{f(a)}}+f\\left( \\boldsymbol{x-a}\\right)^{\\mathsf{T}}\\boldsymbol{\\nabla}_{\\boldsymbol{f(a)}}^ {2}f\\left(\\boldsymbol{x-a}\\right)+\\mathcal{R}_{2}. \\tag{31}$$'), Document(page_content='When a function is twice differentiable, the second order derivatives are stored in a matrix called the _Hessian_ matrix of the function. It is often denoted by $\\mathbf{H}$ or $\\mathbf{\\nabla}_{\\mathbf{f}}^{\\mathsf{2}}$ and is formally defined as'), Document(page_content="_Newton's method_ is a second order Taylor approximation, it uses the inverse of the Hessian of $\\boldsymbol{w}$ (supposing it exists). It gives a better numerical approximation but necessitates more computation. Here the correction for iteration $n+1$ is\n\n$$\\boldsymbol{w}_{[n+1]}=\\boldsymbol{w}_{[n]}+\\boldsymbol{\\Delta}=\\boldsymbol{w} _{[n]}-(\\boldsymbol{H}^{-1})(\\boldsymbol{\\nabla}_{\\boldsymbol{f(w)}}) \\tag{36}$$\n\n(where $\\boldsymbol{\\nabla}_{\\boldsymbol{f(w)}}$ is computed for $\\boldsymbol{w}_{[n]}$)."), Document(page_content='So the activation is proportional to the norm of the projection of the input vector onto the weight vector. The _response_ or _output_ of the cell is denoted $o$. For a _linear cell_, it is proportional to the activation (for convenience, assume that the proportionality constant is equal to $1$). _Linear heteroassociators_ and _autoassociators_ are made of linear cells. In general, the output of a cell is a _function_ (often, but not necessarily, continuous), called the _transfer function_, of its')]
|
||||
FOUND DOC_PATH ../documents/mmds/linear_algebra_for_nn.mmd
|
||||
..\documents\vector_db\intfloat\multilingual-e5-large\linear_algebra_for_nn.mmd found, not recreating a vector store
|
||||
doc_name: linear_algebra_for_nn.mmd
|
||||
input_user: sadly it doesn't render proprely ...
|
||||
search results: [Document(page_content='## References'), Document(page_content='from:\n\n**N.J., Smelter, & P.B., Baltes (Eds.) (2001).**\n\n**Encyclopedia of the Social and Behavioral Sciences.**\n\n**London: Elsevier Science.**\n\n**Article Title: Linear Algebra for Neural Networks**\n\n**By: Herve Abdi**\n\n**Author Address:** Herve Abdi, School of Human Development, MS: Gr.4.1, The University of Texas at Dallas, Richardson, TX 750833-0688, USA\n\n**Phone:** 972 883 2065, **fax:** 972 883 2491 **Date:** June 1, 2001\n\n**E-mail:** herve@utdallas.edu\n\n**Abstract**'), Document(page_content='$$o=f\\left(a\\right)\\enspace. \\tag{6}$$\n\nFor example, in _backpropagation networks_, the (nonlinear) transfer function is usually the logistic function\n\n$$o=f\\left(a\\right)=\\operatorname{logist}\\boldsymbol{w}^{\\mskip-1.5mu \\mathsf{T}} \\boldsymbol{x}=\\frac{1}{1+\\exp\\{-a\\}}\\enspace. \\tag{7}$$'), Document(page_content='$\\boldsymbol{x}$, and $\\boldsymbol{w}$, the activation of the output cell is obtained as')]
|
||||
Keyboard interruption in main thread... closing server.
|
54
temp_file/relevant_content.mmd
Normal file
|
@ -0,0 +1,54 @@
|
|||
**Relevant content viewed in the document**:
|
||||
|
||||
|
||||
|
||||
## References
|
||||
|
||||
-----------------------------------
|
||||
|
||||
**Relevant content viewed in the document**:
|
||||
|
||||
|
||||
|
||||
from:
|
||||
|
||||
**N.J., Smelter, & P.B., Baltes (Eds.) (2001).**
|
||||
|
||||
**Encyclopedia of the Social and Behavioral Sciences.**
|
||||
|
||||
**London: Elsevier Science.**
|
||||
|
||||
**Article Title: Linear Algebra for Neural Networks**
|
||||
|
||||
**By: Herve Abdi**
|
||||
|
||||
**Author Address:** Herve Abdi, School of Human Development, MS: Gr.4.1, The University of Texas at Dallas, Richardson, TX 750833-0688, USA
|
||||
|
||||
**Phone:** 972 883 2065, **fax:** 972 883 2491 **Date:** June 1, 2001
|
||||
|
||||
**E-mail:** herve@utdallas.edu
|
||||
|
||||
**Abstract**
|
||||
|
||||
-----------------------------------
|
||||
|
||||
**Relevant content viewed in the document**:
|
||||
|
||||
|
||||
|
||||
$$o=f\left(a\right)\enspace. \tag{6}$$
|
||||
|
||||
For example, in _backpropagation networks_, the (nonlinear) transfer function is usually the logistic function
|
||||
|
||||
$$o=f\left(a\right)=\operatorname{logist}\boldsymbol{w}^{\mskip-1.5mu \mathsf{T}} \boldsymbol{x}=\frac{1}{1+\exp\{-a\}}\enspace. \tag{7}$$
|
||||
|
||||
-----------------------------------
|
||||
|
||||
**Relevant content viewed in the document**:
|
||||
|
||||
|
||||
|
||||
$\boldsymbol{x}$, and $\boldsymbol{w}$, the activation of the output cell is obtained as
|
||||
|
||||
-----------------------------------
|
||||
|