% Universitat Politecnica de Catalunya
% Dept. of LSI
% Technical Report LSI-90-49
% November, 1990.
% Revised version, April, 1993.
 
\documentstyle[11pt, titlepage]{article}
 
\newcounter{savenumi}
\newenvironment{savenumerate}{\begin{enumerate}
\setcounter{enumi}{\value{savenumi}}}{\end{enumerate}
\setcounter{savenumi}{\value{enumi}}}
\newtheorem{theoremfoo}{Theorem}
\newenvironment{theorem}{\pagebreak[1]\begin{theoremfoo}}{\end{theoremfoo}}
\newtheorem{propositionfoo}[theoremfoo]{Proposition}
\newenvironment{proposition}{\pagebreak[1]\begin{propositionfoo}}
          {\end{propositionfoo}}
\newtheorem{lemmafoo}[theoremfoo]{Lemma}
\newenvironment{lemma}{\pagebreak[1]\begin{lemmafoo}}{\end{lemmafoo}}
\newtheorem{conjecturefoo}[theoremfoo]{Conjecture}
\newenvironment{conjecture}{\pagebreak[1]\begin{conjecturefoo}}
    {\end{conjecturefoo}}
\newtheorem{corollaryfoo}[theoremfoo]{Corollary}
\newenvironment{corollary}{\pagebreak[1]\begin{corollaryfoo}}
    {\end{corollaryfoo}}
\newtheorem{nttn}[theoremfoo]{Notation}
\newenvironment{notation}{\pagebreak[1]\begin{nttn}\rm}{\end{nttn}}
\newtheorem{dfntn}[theoremfoo]{Definition}
\newenvironment{definition}{\pagebreak[1]\begin{dfntn}\rm}{\end{dfntn}}
\newenvironment{proof}
    {\pagebreak[1]{\narrower\noindent {\bf Proof:\quad\nopagebreak}}}{\QED}
\renewcommand{\labelenumi}{\roman{enumi}.}
\newcommand{\yyskip}{\penalty-50\vskip 5pt plus 3pt minus 2pt}
\newcommand{\blackslug}{\hbox{\hskip 1pt
        \vrule width 4pt height 8pt depth 1.5pt\hskip 1pt}}
\newcommand{\QED}{{\penalty10000\parindent 0pt\penalty10000
        \hskip 8 pt\nolinebreak\blackslug\hfill\lower 8.5pt\null}
        \par\yyskip\pagebreak[1]}
 
\newcommand{\PH}{{\rm PH}}
\newcommand{\NP}{{\rm NP}}
\newcommand{\coNP}{\mbox{\rm co-{\rm NP}}}
\let\paragraph\P
\renewcommand{\P}{{\rm P}}
\newcommand{\PARITY}{\oplus}
\newcommand{\OR}{\vee}
\newcommand{\AND}{\wedge}
\newcommand{\implies}{\mathrel{\Rightarrow}}
\newcommand{\Iff}{\mathrel{\Leftrightarrow}}
\newcommand{\PARITYP}{\PARITY\space\space\P}
\newcommand{\PP}{{\rm PP}}
\newcommand{\pptoph}{\PP^{\PH}}
\newcommand{\pptophtoa}{\PP^{\PH^{A}}}
\newcommand{\tac}{\rm TAC}
\newcommand{\tacnul}{\tac^{(0)}}
\newcommand{\tacnulk}{\tac^{(0)}_k}
\newcommand{\montac}{\widehat {\tac}}
\newcommand{\tc} {\rm TC}
\newcommand{\tcnul}{\tc^{(0)}}
\newcommand{\montc}{\widehat {\rm TC}}
\newcommand{\montcnul}{\montc^{(0)}}
\newcommand{\montcnulk}{\montc^{(0)}_k}
\newcommand{\montcnulkplusone}{\montc^{(0)}_{k+1}}
\newcommand{\ac}{\rm AC}
\newcommand{\acnul}{\ac^{(0)}}
\newcommand{\notcontainedin} {\not\subseteq}
\newcommand{\notcontainedinexp} {\notcontainedin_{exp}}
\newcommand{\parity} {\rm parity}
\newcommand{\nat}{{\rm I\kern -.16em N}}
 
% These allow switching interline spacing.  The size changes ensure that the
% change takes effect immediately.
\makeatletter
\newcommand{\niceonespacing}
     {\let\CS=\@currsize\renewcommand{\baselinestretch}{11}\tiny\CS} 
\newcommand{\nicetwospacing}{\let\CS=\@currsize\renewcommand{\baselinestretch}
{1.2}\tiny\CS}
\newcommand{\nicethreespacing}{\let\CS=\@currsize\renewcommand{\baselinestretch}
{1.3}\tiny\CS}
\newcommand{\singlespacingplusplus}{\let\CS=\@currsize\renewcommand{\baselinestr
etch}{1.35}\tiny\CS}
\newcommand{\nicefivespacing}{\let\CS=\@currsize\renewcommand{\baselinestretch}{
1.5}\tiny\CS}
\newcommand{\nicesixpacing}{\let\CS=\@currsize\renewcommand{\baselinestretch}
{1.6}\tiny\CS}
\makeatother

\tolerance=2000
\textwidth 6.55in
\textheight 9.0in
\oddsidemargin 0.001in
\evensidemargin 0.001in
\topmargin -0.4in
 
\nicetwospacing

\sloppy
 
\begin{document}
\title{A Lower Bound for Monotone Perceptrons\\
              {\small (Revised Version)}}
\author{
Frederic Green  \\{\small Department of Mathematics and Computer Science}
                \\{\small Clark University}
                \\{\small Worcester, Massachusetts 01610}}
\date{}
\maketitle
 
\begin{abstract}
 It is proved that there is a monotone function in $\ac^{(0)}_4$ which
requires exponential size monotone perceptrons of depth 3. This solves the
monotone version of a problem which, in the general case, would imply an
oracle separation of $\PP^{\PH}$.\\
\end{abstract}
 
\section{Introduction}
 
  Recently perceptrons (see definition below) have received some renewed
attention because of their relevance to some important issues in
structural complexity theory. In \cite{Green} it was shown that perceptrons
cannot compute parity unless they are of exponential size
\footnote{In \cite{Green} perceptrons were referred to as $\pptoph$-circuits.
In \cite{BeiPPvsPH} the connection with perceptrons was first noted,
and we adopt this older terminology here.}. This has
the consequence that $(\exists A) (\PARITYP \notcontainedin \pptophtoa)$.
A result for perceptrons of depth 2 was used by Beigel
\cite{BeiPPvsPH} to provide an oracle $A$ such that $\P^{\NP^A} \notcontainedin
\PP^A$. Some interesting results for perceptrons were also obtained by Beigel,
Reingold and Spielman \cite{BeiPPclosed} as a consequence of the closure under
intersection of $\PP$. Further related results and extensions have been
obtained by Beigel, Reingold and Spielman~\cite{BRS}, Aspnes, Beigel, Furst
and Rudich~\cite{ABFR}, and Tarui~\cite{Tarui}.
 
  An interesting open question left by the work of \cite{Green} is whether
there is an oracle separating the hierarchy $\pptoph$. More precisely, is
there an oracle $A$ such that the following is true:
   $(\forall d) (\PP^{\Sigma^{p,A}_{d+1}} \notcontainedin
                 \PP^{\Sigma^{p,A}_{d}})$.
Indeed, in light of surprising results such as Toda's theorem \cite{Toda}
one might wonder
if it is possible that $\pptoph$ is a proper hierarchy in any relativized
world. One step in this direction was provided by \cite{BeiPPvsPH}, since
the result of that paper trivially implies an oracle $A$ such that
$\PP^{\NP^A} \notcontainedin \PP^A$. \cite{BeiPPvsPH} also leaves open
the possibility
that there is an oracle such that the separations between the levels
in $\pptoph$ are via levels of the polynomial hierarchy. That is, it is
possible that there is an oracle $A$ such that
   $(\forall d) ({\Sigma^{p,A}_{d+2}} \notcontainedin
                 \PP^{\Sigma^{p,A}_{d}})$
or even
   $(\forall d) ({\Delta^{p,A}_{d+2}} \notcontainedin
                 \PP^{\Sigma^{p,A}_{d}})$.
 
   In this paper we investigate a circuit problem which is motivated by
the above questions. The associated circuit problems are also of
intrinsic interest. We suspect that certain
$\acnul$ boolean functions of depth $d+1$ cannot be computed by even
exponentially large perceptrons of depth $d$. In fact this problem can
be reduced to a problem about circuits of a specific depth via the
H{\aa}stad switching lemma \cite{Hastad}. If we could show the result for
$d=3$, it then
follows for all $d > 3$ by applying that lemma. Lacking the techniques to
prove this in full generality, we concentrate here on monotone perceptrons.
Our main result is that certain monotone depth 4 $\acnul$ functions
cannot be computed by
exponentially large depth 3 {\em monotone} perceptrons. This result
represents a refinement of a result due to Yao~\cite{yao} (whose techniques are
also adopted). However to explain the relationship with Yao's work and
to describe our results more precisely, we must introduce
the basic concepts and notation. A threshold gate over the $N$ Boolean inputs
$x_1,...,x_N$ with weights $w_i, i=1,...,N$ and threshold $t$ is a Boolean gate
which outputs 1 iff $\sum_{i=1}^m w_i x_i \ge t$. Throughout this paper we
consider threshold circuits of bounded weight, that is with $w_i$ bounded from
above by a constant independent of $N$. A threshold gate is {\em monotone} if
all its weights are non-negative.
 
\begin{definition}
A {\em perceptron} is a circuit consisting of a single threshold gate whose
inputs are
the outputs of constant depth boolean circuits
over the basis \{ $\AND, \OR$, not \}. We refer to these subcircuits as the
{\em AND/OR subcircuits}. A perceptron is {\em monotone} if its threshold gate
is monotone and its AND/OR subcircuits contain no negations.
\end{definition}
\begin{definition}
The class of sets recognizable by perceptrons of polynomial size and depth
$k$ is denoted $\tac^{(0)}_{\em k}$, and the union over $k$ of these classes is
denoted $\tac^{(0)}$. The monotone versions of these classes are denoted
(respectively) $\montac^{(0)}_k$ and $\montac^{(0)}$.
\end{definition}
 
In general, we denote the monotone version of any circuit class $C$ by
${\widehat {C}}$.
When it is unambiguous, we use the same notation for a complexity
class and the associated set of circuits.
 
\begin{nttn}
Let $A$ and $B$ be classes of circuits. If there
is a function computable by circuits in class $A$ which requires
circuits in class $B$ of exponential size, we write
$A \notcontainedinexp B$.
\end{nttn}
 
Yao \cite{yao} has shown the following,
\begin{theorem}\label{le:yao-main}
{\rm {\bf (Yao)}} For all k, $\montcnulkplusone \notcontainedinexp \montcnulk$.
\end{theorem}
 
In fact this theorem was proved in \cite{yao} via the following stronger result,
\begin{theorem}\label{le:yao-main-2}
{\rm {\bf (Yao)}} For all k, ${\widehat {\ac}}^{(0)}_{2k} {\notcontainedinexp}
\montcnulk$.
\end{theorem}
 
Since ${\widehat {\ac}}^{(0)}_k \subseteq \montac^{(0)}_k$  and
      $\montac^{(0)}_k \subseteq \montcnulk$ for any k, the following is
immediate,
\begin{corollaryfoo}\label{le:yao-tac}
For all k, ${\widehat {\ac}}^{(0)}_{2k} {\notcontainedinexp} \montac^{(0)}_k$
and therefore
$\montac^{(0)}_{2k} {\notcontainedinexp} \montac^{(0)}_k$.
\end{corollaryfoo}
 
In this paper we sharpen Theorem~\ref{le:yao-main-2} and
Corollary~\ref{le:yao-tac} for $k=3$, that is, we show
\begin{theorem}\label{le:main-1} ${\widehat {\ac}}^{(0)}_4 {\notcontainedinexp}
\montac^{(0)}_3$ and therefore $\montac^{(0)}_4 {\notcontainedinexp}
\montac^{(0)}_3$.
\end{theorem}
 
     While this paper was being written, a stronger result due to
H{\aa}stad and Goldmann \cite{Hastad2} appeared, which states that for all $k$,
${\widehat {\ac}}^{(0)}_{k+1} \notcontainedinexp {\widehat {\tc}}^{(0)}_k$.
The results reported here were obtained independently and differ in some
important technical details. A discussion of H{\aa}stad and
Goldmann's results and their relationship with this paper is given at the end of
this section.
 
  To repeat our main motivation more precisely, if Theorem~\ref{le:main-1}
could be shown to hold in the {\sl non}-monotone case, i.e., if $\ac^{(0)}_4
{\notcontainedinexp} \tacnul_3$, then an application of the H{\aa}stad
switching lemma \cite{Hastad} would establish a separation of the hierarchy
$\pptoph$ relative to some oracle. Depth 4 versus depth 3 is critical, since it
would be the base case in such a proof. The switching lemma says that, applying
a random restriction to the inputs, the lowest levels of AND and OR can be
switched with high probability. Suppose for example we originally have AND's at
the second lowest level and OR's at the lowest level. Applying a random
restriction, with high probability we can then make any AND of OR's into an OR
of AND's and reduce the depth by absorbing one layer of OR's with the OR's
above it. It is clear that the minimum number of levels of AND's and OR's
to which we can reduce via this method is 2. Since
neither AND's nor OR's can be ``absorbed" into a threshold gate, the smallest
depth perceptron to which we can reduce consists of a threshold gate over depth
2 AND/OR subcircuits.
 
Observe that Corollary~\ref{le:yao-tac} does not immediately imply the second
clause of Theorem~\ref{le:main-1}. It is not clear that $\montac^{(0)}_{k+1}
\subseteq \montac^{(0)}_k$ implies $\montac^{(0)}_{2k} \subseteq
\montac^{(0)}_k$. This is because $\montac^{(0)}$ circuits are not uniform
on every level (the root is a threshold while lower levels consists of
AND's and OR's). For the same reason, $\montac^{(0)}_{2k} {\notcontainedinexp}
\montac^{(0)}_k$ is not known to imply $\montac^{(0)}_{k+1}
{\notcontainedinexp} \montac^{(0)}_k$, nor is this known in the
non-monotone case. Similarly it is not known if the collapse of $\PP^{\PH}$
translates upwards, i.e., if $\PP^{\Sigma^p_{d+1}} \subseteq
\PP^{\Sigma^p_{d}}$ implies $\pptoph \subseteq \PP^{\Sigma^p_{d}}$.
 
  The canonical AC$^{(0)}_k$ functions $f_{k,N}$ (see section 2 for
definitions), which we use for achieving the separations, are monotone. Upper
bounds for monotone perceptrons computing $f_{k,N}$ imply identical upper bounds
for non-monotone perceptrons computing arbitrary AC$^{(0)}_k$ languages. Hence it
makes sense to look for monotone simulations of $f_{k,N}$ before looking for
more general simulations of AC$^{(0)}_k$ functions. Our results show that small
monotone perceptrons computing these functions cannot exist.

  It is necessary to address a result due to Allender \cite{All}, which issues
a warning as to the significance of results about monotone
threshold circuits. Essentially due to the fact that Toda's theorem
\cite{Toda}
relativizes, Yao's result (Theorem~\ref{le:yao-main-2} above) does not carry
over to the
non-monotone case, at least not via exponential lower bounds. Allender showed
explicitly that any $\acnul$ circuit can be simulated by a depth-3
{\sl non}-monotone $\tcnul$-circuit of sub-exponential but superpolynomial size.
However, the situation with perceptrons may very well be
different. It seems quite possible that $\acnul$ is {\sl not} contained
in some specific level of $\tacnul$. Thus far there is evidence for this,
but only in the known lower bounds for $\tac^{(0)}_2$. Minsky and
Papert prove there is an $\ac^{(0)}_2$ function which cannot be computed by any
$\tac^{(0)}_2$-circuit with $\sqrt{N}/2$ bottom fan-in, regardless of circuit
size or weights (see \cite{MinPap}, section 3.2). Beigel~\cite{BeiPPvsPH}
sharpened this result for perceptrons with small weights. 

    We now describe our results in more detail and clarify the relationship
between this paper and the one of H{\aa}stad and Goldmann~\cite{Hastad2}.
H{\aa}stad and Goldmann prove that for all $k$, ${\widehat {\ac}}^{(0)}_{k+1}
\notcontainedinexp {\widehat {\tc}}^{(0)}_k$. They also adopt similar
techniques to Yao's, using a more detailed version of his inductive argument.
Their theorem clearly implies the main Theorem~\ref{le:main-1} as stated above.
However the  more precise statement given in
the main result, Theorem~\ref{le:main-lower-bound} (section 3), is not implied
by their results, and the techniques of the current paper take advantage of the
special circuits we have to deal with. Theorem~\ref{le:main-lower-bound} gives a
trade-off between the size of the AND/OR subcircuits and the number of these
subcircuits, and this leads to a lower bound which is closer to optimal. The
upper bound is $2^{(N+1)lgN}$. The lower bound of H{\aa}stad and Goldmann is
$2^{cN}$ while the one obtained here is $2^{cNlglgN}$ (see
Corollary~\ref{le:opt}). If the AND/OR subcircuits are sufficiently ``small"
(namely, $2^{N^{\epsilon}}$ for $\epsilon < 1$), our lower bound is $N^{\alpha
N}$ (see Corollary~\ref{le:N-to-the-epsilon}), which is much closer to
optimal.  For simpler circuits we obtain results that are optimal. In
Theorem~\ref{le:one-in-a-box} we present the optimal improvement of Minsky and
Papert's result on $\ac^{(0)}_2$ vs. $\tac^{(0)}_2$ in the monotone setting,
and Theorem~\ref{le:optimal-lower-bound} gives the optimal lower bound for
${\widehat{\ac}}^{(0)}_3$ vs. ${\widehat {\tac}}^{(0)}_2$.

    Thus in the context of perceptrons, the current result strengthens
quantitatively the main message of the H{\aa}stad and Goldmann paper. Both the
H{\aa}stad and Goldmann results and the current results show that in the
monotone setting, in some  circumstances threshold gates are no more powerful
than AND and OR gates.  H{\aa}stad and Goldmann show that to compute the
canonical depth $k$ AC$^{(0)}$ function we  need a depth $k-1$ monotone
threshold circuit of exponential size. We show here (at  least for $k=4$) that
to compute the canonical depth $k$ AC$^{(0)}$ function we need a depth $k-1$
monotone perceptron whose size is almost the same as what we would  need if the
single threshold gate were replaced by an OR gate.  In this
sense, the results obtained here explore the limits of Yao's techniques as
applied to perceptrons.

\section{Preliminaries}
   We define our notation and conventions and introduce additional necessary
concepts. Many of these are from
Yao \cite{yao}, sometimes in a slightly different form.
 
   Let $f_{k,N}(x_1, x_2,..., x_{N^k})$ denote the canonical
$\ac^{(0)}_{\em k}$-function.
The defining circuit for $f_{k,N}$ consists of an $\OR$ ($\AND$) as the top gate
if $k$ is even (odd), and $k-1$ alternating levels of $\AND$'s and $\OR$'s below
that. The fan-in of all gates in the defining circuit is $N$, so that the number
of inputs is $N^k$. More precisely, we define $f_{1,N}(x_1,...,x_N) =
\bigwedge \limits_{i=1}^N {x_i}$, and
\[ f_{k,N}(x_1,...,x_{N^k}) =   
      \left\{ \begin{array}{ll}
               \bigvee \limits_{i=1}^N {f_{k-1,N}(x_{(i-1)N^{k-1}+1},
                                             x_{(i-1)N^{k-1}+2},...,
                                             x_{(i-1)N^{k-1}+N^{k-1}})}
                        & \mbox{if $k$ is even}\\
               \bigwedge \limits_{i=1}^N {f_{k-1,N}(x_{(i-1)N^{k-1}+1},
                                             x_{(i-1)N^{k-1}+2},...,
                                             x_{(i-1)N^{k-1}+N^{k-1}})}
                        & \mbox{if $k$ is odd.}
                 \end{array}
         \right. \]
 Clearly for any $k$, $f_{k,N} \in {\widehat {\ac}}^{(0)}_k$.
 
  The notation $f_{k,N}$ will be used both for the function and the defining
circuit. Note that the defining circuits for $f_{k,N}$ are ``stratified",
that is, the output of any gate can only go to gates on the next highest level.
It will be useful to refer to individual gates in $f_{k,N}$, as
well as the set of inputs that can affect the output of that gate. Since it
will be of central importance, consider $f_{4,N}$. The ``level 1" gate is the
top ($\OR$) gate, the ``level 2" gates are the $\AND$-gates with wires leading
to the level 1 gate, and so forth. We refer to the set of inputs which can
affect a given gate on a given level as a ``block". For example, the first
level 2 $\AND$-block for $f_{4,N}$ consists of the inputs $\{x_1, x_2, ...,
x_{N^3}\}$, and the $l^{th}$ level 2 $\AND$-block consists of inputs
$\{x_{(l-1)N^3+1}, ..., x_{lN^3}\}$. Note that we only apply the terminology of
$\AND$ and $\OR$ blocks to the defining circuits of $f_{k,N}$, and not to
arbitrary circuits. See Figure 1 for an illustration of these concepts for
$f_{4,3}$.

The following definition is essentially
due to Yao, although he phrases it in terms of probability distributions.
 
\begin{definition}
   For any function $h : \{0,1\}^N \rightarrow \{0,1\}$, a pair of
sets $(p,q)$ is called a {\em separator} if
$p$ is a subset of $h^{-1}(1)$ and $q$ is a subset of
$h^{-1}(0)$.
\end{definition}
 
We use the same separators as in \cite{yao}. $(p_{k,N},q_{k,N})$ denotes
the separator for $f_{k,N}$. Let $(x_1,..., x_{N^k})$ denote the inputs to
$f_{k,N}$. $p_{1,N}$ consists of the single element
in which all of the $x_i$'s, $i \in \{1,...,N\}$ are $1$ (more formally,
$p_{1,N} = \{(1,...,1)\}$ where there are $N$ 1's). $q_{1,N}$ consists of all
settings of $(x_1,..., x_{N})$ in which exactly one of the $x_i$'s, $i \in
\{1,...,N\}$ is $1$ (i.e., $q_{1,N}$ consists of all input settings of the
form $(0,...,0,0,1,0,0,...,0)$). The remaining separators are defined
inductively.

\begin{itemize}
\item Even $k$: 
   Define $p_{k,N}$ to include all input settings of
the following form. Choose one of the level 2 $\AND$-blocks of $f_{k,N}$. Set
the inputs in that block according to an element of $p_{k-1,N}$, and set all
others to 0.

Define $q_{k,N}$ to include all input settings in which the inputs in
each level 2 $\AND$-block are set independently according to some
element of $q_{k-1,N}$. 

\item Odd $k$:
  Define $p_{k,N}$ to include all settings in which the inputs
in each level 2 $\OR$-block are set independently according to some
element of $p_{k-1,N}$.

   Define $q_{k,N}$ to include all input settings of the following
form. Choose one of the level 2 $\OR$-blocks of $f_{k,N}$. Set the inputs in
that block according to an element of $q_{k-1,N}$, and set all other inputs to
1.
\end{itemize}
 
   For any set $p$ of input settings and any boolean function
$h : \{0,1\}^N \rightarrow \{0,1\}$, $Pr_p(h(x)=1)$ denotes the probability
that $h(x)=1$ when the input setting $x$ is chosen randomly and uniformally
from the set $p$. We will always use the sets $p_{k,N}$ and $q_{k,N}$ in this
context, and we use the phrase ``choosing from $p$" to mean ``choosing an
element randomly and uniformally from the set $p$".
 
The $\epsilon$-discriminator method for proving lower bounds on threshold
circuits was introduced by Hajnal, Maass, Pudl{\'a}k, Szegedy and Tur{\'a}n
in~\cite{HMPST}. For the separator $(p,q)$ and a subcircuit $C$, define the
{\sl advantage} $\Delta$ of $C$ as $\Delta = Pr_p(C(x)=1) - Pr_q(C(x)=1)$.
If we can find a separator such that the advantage of any circuit of the same
type as $C$ is small, then it follows that a circuit consisting of a
threshold over subcircuits of this type must be large. This is stated
precisely in the following lemma, which is Lemma 3.3 in reference \cite{HMPST}.
It is valid for non-monotone as well as monotone circuits.

\begin{lemmafoo}\label{le:hmpst}\cite{HMPST}
  Let $T$ be a threshold circuit with subcircuits $C_1, C_2, ..., C_m$, so
that $T$ outputs 1 if and only if
       $\sum \limits_{i=1}^m {C_i(x)} \ge t$ for some threshold t.
Let $T$ compute the boolean function $f : \{0,1\}^n \rightarrow \{0,1\}$,
and let
$(p,q)$ be a separator for $f$. Let $\Delta_i = Pr_p(C_i(x)=1) -
Pr_q(C_i(x)=1)$. Then if $\Delta_i \le s^{-1}$ for all $i \in \{1,...,m\}$,
then $m \ge s$.
\end{lemmafoo}
 
The intuitive reason for choosing
the given separator is as follows. $p_{k,N}$
is chosen so that the fewest possible inputs are 1 when $f_{k,N}$ is 1.
$q_{k,N}$ is chosen such that the fewest number of inputs are
0 when $f_{k,N}$ is 0. By choosing $p_{k,N}$ in this way we tend to make
$Pr_p(C_i(x)=1)$ small, and by choosing $q_{k,N}$ as we have, we tend to
make $Pr_q(C_i(x)=1)$ large. Putting the two together conspires to make
$\Delta_i$ as small as possible. This strategy makes essential use of the
monotonicity of the perceptron.
 
\section{Main Results}
     In this section we prove Theorem~\ref{le:main-1},
which asserted that  ${\widehat {\ac}}^{(0)}_4 {\notcontainedinexp}
\montac^{(0)}_3$. We first present two simpler results which illustrate the
proof technique and are also interesting in themselves. The first one says that
in any $\montac^{(0)}_2$ circuit which computes $f_{2,N}$, the subcircuits
(which are single AND or OR gates) must have fan-in $N$ and the circuit size
must be $N$, which are both optimal. In the non-monotone case, as stated in the
introduction, the best known lower bound on the fan-in is $\sqrt{N}/2$, as
proved by Minsky and Papert. It would be interesting to see if the optimal
bound in the monotone case extends to the non-monotone case.

\begin{theorem}\label{le:one-in-a-box}
Let $T$ be a  $\montac^{(0)}_2$ circuit
which computes $f_{2,N}$. Then the fan-in of at least one of the $\AND$ or
$\OR$ gates of $T$ must be at least $N$ and $T$ must consist of at least $N$
gates.
\end{theorem}
\begin{proof}
 Each subcircuit (in this case, a single gate) of $T$
can be either an $\OR$, in which case we will refer to it as $D$,
or an $\AND$, in which case we will refer to it as $C$. For notational
convenience, we drop the $N$ subscript on the seperators, that is, we denote
$p_{k,N}$ and $q_{k,N}$ by $p_k$ and $q_k$ respectively. For any $\AND$ or
$\OR$ gate $G$ of $T$, define $\Delta = Pr_{p_2}(G=1) - Pr_{q_2}(G=1)$. By
Lemma~\ref{le:hmpst}, if $T$ has finitely many gates, for some gate $G$ we
must have $\Delta > 0$. This gate may be an $\OR$ or an $\AND$. We consider
these two possibilities separately.
 
Case 1 (OR): Let $\Delta = Pr_{p_2}(D=1) - Pr_{q_2}(D=1)$. Suppose that $\Delta
> 0$. Observe that $Pr_{q_2}(D=1)=1$ (and therefore $\Delta \le 0$) unless $D$
is of a special form: It has at most one input from each level 2
$\AND$-block. Thus we may suppose $D$ is of this form, and that it gets its
inputs from $s$ $\AND$-blocks. Note that
since there are $N$ $\AND$-blocks, it follows that $s \le N$. Choosing from
$q_2$, the probability that an input from a given level 2 $\AND$-block is zero
is ${1 \over N}$. Since the assignments to level 2 $\AND$-blocks are independent
in $q_2$, it follows that
  $Pr_{q_2}(D=0) = {1 \over N^s}$
and therefore,
  $$Pr_{q_2}(D=1) = 1 - {1 \over N^s}.$$
Now choosing from $p_2$, the probability that an input from a given level 2
$\AND$-block is 1 is ${1 \over N}$. Since $s$ such blocks
contribute to $D$, this implies that $Pr_{p_2}(D=1) \le {s \over N}$. Combining
these observations, we have,
$$ \Delta \le {s \over N} + {1 \over N^s} - 1.$$
For any fixed $N$, the quantity ${s \over N} + {1 \over N^s}$ is increasing for
$1 \le s \le N$. If $s = N-1$, note that ${s \over N} + {1 \over N^s} = 1 - {1
\over N} + {1 \over N^{N-1}}$, which implies that $\Delta < 0$. Thus if
$\Delta > 0$, it follows that $s \ge N$ and therefore $s=N$, as claimed. Note
that this also implies that $\Delta \le N^{-N} \le {1 \over N}$.

Case 2 (AND): Let $C$ denote the $\AND$ gate. Define $\Delta = Pr_{p_2}(C=1)
- Pr_{q_2}(C=1)$, and again assume $\Delta > 0$. Now
$Pr_{p_2}(C=1) = 0$ (and hence $\Delta \le 0$) unless $C$ is of a special
form: All of its inputs can come from at most one $\AND$-block.
Suppose $C$ is of this form and that the fan-in of $C$ is $s$. Again, $s \le
N$. The probability that $C=1$ choosing from $p_2$ is independent of $s$:
$$ Pr_{p_2}(C=1) \le {1
\over N}.$$ For this same $C$, it is
easy to see that $Pr_{q_2}(C=1) = 1 - {s \over N}$. Therefore,
  $$ \Delta \le {1 \over N} - (1 - {s \over N}).$$
Similar to case 1, we find that if $\Delta > 0$, then $s=N$, as claimed. Note
that again we have established that $\Delta \le {1 \over N}$.

In both cases, we have the inequality $\Delta \le {1 \over N}$, which, by
Lemma~\ref{le:hmpst}, implies that the size of the circuit must be at least
$N$.
\end{proof}

   We further illustrate the technique of the main theorem by proving optimal
lower bounds for $\montac^{(0)}_2$ versus ${\widehat {\ac}}^{(0)}_3$. The proof
is simpler in detail but follows the same broad outline as the proof of
Theorem~\ref{le:main-lower-bound} which follows. The $N^N$ lower bound is
optimal because in the defining circuit for $f_{3,N}$ we can use the
distributive law to switch the two upper AND and OR levels, yielding $N^N$ AND
gates, and then collapse the lower two levels of AND's, to yield an OR of $N^N$
AND's each of fan-in $N^2$.

\begin{theorem}\label{le:optimal-lower-bound}
 If $N > 2$, any depth 2 monotone perceptron $T$ computing $f_{3,N}(x)$
requires $N^N$ gates.
\end{theorem}
\begin{proof}
   We use the same notations and conventions as in the previous proof.
  We consider each case for a subcircuit of $T$ ($\OR$, which we denote as $D$,
or $\AND$, which we refer to as $C$) separately. Since in the defining circuit
for $f_{3,N}$ the only $\OR$'s are at level 2, we lose no precision in
referring to the set of inputs that affect these gates as $\OR$-blocks.
Defining $\Delta = Pr_{p_3}(G=1) - Pr_{q_3}(G=1)$, it
suffices, by Lemma~\ref{le:hmpst}, to show that $\Delta \le N^{-N}$. 
 
Case 1 (OR): Let $\Delta = Pr_{p_3}(D=1) - Pr_{q_3}(D=1)$. Observe
that $Pr_{q_3}(D=1)=1$ (and therefore $\Delta \le 0$) unless $D$ is of a
special form: All of its inputs come from at most one $\OR$-block, and
furthermore there can be at most one input from each level 3 $\AND$-block.
Suppose $D$ is of this form, and that its inputs come from $s$
$\AND$-blocks ($s \le N$). For such a gate, it is not hard to see that
  $$Pr_{q_3}(D=0) = {1 \over N} \cdot {1 \over N^s} = {1 \over N^{s+1}}$$
and therefore,
  $$Pr_{q_3}(D=1) = 1 - {1 \over N^{s+1}}.$$
Now choosing from $p_3$, the probability that there are inputs from a given
level 3 $\AND$-block that are 1 is ${1 \over N}$. Since $s$ such blocks
contribute to $D$, this implies that $Pr_{p_3}(D=1) \le {s \over N}$. Combining
these observations, we have,
$$ \Delta \le {s \over N} + {1 \over N^{s+1}} - 1.$$
Arguing exactly as in case 1 of Theorem~\ref{le:one-in-a-box}, if
$\Delta \ge 0$, it follows that $s = N$.  Hence, $\Delta \le N^{-N-1} <
N^{-N}$, as claimed.

Case 2 (AND): Let $C$ denote the $\AND$ gate. Let $\Delta = Pr_{p_3}(C=1)
- Pr_{q_3}(C=1)$. Now
$Pr_{p_3}(C=1) = 0$ (and hence $\Delta \le 0$) unless $C$ has inputs from at
most one level 3 $\AND$-block of any $\OR$-block. Suppose $C$ is of this form
and that the fan-in of $C$ is $s$. Then,
$$ Pr_{p_3}(C=1) \le {1 \over N^s}.$$
For this same $C$, it is easy to see that $Pr_{q_3}(C=1) \ge 1 - {s \over N}$.
Therefore,
  $$ \Delta \le {1 \over N^s} - (1 - {s \over N}).$$
Similar to case 1, we find that if $\Delta \ge 0$, then $s \ge N$. Hence $\Delta
\le N^{-N}$, as claimed.
\end{proof}

   As in the proof of Theorem~\ref{le:one-in-a-box}, a crucial result in the
proof of the previous theorem is the lower bound  $N$ on the fan-in $s$ of the
$\AND$ or $\OR$-gates of the perceptron. It should be noted that the lower
bound on fan-in is tight if the perceptron is a threshold of $\OR$'s, although
in that case the bound on size is not tight since then the optimal bound would
be $N^{N+1}$, as demonstrated in case 1 of the proof of
Theorem~\ref{le:optimal-lower-bound}. Similarly, the lower bound on size is
tight if the perceptron is a threshold of $\AND$'s, although in that case the
bound on fan-in is not tight since then the optimal bound would be $N^2$.
Again it would be interesting to see if the Minsky and Papert lower bound on the
fan-in, $\sqrt{N}/2$, can be improved to $N$ in the non-monotone case.

   We now turn to the main result.
 We derive a trade-off between the size of the AND/OR
subcircuits and the number of such subcircuits. We first give the lower bound
on the number of gates when the maximum size of an AND/OR subcircuit is given
by any function $u(N)$ bounded from above by $N^{const \cdot N}$. Later we
consider various possibilities for $u(N)$.

\begin{theorem}\label{le:main-lower-bound}
Let $0 < \delta < 1$ be a constant and $u:\nat \rightarrow \nat$
be any function such that
$u(N) \le N^{\delta N}$ for all $N$. There exist constants $N_0 > 0$ and
$0 < \beta < 1$ such that the following is true. Let $T$ be
any depth 3 perceptron which computes $f_{4,N}(x)$ where $N \ge N_0$ and the
largest of the AND/OR subcircuits of $T$ has 
 $u(N)$ gates. Then the number of AND/OR subcircuits in $T$ is at least
$$min(N^{\beta N}, ({Nlg(N) \over lg(N u(N))})^{\beta N}).$$  
\end{theorem}
\begin{proof}
   Let $N_0$ be chosen such that $N_0^{(1-\delta)N_0} > N_0^3$ and let $\beta$
be such that $\beta + \delta \le 1-{2 \over N_0}$. Assume throughout the
following that $N > N_0$.  Each subcircuit of $T$
can be either an $\OR$-of-$\AND$'s, in which case we refer to it as $D$,
or an $\AND$-of-$\OR$'s, in which case we refer to it as $C$. We
consider each of these cases separately. For any subcircuit $G$ of $T$, let
$\Delta = Pr_{p_4}(G=1) - Pr_{q_4}(G=1)$.  By Lemma~\ref{le:hmpst}, it suffices
to show that either $\Delta \le N^{-\beta N}$ or
 $\Delta \le ({ lg(N u(N)) \over Nlg(N)})^{\beta N}$. Also as in the proofs of
Theorems~\ref{le:one-in-a-box} and~\ref{le:optimal-lower-bound}, drop the $N$
subscript on the separators, denoting $p_{k,N}$ and $q_{k,N}$ by $p_k$ and $q_k$
respectively.
 
Case 1 (OR-OF-AND's): In this case $\Delta = Pr_{p_4}(D=1) - Pr_{q_4}(D=1)$.
Let the $\AND$-gates of $D$ be denoted $C_i$. Let $c \le N$ be
the number of level 2 $\AND$-blocks from which $D$ takes its inputs. Let $D_l$
($l \in \{1,...,c\}$) denote the function obtained from $D$ when we restrict all
inputs in $f_4$ to be 0 except those in level 2 $\AND$-block $l$. It is clear
that for input settings in $p_4$, $D=1 \Iff {\bigvee \limits_{l=1}^c}{(D_l=1)}$.
We thus have
$$Pr_{p_4}(D=1)  = Pr_{p_4}({\bigvee \limits_{l=1}^c}{(D_l=1)}).$$
Using the fact that the $D_l$'s are mutally exclusive and the fact
that $Pr_{p_4}(D_l=1) = {1 \over N}Pr_{p_3}(D_l=1)$, this implies,
\begin{eqnarray*}
Pr_{p_4}(D=1) & = & {\sum \limits_{l=1}^c}{ Pr_{p_4}(D_l=1)}\\
              & = & {1 \over N}{\sum \limits_{l=1}^c}{ Pr_{p_3}(D_l=1)}
\end{eqnarray*}
Now ${\bigvee \limits_{l=1}^c}{(D_l=1)} \implies D=1$
because $D$ is monotone. Hence $Pr_{q_4}({\bigvee \limits_{l=1}^c}{(D_l=1)})
\le Pr_{q_4}(D=1)$, so that we have
\begin{eqnarray*}
Pr_{q_4}(D=1) & \ge & 1 - Pr_{q_4}({\bigwedge \limits_{l=1}^c}{(D_l=0)})\\
              & = & 1 - {\prod \limits_{l=1}^c}{Pr_{q_3}(D_l=0)}
\end{eqnarray*}
where the equality holds because the $D_l$'s are independent in
$q_4$ (for each $l$, $D_l$ depends only on inputs from a level 2
$\AND$-block, and in $q_4$ these inputs are assigned independently according to
$q_3$). Thus we find that
\begin{eqnarray}\label{le:delta-inequality}
  \Delta \le {1 \over N}{\sum \limits_{l=1}^c}{ Pr_{p_3}(D_l=1)} +
             {\prod \limits_{l=1}^c}{Pr_{q_3}(D_l=0)} - 1.
\end{eqnarray}
Let $C^l_i$ denote the function that $C_i$ computes when all inputs
except those in level 2 $\AND$-block $l$ are 0. Like $C_i$, $C^l_i$ is also an
$\AND$-gate. Fix an $l$ and consider assignments to level 2
$\AND$-block $l$ chosen from $p_3$. Observe that the only way in which we can
have $Pr_{p_3}(C^l_i=1) \ne 0$ is that in which the set of inputs to $C^l_i$ is
a subset of a level 2 $\AND$-block, and furthermore the subset can be
partitioned into groups of inputs, each group from at most one of the level 4
$\AND$-blocks of a level 3 $\OR$-block of $f_4$. Suppose $Pr_{p_3}(C^l_i=1) \ne
0$ and that $C^l_i$ has inputs from exactly $k^l_i$ such $\AND$-blocks.
(Refer to Figure 2 for clarification.) The
probability that any one of these $\AND$-blocks is on is $1 \over N$, and hence
for any $C^l_i$ such that $Pr_{p_3}(C^l_i=1) \ne 0$, we have,
\begin{eqnarray*}
Pr_{p_3}(C^l_i = 1) = {1 \over N^{k^l_i}}.
\end{eqnarray*}
 Let $s_l$ denote the
number of $C^l_i$'s, i.e., $s_l$ is the fan-in of $D_l$. Thus
\begin{eqnarray*}
Pr_{p_3}(D_l=1) & = & Pr_{p_3}({\bigvee \limits_{i=1}^{s_l}}{(C^l_i=1)})\\
        & \le & {\sum \limits_{i=1}^{s_l}}{Pr_{p_3}(C^l_i=1)}\\
        & \le & {\sum \limits_{i=1}^{s_l}}{1 \over N^{k^l_i}}\\
        & \le & {s_l \over N^{k^l_{min}}}
\end{eqnarray*}
where $k^l_{min} = {\rm min}_i ({k^l_i})$.
 By hypothesis, $s_l \le u(N)$. Therefore,
\begin{eqnarray*}
Pr_{p_3}(D_l=1) \le {u(N) \over N^{k^l_{min}}}.
\end{eqnarray*} 
Then since $c \le N$, we find
\begin{eqnarray}\label{le:star}
{1 \over N} {\sum \limits_{l=1}^c}{Pr_{p_3}(D_l=1)} \le
                               {\rm max}_l Pr_{p_3}(D_l=1)
        \le  {u(N) \over N^{k_m}}
\end{eqnarray}
where $k_m = {\rm min}_l ({k^l_{min}})$. Now for any $l$ and $i$,
$Pr_{q_3}(D_l=0) \le Pr_{q_3}(C^l_i=0)$. For any assignment in $q_3$, $C^l_i$
can be 0 only if one of its inputs from a level 4 $\AND$-block is 0. Thus for
any $i$ such that $Pr_{p_3}(C^l_i=1) \not = 0$, we have $Pr_{q_3}(C^l_i=0) \le
{k^l_i \over N}$.
Therefore
\begin{eqnarray}\label{le:starstar}
\prod \limits_{l=1}^c
Pr_{q_3}(D_l=0) \le {k_m \over N}.
\end{eqnarray}
Putting this together with
eq.~\ref{le:star} and eq.~\ref{le:delta-inequality},
\begin{eqnarray}\label{le:delta-inequality2}
   \Delta \le {u(N) \over N^{k_m}} + {k_m \over N} - 1.
\end{eqnarray}

  Consider two possibilities for the size of $k_m$. First suppose $k_m \ge
{lg(u(N)) \over lg(N)} + 1$. (Note that since $k^l_{min}$ is defined only
if $Pr_{p_3}(D_l=1) \ne 0$, the inequality $k_m \ge {lg(u(N)) \over lg(N)} +
1$ really means that for all $l$, either $Pr_{p_3}(D_l=1)=0$ or $k^l_{min}
\ge {lg(u(N)) \over lg(N)} + 1$.) Then ${lg(u(N))
\over lg(N)} + s \le k_m \le {lg(u(N)) \over lg(N)} + s + 1$, for some integer
$s$ with $1 \le s \le N-{lg(u(N)) \over lg(N)}$. Then by
eq.~\ref{le:delta-inequality2},
\begin{eqnarray*}
    \Delta \le {1 \over N^s} + {1 \over N} \left({lg(u(N)) \over lg(N)} +s+1
                                           \right) - 1.
\end{eqnarray*}
If $\Delta \ge 0$ it follows that,
   ${1\over N^s} \ge 1-{1\over N}({lg(u(N)) \over lg(N)}+s+1)$,
which, by the upper bound on $u(N)$, yields
\begin{eqnarray*}
   {1 \over N^s} \ge 1 - \delta - {s+1 \over N}.
\end{eqnarray*}
If $s = (1-\delta)N -2$, the above inequality
implies $N^{(1-\delta)N} \le N^3$, which contradicts the choice of $N$.
Again, as noted in the proof of Theorem~\ref{le:one-in-a-box},
${1 \over N^s} +
{s \over N}$ is an increasing function of $s$ for $1 \le s \le N$, and
therefore $s \ge (1-\delta)N - 1$, which implies $s \ge \beta N$, by the choice
of $\beta$. Therefore, $$ \Delta \le {1 \over N^s} \le N^{-\beta N},$$ as
desired.

  Now consider the case $k_m < {lg(u(N)) \over lg(N)} + 1$. We show that
$\Delta \le ({lg(N u(N)) \over Nlg(N)})^{\beta N}$. For suppose that for
$\sigma$
 values of $l$ ($1\le \sigma \le N)$, $Pr_{p_3}(D_l=1) \ne 0$ and $k^l_{min} <
{lg(u(N)) \over lg(N)} + 1 = {lg(Nu(N)) \over lg(N)}$. Wlog suppose this is true
for $1 \le l \le \sigma$. The best upper bound we can put on
$Pr_{p_3}(D_l=1)$ for $1 \le l \le \sigma$ is 1.
For the remaining $N-\sigma$ values of $l$,
$$ Pr_{p_3}(D_l=1) \le {u(N) \over N^{k^l_{min}}} \le {u(N) \over N^{{lg(u(N))
\over lg(N)}+1}} = {1\over N}.$$
Therefore,
$$ {1 \over N} {\sum \limits_{l=1}^c}{Pr_{p_3}(D_l=1)}  \le 
      {\sigma \over N} + {N - \sigma \over N^2}.$$
Combining these observations with eqs.~\ref{le:delta-inequality}
and~\ref{le:starstar}, we find that
\begin{eqnarray*}
\Delta & \le & {\sigma \over N} + {N-\sigma \over N^2} +
          {lg(u(N)) \over N lg(N)} + {1 \over N} - 1\\
       & \le & {1 \over N}(1-{1\over N})\sigma +
          {1 \over N}(2 + \delta N - N).
\end{eqnarray*}
Thus if $\Delta \ge 0$, it follows that,
$$ \sigma \ge (1-\delta)N - 2.$$
Since $c \ge \sigma$, applying eq.~\ref{le:delta-inequality} again we then have,
\begin{eqnarray*}
   \Delta & \le & \prod \limits_{l=1}^c Pr_{q_3}(D_l=0)\\
          & \le & \prod \limits_{l=1}^{\sigma} {\mbox {\rm min}}_i
                                     Pr_{q_3}(C^l_i = 0)\\
          & \le & \prod \limits_{l=1}^{\sigma} {k^l_{min} \over N}\\
          & \le & \left( {lg(N u(N)) \over Nlg(N)} \right)^{\sigma}\\
          & \le & \left( {lg(N u(N)) \over Nlg(N)} \right)^{(1-\delta)N-2}.
\end{eqnarray*}
Using the choice of $\beta$, this implies that
$\Delta \le \left( {lg(N u(N)) \over Nlg(N)} \right)^{\beta N}$, as desired.
(End case 1).
 
  Case 2 (AND-OF-OR'S): Similar to case 1, we let
$\Delta = Pr_{p_4}(C=1) - Pr_{q_4}(C=1)$.
Let the $\OR$-gates of $C$ be denoted $D_i$. Let $C_l$
denote the function obtained from $C$ when we restrict all inputs in
$f_4$ to be 0 except those in $\AND$-block $l$ ($l \in \{1,...,c\}$, where
$c \le N$ is
the number of level 2 $\AND$-blocks from which $C$ takes its inputs).
The same considerations as in case 1 yield
\begin{eqnarray*}
Pr_{p_4}(C=1) & = & Pr_{p_4}({\bigvee \limits_{l=1}^c}{(C_l=1)})\\
              & = & {\sum \limits_{l=1}^c}{ Pr_{p_4}(C_l=1)}\\
              & = & {1 \over N}{\sum \limits_{l=1}^c}{ Pr_{p_3}(C_l=1)}\\
              & \le & {\rm max}_l (Pr_{p_3}(C_l=1)).
\end{eqnarray*}
(Note that the final inequality is the first departure with the argument
of case 1.) Denote the $l$ for which $Pr_{p_3}(C_l=1)$ is a maximum by
$l'$. Thus $Pr_{p_4}(C=1) \le Pr_{p_3}(C^{l'}=1)$. For the quantity
$Pr_{q_4}(C=1)$ we again follow an argument similar to that of case 1
to obtain,
\begin{eqnarray*}
  \Delta & \le & Pr_{p_3}(C^{l'}=1) + {\prod \limits_{l=1}^c}{Pr_{q_3}(C_l=0)}
                              - 1\\
         & \le & Pr_{p_3}(C^{l'}=1) + Pr_{q_3}(C^{l'}=0) - 1\\
         & \le & Pr_{p_3}(C^{l'}=1) - Pr_{q_3}(C^{l'}=1)
\end{eqnarray*}
(The alert reader will wonder why a similar strategy was not followed in
case 1, since it appears to be simpler here. The reason is that the
product in the second term on the LHS of eq.~\ref{le:delta-inequality} is
crucial to the argument in case 1, but not here.)
 
  A dual version of the argument for case 1, now simplified since we are
using the distribution $p_3$, gives the desired bound on $\Delta$.
(End case 2).
\end{proof}
 
The restriction $N^{\delta N}$ on the size of the AND/OR subcircuits in
Theorem~\ref{le:main-lower-bound} is ``optimal" in the following sense.
The proof would not
work if $\delta = 1$. Allowing $\delta = 1$ means we allow the $\AND$
and $\OR$ gates of $T$ to have fan-in $N^N$ (that is, we may take $u(N)=N^N$).
This fan-in is just large enough that the circuit $T$ could compute $f_4$: simply
expand the lowest two levels of $f_4$ (this gives $N^N$ gates for each
$\OR$-gate expanded), collapse the resulting two adjacent levels of
$\AND$'s, and set the threshold to 1 (so the threshold gate is an $\OR$-gate).

  If the upper bound on $u(N)$ is realized then trivially we have circuits
whose size is close to optimal (that is, $N^{\delta N}$). This is also true if
$u(N)$ is sufficiently small.

\begin{corollaryfoo}\label{le:N-to-the-epsilon}
   Let $\epsilon$ be any constant such that $0 < \epsilon < 1$. There exist
constants $0 < \alpha < 1$ and $N_0$ such that for any $N > N_0$, if T is any
depth 3 perceptron which computes $f_{4,N}$ and the size of the AND/OR 
subcircuits of T is at most $2^{N^{\epsilon}}$, the size of T is at least
$N^{\alpha N}$.
\end{corollaryfoo}
\begin{proof}
   In Theorem~\ref{le:main-lower-bound}, consider any $u(N) \le
2^{N^{\epsilon}}$. It is easy to see, for some $0 < \epsilon' < 1$ and for
sufficiently large $N$, it holds that
$$ \left( {N lg(N) \over lg(N u(N)) } \right) \ge N^{\epsilon'}.$$
This implies that the number of gates of T is at least $N^{\epsilon' \beta
N}$, from which the result follows by taking $\alpha = \epsilon' \beta$.
\end{proof}

 For intermediate subcircuit sizes (that is, between
$2^{N^{\epsilon}}$ and $N^{\delta N}$), we have not been able to obtain bounds
that are as close to optimal. The best one can do using
Theorem~\ref{le:main-lower-bound} is given in the following.

\begin{corollaryfoo}\label{le:opt}
Let $0 < \delta < 1$ be a constant. There exists a constant $N_0 > 0$ and
a constant $\alpha$, $0 < \alpha < 1$ such that the following is true. Let $T$
be any depth 3 perceptron which computes $f_{4,N}(x)$ where the number of
inputs is $N^4$ ($N \ge N_0$) and the fan-in of any of the $\AND$ or $\OR$-gates
of $T$ is bounded from above by $N^{\delta N}$. Then the number of gates in $T$
is at least $2^{\alpha N lg(lg(N))}$.  
\end{corollaryfoo}
\begin{proof}
 Let $u(N)$ be as in Theorem~\ref{le:main-lower-bound}. If $u(N) \ge
2^{Nlg(lg(N))}$, we are done. If $u(N) < 2^{Nlg(lg(N))}$, then it is not hard
to see that if $\beta <1$, there is a constant $\alpha < 1$ such that for
sufficiently large $N$ 
$$\left( {Nlg(N) \over lg(N u(N))} \right)^{\beta N}
        \ge 2^{\alpha N lg(lg(N))}.$$
Applying Theorem~\ref{le:main-lower-bound}, the result follows.
\end{proof}

 
\section{Acknowledgements}
  I wish to thank the Departament de Llenguatges i Sistemes Inform{\`a}tics,
Barcelona,
where this paper was originally written, for their hospitality. Conversations on
this subject with Arthur Chou and Richard Beigel are gratefully acknowledged.
The author is greatful to two anonymous referees for comments which greatly
improved the paper. 
 
\begin{thebibliography}{1}
 
\bibitem{All}
E.~Allender, ``A note on the power of threshold circuits", {\em Proceedings
of the 30th IEEE Symposium on Foundations of Computer Science} (1989)
580-584.
 

\bibitem{ABFR}
J.~Aspnes, R.~Beigel, M.~Furst and S.~Rudich, ``The Expressive Power of Voting
Polynomials", in {Proceedings of the 23rd Annual ACM Symposium on Theory of
Computing}, ACM Press (1991) 402-409.

\bibitem{BeiPPclosed}
R.~Beigel, N.~Reingold, and D.~Spielman, ``PP is closed under intersection",
in {\em Proceedings of the 23rd Annual ACM Symposium on Theory of Computing},
IEEE Computer Society Press (1991) 1-9. Also, {J. Comp. Syst. Sci.}, to appear.

\bibitem{BRS}
R.~Beigel, N.~Reingold, and D.~Spielman, ``The Perceptron Strikes Back",
in {\em Proceedings of the Sixth Annual Conference on Structure in Complexity
Theory}, IEEE Computer Society Press (1991) 286-291.
 
\bibitem{BeiPPvsPH}
R.~Beigel, ``Perceptrons, PP, and the
Polynomial Hierarchy", in {\em Proceedings of the Seventh Annual Conference on
Structure in Complexity Theory}, IEEE Computer Society Press (1992) 14-19.
 
\bibitem{Green}
F.~Green, ``An oracle separating $\PARITYP$ from $\pptoph$", {\em Information
Processing Letters} {\bf 37} (1991) 149-153.
 
\bibitem{Hastad}
J.~H{\aa}stad, ``Computational limitations for small-depth circuits", The MIT
press, Cambridge 1987.
 
\bibitem{Hastad2}
J.~H{\aa}stad and M.~Goldmann, ``On the Power of Small-Depth Threshold
Circuits", {\em Computational Complexity} {\bf 1} (1991) 113-129.
 
\bibitem{HMPST}
A.~Hajnal, W.~Maass, P.~Pudl{\'a}k, M.~Szegedy, and G.~Tur{\'a}n,
"Threshold circuits of bounded depth", in {\em Proceedings 28th Annual IEEE
Symposium on Foundations of Computer Science}, IEEE Computer
Society Press (1987) 99-110.
 
\bibitem{MinPap}
M.~L.~Minsky and S.~A.~Papert, ``Perceptrons" (expanded edition), MIT Press,
Cambridge, 1988.

\bibitem{Tarui}
J.~Tarui, ``Randomized Polynomials, Threshold Circuits, and the Polynomial
Hierarchy", in {8th Annual Symposium on Theoretical Aspects of Computer
Science}, Springer-Verlag LNCS 480, (1991) 238-250.

\bibitem{Toda}
S.~Toda, ``On the computational power of $\PP$ and $\PARITYP$", in {\em
Proceedings 30th IEEE Symposium on Foundations of Computer Science}, IEEE
Computer Society Press (1989) 514-519.
 
\bibitem{yao}
A.~C.-C.~Yao, ``Circuits and local computation", in {\em Proceedings of
the 21st ACM Symposium on Theory of Computing}, ACM Press (1989) 186-196.
 
\end{thebibliography}
 
\end{document}